49 files changed, 755 insertions, 376 deletions
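Two hunks below (arch/x86/kernel/cpu/common.c and arch/x86/kernel/setup_64.c) change c->x86_capability[i] ^= cleared_cpu_caps[i] to c->x86_capability[i] &= ~cleared_cpu_caps[i]. A minimal stand-alone sketch of the difference, with made-up flag values (illustration only, not part of the patch): XOR toggles, so it can re-set a "cleared" bit the CPU never had, while AND-NOT only ever clears bits.

#include <assert.h>

int main(void)
{
	unsigned int caps    = 0x5; /* capability bits actually present */
	unsigned int cleared = 0x3; /* bits force-cleared via boot options */

	/* old code: XOR toggles, so cleared bit 1 (absent from caps)
	 * is wrongly turned ON */
	assert((caps ^ cleared) == 0x6);

	/* new code: AND-NOT only ever clears bits */
	assert((caps & ~cleared) == 0x4);
	return 0;
}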
diff --git a/Documentation/debugging-via-ohci1394.txt b/Documentation/debugging-via-ohci1394.txt index de4804e8b396..c360d4e91b48 100644 --- a/Documentation/debugging-via-ohci1394.txt +++ b/Documentation/debugging-via-ohci1394.txt @@ -36,14 +36,15 @@ available (notebooks) or too slow for extensive debug information (like ACPI). Drivers ------- -The OHCI-1394 drivers in drivers/firewire and drivers/ieee1394 initialize -the OHCI-1394 controllers to a working state and can be used to enable -physical DMA. By default you only have to load the driver, and physical -DMA access will be granted to all remote nodes, but it can be turned off -when using the ohci1394 driver. - -Because these drivers depend on the PCI enumeration to be completed, an -initialization routine which can runs pretty early (long before console_init(), +The ohci1394 driver in drivers/ieee1394 initializes the OHCI-1394 controllers +to a working state and enables physical DMA by default for all remote nodes. +This can be turned off by ohci1394's module parameter phys_dma=0. + +The alternative firewire-ohci driver in drivers/firewire uses filtered physical +DMA, hence is not yet suitable for remote debugging. + +Because ohci1394 depends on the PCI enumeration to be completed, an +initialization routine which runs pretty early (long before console_init() which makes the printk buffer appear on the console can be called) was written. To activate it, enable CONFIG_PROVIDE_OHCI1394_DMA_INIT (Kernel hacking menu: diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index fc50d2f959d1..e8cb9ff183e9 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -128,8 +128,6 @@ void *get_current(void) return current; } -extern void schedule_tail(struct task_struct *prev); - /* * This is called magically, by its address being stuffed in a jmp_buf * and being longjmp-d to. diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index e09a6b73a1aa..6d50064db182 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -377,6 +377,19 @@ config X86_OOSTORE def_bool y depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR +# +# P6_NOPs are a relatively minor optimization that require a family >= +# 6 processor, except that it is broken on certain VIA chips. +# Furthermore, AMD chips prefer a totally different sequence of NOPs +# (which work on all CPUs). As a result, disallow these if we're +# compiling X86_GENERIC but not X86_64 (these NOPs do work on all +# x86-64 capable chips); the list of processors in the right-hand clause +# are the cores that benefit from this optimization. 
+# +config X86_P6_NOP + def_bool y + depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || PENTIUM4) + config X86_TSC def_bool y depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 @@ -390,6 +403,7 @@ config X86_CMOV config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 + default "6" if X86_32 && X86_P6_NOP default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK) default "3" diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index 378353956b5d..e77d89f9e8aa 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -37,6 +37,12 @@ static int detect_memory_e820(void) "=m" (*desc) : "D" (desc), "d" (SMAP), "a" (0xe820)); + /* BIOSes which terminate the chain with CF = 1 as opposed + to %ebx = 0 don't always report the SMAP signature on + the final, failing, probe. */ + if (err) + break; + /* Some BIOSes stop returning SMAP in the middle of the search loop. We don't know exactly how the BIOS screwed up the map at that point, we might have a @@ -47,9 +53,6 @@ static int detect_memory_e820(void) break; } - if (err) - break; - count++; desc++; } while (next && count < E820MAX); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index a33d53017997..8ea040124f7d 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -128,13 +128,11 @@ void foo(void) OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); #endif -#ifdef CONFIG_LGUEST_GUEST +#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); -#endif -#ifdef CONFIG_LGUEST BLANK(); OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f86a3c4a2669..a38aafaefc23 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -504,7 +504,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Clear all flags overriden by options */ for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] ^= cleared_cpu_caps[i]; + c->x86_capability[i] &= ~cleared_cpu_caps[i]; /* Init Machine Check Exception if available. */ mcheck_init(c); diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index b6e136f23d3d..be83336fddba 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -43,6 +43,7 @@ #include <asm/uaccess.h> #include <asm/processor.h> #include <asm/msr.h> +#include <asm/kvm_para.h> #include "mtrr.h" u32 num_var_ranges = 0; @@ -649,6 +650,7 @@ static __init int amd_special_default_mtrr(void) /** * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs + * @end_pfn: ending page frame number * * Some buggy BIOSes don't setup the MTRRs properly for systems with certain * memory configurations. 
This routine checks that the highest MTRR matches @@ -688,8 +690,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* kvm/qemu doesn't have mtrr set right, don't trim them all */ if (!highest_pfn) { - printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); - WARN_ON(1); + if (!kvm_para_available()) { + printk(KERN_WARNING + "WARNING: strange, CPU MTRRs all blank?\n"); + WARN_ON(1); + } return 0; } diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 200fb3f9ebfb..e8b422c1c512 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -76,13 +76,6 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) /* All Transmeta CPUs have a constant TSC */ set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); - /* If we can run i686 user-space code, call us an i686 */ -#define USER686 ((1 << X86_FEATURE_TSC)|\ - (1 << X86_FEATURE_CX8)|\ - (1 << X86_FEATURE_CMOV)) - if (c->x86 == 5 && (c->x86_capability[0] & USER686) == USER686) - c->x86 = 6; - #ifdef CONFIG_SYSCTL /* randomize_va_space slows us down enormously; it probably triggers retranslation of x86->native bytecode */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 2ad9a1bc6a73..c20c9e7e08dd 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -453,6 +453,7 @@ ENTRY(stub_execve) CFI_REGISTER rip, r11 SAVE_REST FIXUP_TOP_OF_STACK %r11 + movq %rsp, %rcx call sys_execve RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) @@ -1036,15 +1037,16 @@ ENDPROC(child_rip) * rdi: name, rsi: argv, rdx: envp * * We want to fallback into: - * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs) * * do_sys_execve asm fallback arguments: - * rdi: name, rsi: argv, rdx: envp, fake frame on the stack + * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack */ ENTRY(kernel_execve) CFI_STARTPROC FAKE_STACK_FRAME $0 SAVE_ALL + movq %rsp,%rcx call sys_execve movq %rax, RAX(%rsp) RESTORE_REST diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 25eb98540a41..fd8ca53943a8 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -606,7 +606,7 @@ ENTRY(_stext) .section ".bss.page_aligned","wa" .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE -ENTRY(swapper_pg_pmd) +swapper_pg_pmd: .fill 1024*KPMDS,4,0 #else ENTRY(swapper_pg_dir) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index eb415043a929..a007454133a3 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -379,18 +379,24 @@ NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. * Don't set NX because code runs from these pages. */ - PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) + PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) NEXT_PAGE(level2_kernel_pgt) - /* 40MB kernel mapping. The kernel code cannot be bigger than that. - When you change this change KERNEL_TEXT_SIZE in page.h too. */ - /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ - PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE) - /* Module mapping starts here */ - .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0 + /* + * 128 MB kernel mapping. We spend a full page on this pagetable + * anyway. + * + * The kernel code+data+bss must not be bigger than that. + * + * (NOTE: at +128MB starts the module area, see MODULES_VADDR. 
+ * If you want to increase this then increase MODULES_VADDR + * too.) + */ + PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, + KERNEL_IMAGE_SIZE/PMD_SIZE) NEXT_PAGE(level2_spare_pgt) - .fill 512,8,0 + .fill 512, 8, 0 #undef PMDS #undef NEXT_PAGE diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 429d084e014d..235fd6c77504 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -368,8 +368,8 @@ static int hpet_clocksource_register(void) return 0; } -/* - * Try to setup the HPET timer +/** + * hpet_enable - Try to setup the HPET timer. Returns 1 on success. */ int __init hpet_enable(void) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b0cc8f0136d8..43f287744f9f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -730,16 +730,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ asmlinkage long sys_execve(char __user *name, char __user * __user *argv, - char __user * __user *envp, struct pt_regs regs) + char __user * __user *envp, struct pt_regs *regs) { long error; char * filename; filename = getname(name); error = PTR_ERR(filename); - if (IS_ERR(filename)) + if (IS_ERR(filename)) return error; - error = do_execve(filename, argv, envp, ®s); + error = do_execve(filename, argv, envp, regs); putname(filename); return error; } diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 6fd804f07821..7637dc91c79b 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -1021,7 +1021,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Clear all flags overriden by options */ for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] ^= cleared_cpu_caps[i]; + c->x86_capability[i] &= ~cleared_cpu_caps[i]; #ifdef CONFIG_X86_MCE mcheck_init(c); diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c index d53bd6fcb428..0880f2c388a9 100644 --- a/arch/x86/kernel/smpboot_64.c +++ b/arch/x86/kernel/smpboot_64.c @@ -554,10 +554,10 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) int timeout; unsigned long start_rip; struct create_idle c_idle = { - .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle), .cpu = cpu, .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; + INIT_WORK(&c_idle.work, do_fork_idle); /* allocate memory for gdts of secondary cpus. 
Hotplug is considered */ if (!cpu_gdt_descr[cpu].address && diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 02f0f61f5b11..c28c342c162f 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -25,6 +25,8 @@ static int save_stack_stack(void *data, char *name) static void save_stack_address(void *data, unsigned long addr, int reliable) { struct stack_trace *trace = data; + if (!reliable) + return; if (trace->skip > 0) { trace->skip--; return; @@ -37,6 +39,8 @@ static void save_stack_address_nosched(void *data, unsigned long addr, int reliable) { struct stack_trace *trace = (struct stack_trace *)data; + if (!reliable) + return; if (in_sched_functions(addr)) return; if (trace->skip > 0) { diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 43517e324be8..f14cfd9d1f94 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -28,7 +28,8 @@ EXPORT_SYMBOL_GPL(tsc_khz); static int __init tsc_setup(char *str) { printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC.\n"); + "cannot disable TSC completely.\n"); + mark_tsc_unstable("user disabled TSC"); return 1; } #else diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 3f8242774580..b6be812fac05 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -44,11 +44,6 @@ #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) #define __syscall_clobber "r11","cx","memory" -#define __pa_vsymbol(x) \ - ({unsigned long v; \ - extern char __vsyscall_0; \ - asm("" : "=r" (v) : "0" (x)); \ - ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); }) /* * vsyscall_gtod_data contains data that is : @@ -102,7 +97,7 @@ static __always_inline void do_get_tz(struct timezone * tz) static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) { int ret; - asm volatile("vsysc2: syscall" + asm volatile("syscall" : "=a" (ret) : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); @@ -112,7 +107,7 @@ static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) static __always_inline long time_syscall(long *t) { long secs; - asm volatile("vsysc1: syscall" + asm volatile("syscall" : "=a" (secs) : "0" (__NR_time),"D" (t) : __syscall_clobber); return secs; @@ -227,50 +222,10 @@ long __vsyscall(3) venosys_1(void) } #ifdef CONFIG_SYSCTL - -#define SYSCALL 0x050f -#define NOP2 0x9090 - -/* - * NOP out syscall in vsyscall page when not needed. - */ -static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - extern u16 vsysc1, vsysc2; - u16 __iomem *map1; - u16 __iomem *map2; - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - if (!write) - return ret; - /* gcc has some trouble with __va(__pa()), so just do it this - way. 
*/ - map1 = ioremap(__pa_vsymbol(&vsysc1), 2); - if (!map1) - return -ENOMEM; - map2 = ioremap(__pa_vsymbol(&vsysc2), 2); - if (!map2) { - ret = -ENOMEM; - goto out; - } - if (!vsyscall_gtod_data.sysctl_enabled) { - writew(SYSCALL, map1); - writew(SYSCALL, map2); - } else { - writew(NOP2, map1); - writew(NOP2, map2); - } - iounmap(map2); -out: - iounmap(map1); - return ret; -} - static ctl_table kernel_table2[] = { { .procname = "vsyscall64", .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = vsyscall_sysctl_change }, + .mode = 0644 }, {} }; @@ -279,7 +234,6 @@ static ctl_table kernel_root_table2[] = { .child = kernel_table2 }, {} }; - #endif /* Assume __initcall executes before all user space. Hopefully kmod diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 5afdde4895dc..cccb38a59653 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -57,6 +57,7 @@ #include <linux/lguest_launcher.h> #include <linux/virtio_console.h> #include <linux/pm.h> +#include <asm/lguest.h> #include <asm/paravirt.h> #include <asm/param.h> #include <asm/page.h> @@ -75,15 +76,6 @@ * behaving in simplified but equivalent ways. In particular, the Guest is the * same kernel as the Host (or at least, built from the same source code). :*/ -/* Declarations for definitions in lguest_guest.S */ -extern char lguest_noirq_start[], lguest_noirq_end[]; -extern const char lgstart_cli[], lgend_cli[]; -extern const char lgstart_sti[], lgend_sti[]; -extern const char lgstart_popf[], lgend_popf[]; -extern const char lgstart_pushf[], lgend_pushf[]; -extern const char lgstart_iret[], lgend_iret[]; -extern void lguest_iret(void); - struct lguest_data lguest_data = { .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, .noirq_start = (u32)lguest_noirq_start, @@ -489,7 +481,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { *pmdp = pmdval; lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK, - (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); + (__pa(pmdp)&(PAGE_SIZE-1)), 0); } /* There are a couple of legacy places where the kernel sets a PTE, but we diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bb652f5a93fb..a02a14f0f324 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -172,8 +172,9 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) } /* - * The head.S code sets up the kernel high mapping from: - * __START_KERNEL_map to __START_KERNEL_map + KERNEL_TEXT_SIZE + * The head.S code sets up the kernel high mapping: + * + * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) * * phys_addr holds the negative offset to the kernel, which is added * to the compile time generated pmds. 
This results in invalid pmds up @@ -515,14 +516,6 @@ void __init mem_init(void) /* clear_bss() already clear the empty_zero_page */ - /* temporary debugging - double check it's true: */ - { - int i; - - for (i = 0; i < 1024; i++) - WARN_ON_ONCE(empty_zero_page[i]); - } - reservedpages = 0; /* this will put all low memory onto the freelists */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 464d8fc21ce6..14e48b5a94ba 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -44,6 +44,12 @@ static inline unsigned long highmap_end_pfn(void) #endif +#ifdef CONFIG_DEBUG_PAGEALLOC +# define debug_pagealloc 1 +#else +# define debug_pagealloc 0 +#endif + static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -355,45 +361,48 @@ out_unlock: static LIST_HEAD(page_pool); static unsigned long pool_size, pool_pages, pool_low; -static unsigned long pool_used, pool_failed, pool_refill; +static unsigned long pool_used, pool_failed; -static void cpa_fill_pool(void) +static void cpa_fill_pool(struct page **ret) { - struct page *p; gfp_t gfp = GFP_KERNEL; + unsigned long flags; + struct page *p; - /* Do not allocate from interrupt context */ - if (in_irq() || irqs_disabled()) - return; /* - * Check unlocked. I does not matter when we have one more - * page in the pool. The bit lock avoids recursive pool - * allocations: + * Avoid recursion (on debug-pagealloc) and also signal + * our priority to get to these pagetables: */ - if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill)) + if (current->flags & PF_MEMALLOC) return; + current->flags |= PF_MEMALLOC; -#ifdef CONFIG_DEBUG_PAGEALLOC /* - * We could do: - * gfp = in_atomic() ? GFP_ATOMIC : GFP_KERNEL; - * but this fails on !PREEMPT kernels + * Allocate atomically from atomic contexts: */ - gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; -#endif + if (in_atomic() || irqs_disabled() || debug_pagealloc) + gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; - while (pool_pages < pool_size) { + while (pool_pages < pool_size || (ret && !*ret)) { p = alloc_pages(gfp, 0); if (!p) { pool_failed++; break; } - spin_lock_irq(&pgd_lock); + /* + * If the call site needs a page right now, provide it: + */ + if (ret && !*ret) { + *ret = p; + continue; + } + spin_lock_irqsave(&pgd_lock, flags); list_add(&p->lru, &page_pool); pool_pages++; - spin_unlock_irq(&pgd_lock); + spin_unlock_irqrestore(&pgd_lock, flags); } - clear_bit_unlock(0, &pool_refill); + + current->flags &= ~PF_MEMALLOC; } #define SHIFT_MB (20 - PAGE_SHIFT) @@ -414,11 +423,15 @@ void __init cpa_init(void) * GiB. 
Shift MiB to Gib and multiply the result by * POOL_PAGES_PER_GB: */ - gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; - pool_size = POOL_PAGES_PER_GB * gb; + if (debug_pagealloc) { + gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; + pool_size = POOL_PAGES_PER_GB * gb; + } else { + pool_size = 1; + } pool_low = pool_size; - cpa_fill_pool(); + cpa_fill_pool(NULL); printk(KERN_DEBUG "CPA: page pool initialized %lu of %lu pages preallocated\n", pool_pages, pool_size); @@ -440,16 +453,20 @@ static int split_large_page(pte_t *kpte, unsigned long address) spin_lock_irqsave(&pgd_lock, flags); if (list_empty(&page_pool)) { spin_unlock_irqrestore(&pgd_lock, flags); - return -ENOMEM; + base = NULL; + cpa_fill_pool(&base); + if (!base) + return -ENOMEM; + spin_lock_irqsave(&pgd_lock, flags); + } else { + base = list_first_entry(&page_pool, struct page, lru); + list_del(&base->lru); + pool_pages--; + + if (pool_pages < pool_low) + pool_low = pool_pages; } - base = list_first_entry(&page_pool, struct page, lru); - list_del(&base->lru); - pool_pages--; - - if (pool_pages < pool_low) - pool_low = pool_pages; - /* * Check for races, another CPU might have split this page * up for us already: @@ -734,7 +751,8 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, cpa_flush_all(cache); out: - cpa_fill_pool(); + cpa_fill_pool(NULL); + return ret; } @@ -897,7 +915,7 @@ void kernel_map_pages(struct page *page, int numpages, int enable) * Try to refill the page pool here. We can do this only after * the tlb flush. */ - cpa_fill_pool(); + cpa_fill_pool(NULL); } #ifdef CONFIG_HIBERNATION diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index f385a4b4a484..b8bd0c4aa02e 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -48,7 +48,7 @@ obj-$(VDSO64-y) += vdso-syms.lds # Match symbols in the DSO that look like VDSO*; produce a file of constants. # sed-vdsosym := -e 's/^00*/0/' \ - -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p' + -e 's/^\([[:xdigit:]]*\) . 
\(VDSO[[:alnum:]_]*\)$$/\2 = 0x\1;/p' quiet_cmd_vdsosym = VDSOSYM $@ cmd_vdsosym = $(NM) $< | sed -n $(sed-vdsosym) | LC_ALL=C sort > $@ diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index fbc24358ada0..4fbcce758b04 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -113,7 +113,7 @@ int atapi_enabled = 1; module_param(atapi_enabled, int, 0444); MODULE_PARM_DESC(atapi_enabled, "Enable discovery of ATAPI devices (0=off, 1=on)"); -int atapi_dmadir = 0; +static int atapi_dmadir = 0; module_param(atapi_dmadir, int, 0444); MODULE_PARM_DESC(atapi_dmadir, "Enable ATAPI DMADIR bridge support (0=off, 1=on)"); @@ -6567,6 +6567,8 @@ int ata_host_suspend(struct ata_host *host, pm_message_t mesg) ata_lpm_enable(host); rc = ata_host_request_pm(host, mesg, 0, ATA_EHI_QUIET, 1); + if (rc == 0) + host->dev->power.power_state = mesg; return rc; } @@ -6585,6 +6587,7 @@ void ata_host_resume(struct ata_host *host) { ata_host_request_pm(host, PMSG_ON, ATA_EH_SOFTRESET, ATA_EHI_NO_AUTOPSY | ATA_EHI_QUIET, 0); + host->dev->power.power_state = PMSG_ON; /* reenable link pm */ ata_lpm_disable(host); diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index 6036dedfe377..aa884f71a12a 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -56,7 +56,6 @@ enum { extern unsigned int ata_print_id; extern struct workqueue_struct *ata_aux_wq; extern int atapi_enabled; -extern int atapi_dmadir; extern int atapi_passthru16; extern int libata_fua; extern int libata_noacpi; diff --git a/drivers/firewire/fw-cdev.c b/drivers/firewire/fw-cdev.c index 7e73cbaa4121..46bc197a047f 100644 --- a/drivers/firewire/fw-cdev.c +++ b/drivers/firewire/fw-cdev.c @@ -109,15 +109,17 @@ static int fw_device_op_open(struct inode *inode, struct file *file) struct client *client; unsigned long flags; - device = fw_device_from_devt(inode->i_rdev); + device = fw_device_get_by_devt(inode->i_rdev); if (device == NULL) return -ENODEV; client = kzalloc(sizeof(*client), GFP_KERNEL); - if (client == NULL) + if (client == NULL) { + fw_device_put(device); return -ENOMEM; + } - client->device = fw_device_get(device); + client->device = device; INIT_LIST_HEAD(&client->event_list); INIT_LIST_HEAD(&client->resource_list); spin_lock_init(&client->lock); @@ -644,6 +646,10 @@ static int ioctl_create_iso_context(struct client *client, void *buffer) struct fw_cdev_create_iso_context *request = buffer; struct fw_iso_context *context; + /* We only support one context at this time. 
*/ + if (client->iso_context != NULL) + return -EBUSY; + if (request->channel > 63) return -EINVAL; @@ -790,8 +796,9 @@ static int ioctl_start_iso(struct client *client, void *buffer) { struct fw_cdev_start_iso *request = buffer; - if (request->handle != 0) + if (client->iso_context == NULL || request->handle != 0) return -EINVAL; + if (client->iso_context->type == FW_ISO_CONTEXT_RECEIVE) { if (request->tags == 0 || request->tags > 15) return -EINVAL; @@ -808,7 +815,7 @@ static int ioctl_stop_iso(struct client *client, void *buffer) { struct fw_cdev_stop_iso *request = buffer; - if (request->handle != 0) + if (client->iso_context == NULL || request->handle != 0) return -EINVAL; return fw_iso_context_stop(client->iso_context); diff --git a/drivers/firewire/fw-device.c b/drivers/firewire/fw-device.c index de9066e69adf..2ab13e0f3469 100644 --- a/drivers/firewire/fw-device.c +++ b/drivers/firewire/fw-device.c @@ -358,12 +358,9 @@ static ssize_t guid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct fw_device *device = fw_device(dev); - u64 guid; - guid = ((u64)device->config_rom[3] << 32) | device->config_rom[4]; - - return snprintf(buf, PAGE_SIZE, "0x%016llx\n", - (unsigned long long)guid); + return snprintf(buf, PAGE_SIZE, "0x%08x%08x\n", + device->config_rom[3], device->config_rom[4]); } static struct device_attribute fw_device_attributes[] = { @@ -610,12 +607,14 @@ static DECLARE_RWSEM(idr_rwsem); static DEFINE_IDR(fw_device_idr); int fw_cdev_major; -struct fw_device *fw_device_from_devt(dev_t devt) +struct fw_device *fw_device_get_by_devt(dev_t devt) { struct fw_device *device; down_read(&idr_rwsem); device = idr_find(&fw_device_idr, MINOR(devt)); + if (device) + fw_device_get(device); up_read(&idr_rwsem); return device; @@ -627,13 +626,14 @@ static void fw_device_shutdown(struct work_struct *work) container_of(work, struct fw_device, work.work); int minor = MINOR(device->device.devt); - down_write(&idr_rwsem); - idr_remove(&fw_device_idr, minor); - up_write(&idr_rwsem); - fw_device_cdev_remove(device); device_for_each_child(&device->device, NULL, shutdown_unit); device_unregister(&device->device); + + down_write(&idr_rwsem); + idr_remove(&fw_device_idr, minor); + up_write(&idr_rwsem); + fw_device_put(device); } static struct device_type fw_device_type = { @@ -682,10 +682,13 @@ static void fw_device_init(struct work_struct *work) } err = -ENOMEM; + + fw_device_get(device); down_write(&idr_rwsem); if (idr_pre_get(&fw_device_idr, GFP_KERNEL)) err = idr_get_new(&fw_device_idr, device, &minor); up_write(&idr_rwsem); + if (err < 0) goto error; @@ -717,13 +720,22 @@ static void fw_device_init(struct work_struct *work) */ if (atomic_cmpxchg(&device->state, FW_DEVICE_INITIALIZING, - FW_DEVICE_RUNNING) == FW_DEVICE_SHUTDOWN) + FW_DEVICE_RUNNING) == FW_DEVICE_SHUTDOWN) { fw_device_shutdown(&device->work.work); - else - fw_notify("created new fw device %s " - "(%d config rom retries, S%d00)\n", - device->device.bus_id, device->config_rom_retries, - 1 << device->max_speed); + } else { + if (device->config_rom_retries) + fw_notify("created device %s: GUID %08x%08x, S%d00, " + "%d config ROM retries\n", + device->device.bus_id, + device->config_rom[3], device->config_rom[4], + 1 << device->max_speed, + device->config_rom_retries); + else + fw_notify("created device %s: GUID %08x%08x, S%d00\n", + device->device.bus_id, + device->config_rom[3], device->config_rom[4], + 1 << device->max_speed); + } /* * Reschedule the IRM work if we just finished reading the @@ -741,7 +753,9 @@ 
static void fw_device_init(struct work_struct *work) idr_remove(&fw_device_idr, minor); up_write(&idr_rwsem); error: - put_device(&device->device); + fw_device_put(device); /* fw_device_idr's reference */ + + put_device(&device->device); /* our reference */ } static int update_unit(struct device *dev, void *data) diff --git a/drivers/firewire/fw-device.h b/drivers/firewire/fw-device.h index 0854fe2bc110..43808c02793e 100644 --- a/drivers/firewire/fw-device.h +++ b/drivers/firewire/fw-device.h @@ -77,13 +77,13 @@ fw_device_is_shutdown(struct fw_device *device) } struct fw_device *fw_device_get(struct fw_device *device); +struct fw_device *fw_device_get_by_devt(dev_t devt); void fw_device_put(struct fw_device *device); int fw_device_enable_phys_dma(struct fw_device *device); void fw_device_cdev_update(struct fw_device *device); void fw_device_cdev_remove(struct fw_device *device); -struct fw_device *fw_device_from_devt(dev_t devt); extern int fw_cdev_major; struct fw_unit { diff --git a/drivers/firewire/fw-sbp2.c b/drivers/firewire/fw-sbp2.c index 19ece9b6d742..5259491580fc 100644 --- a/drivers/firewire/fw-sbp2.c +++ b/drivers/firewire/fw-sbp2.c @@ -28,14 +28,15 @@ * and many others. */ +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/dma-mapping.h> #include <linux/kernel.h> +#include <linux/mod_devicetable.h> #include <linux/module.h> #include <linux/moduleparam.h> -#include <linux/mod_devicetable.h> -#include <linux/device.h> #include <linux/scatterlist.h> -#include <linux/dma-mapping.h> -#include <linux/blkdev.h> #include <linux/string.h> #include <linux/stringify.h> #include <linux/timer.h> @@ -47,9 +48,9 @@ #include <scsi/scsi_device.h> #include <scsi/scsi_host.h> -#include "fw-transaction.h" -#include "fw-topology.h" #include "fw-device.h" +#include "fw-topology.h" +#include "fw-transaction.h" /* * So far only bridges from Oxford Semiconductor are known to support @@ -82,6 +83,9 @@ MODULE_PARM_DESC(exclusive_login, "Exclusive login to sbp2 device " * Avoids access beyond actual disk limits on devices with an off-by-one bug. * Don't use this with devices which don't have this bug. * + * - delay inquiry + * Wait extra SBP2_INQUIRY_DELAY seconds after login before SCSI inquiry. + * * - override internal blacklist * Instead of adding to the built-in blacklist, use only the workarounds * specified in the module load parameter. 
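The workarounds facility documented in the comment above is a plain bitmask module parameter; the hunks that follow add SBP2_WORKAROUND_DELAY_INQUIRY (0x10) next to the existing flags. A small illustrative snippet (not part of the patch) of how the flags OR together into the value passed as the workarounds parameter, using the two flag values confirmed by the diff:

#include <stdio.h>

#define SBP2_WORKAROUND_FIX_CAPACITY  0x8  /* existing flag */
#define SBP2_WORKAROUND_DELAY_INQUIRY 0x10 /* added by this patch */

int main(void)
{
	/* quirks are independent bits, so they combine with OR ... */
	unsigned int w = SBP2_WORKAROUND_FIX_CAPACITY |
			 SBP2_WORKAROUND_DELAY_INQUIRY;

	/* ... and the driver tests them with (w & flag), as in
	 * sbp2_init_workarounds() */
	printf("workarounds=0x%x\n", w); /* prints workarounds=0x18 */
	return 0;
}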
@@ -91,6 +95,8 @@ MODULE_PARM_DESC(exclusive_login, "Exclusive login to sbp2 device " #define SBP2_WORKAROUND_INQUIRY_36 0x2 #define SBP2_WORKAROUND_MODE_SENSE_8 0x4 #define SBP2_WORKAROUND_FIX_CAPACITY 0x8 +#define SBP2_WORKAROUND_DELAY_INQUIRY 0x10 +#define SBP2_INQUIRY_DELAY 12 #define SBP2_WORKAROUND_OVERRIDE 0x100 static int sbp2_param_workarounds; @@ -100,6 +106,7 @@ MODULE_PARM_DESC(workarounds, "Work around device bugs (default = 0" ", 36 byte inquiry = " __stringify(SBP2_WORKAROUND_INQUIRY_36) ", skip mode page 8 = " __stringify(SBP2_WORKAROUND_MODE_SENSE_8) ", fix capacity = " __stringify(SBP2_WORKAROUND_FIX_CAPACITY) + ", delay inquiry = " __stringify(SBP2_WORKAROUND_DELAY_INQUIRY) ", override internal blacklist = " __stringify(SBP2_WORKAROUND_OVERRIDE) ", or a combination)"); @@ -132,6 +139,7 @@ struct sbp2_logical_unit { int generation; int retries; struct delayed_work work; + bool blocked; }; /* @@ -141,16 +149,18 @@ struct sbp2_logical_unit { struct sbp2_target { struct kref kref; struct fw_unit *unit; + const char *bus_id; + struct list_head lu_list; u64 management_agent_address; int directory_id; int node_id; int address_high; - - unsigned workarounds; - struct list_head lu_list; - + unsigned int workarounds; unsigned int mgt_orb_timeout; + + int dont_block; /* counter for each logical unit */ + int blocked; /* ditto */ }; /* @@ -160,7 +170,7 @@ struct sbp2_target { */ #define SBP2_MIN_LOGIN_ORB_TIMEOUT 5000U /* Timeout in ms */ #define SBP2_MAX_LOGIN_ORB_TIMEOUT 40000U /* Timeout in ms */ -#define SBP2_ORB_TIMEOUT 2000 /* Timeout in ms */ +#define SBP2_ORB_TIMEOUT 2000U /* Timeout in ms */ #define SBP2_ORB_NULL 0x80000000 #define SBP2_MAX_SG_ELEMENT_LENGTH 0xf000 @@ -297,7 +307,7 @@ struct sbp2_command_orb { static const struct { u32 firmware_revision; u32 model; - unsigned workarounds; + unsigned int workarounds; } sbp2_workarounds_table[] = { /* DViCO Momobay CX-1 with TSB42AA9 bridge */ { .firmware_revision = 0x002800, @@ -305,6 +315,11 @@ static const struct { .workarounds = SBP2_WORKAROUND_INQUIRY_36 | SBP2_WORKAROUND_MODE_SENSE_8, }, + /* DViCO Momobay FX-3A with TSB42AA9A bridge */ { + .firmware_revision = 0x002800, + .model = 0x000000, + .workarounds = SBP2_WORKAROUND_DELAY_INQUIRY, + }, /* Initio bridges, actually only needed for some older ones */ { .firmware_revision = 0x000200, .model = ~0, @@ -501,6 +516,9 @@ sbp2_send_management_orb(struct sbp2_logical_unit *lu, int node_id, unsigned int timeout; int retval = -ENOMEM; + if (function == SBP2_LOGOUT_REQUEST && fw_device_is_shutdown(device)) + return 0; + orb = kzalloc(sizeof(*orb), GFP_ATOMIC); if (orb == NULL) return -ENOMEM; @@ -553,20 +571,20 @@ sbp2_send_management_orb(struct sbp2_logical_unit *lu, int node_id, retval = -EIO; if (sbp2_cancel_orbs(lu) == 0) { - fw_error("orb reply timed out, rcode=0x%02x\n", - orb->base.rcode); + fw_error("%s: orb reply timed out, rcode=0x%02x\n", + lu->tgt->bus_id, orb->base.rcode); goto out; } if (orb->base.rcode != RCODE_COMPLETE) { - fw_error("management write failed, rcode 0x%02x\n", - orb->base.rcode); + fw_error("%s: management write failed, rcode 0x%02x\n", + lu->tgt->bus_id, orb->base.rcode); goto out; } if (STATUS_GET_RESPONSE(orb->status) != 0 || STATUS_GET_SBP_STATUS(orb->status) != 0) { - fw_error("error status: %d:%d\n", + fw_error("%s: error status: %d:%d\n", lu->tgt->bus_id, STATUS_GET_RESPONSE(orb->status), STATUS_GET_SBP_STATUS(orb->status)); goto out; @@ -590,29 +608,147 @@ sbp2_send_management_orb(struct sbp2_logical_unit *lu, int node_id, static void 
complete_agent_reset_write(struct fw_card *card, int rcode, - void *payload, size_t length, void *data) + void *payload, size_t length, void *done) +{ + complete(done); +} + +static void sbp2_agent_reset(struct sbp2_logical_unit *lu) { - struct fw_transaction *t = data; + struct fw_device *device = fw_device(lu->tgt->unit->device.parent); + DECLARE_COMPLETION_ONSTACK(done); + struct fw_transaction t; + static u32 z; + + fw_send_request(device->card, &t, TCODE_WRITE_QUADLET_REQUEST, + lu->tgt->node_id, lu->generation, device->max_speed, + lu->command_block_agent_address + SBP2_AGENT_RESET, + &z, sizeof(z), complete_agent_reset_write, &done); + wait_for_completion(&done); +} - kfree(t); +static void +complete_agent_reset_write_no_wait(struct fw_card *card, int rcode, + void *payload, size_t length, void *data) +{ + kfree(data); } -static int sbp2_agent_reset(struct sbp2_logical_unit *lu) +static void sbp2_agent_reset_no_wait(struct sbp2_logical_unit *lu) { struct fw_device *device = fw_device(lu->tgt->unit->device.parent); struct fw_transaction *t; - static u32 zero; + static u32 z; - t = kzalloc(sizeof(*t), GFP_ATOMIC); + t = kmalloc(sizeof(*t), GFP_ATOMIC); if (t == NULL) - return -ENOMEM; + return; fw_send_request(device->card, t, TCODE_WRITE_QUADLET_REQUEST, lu->tgt->node_id, lu->generation, device->max_speed, lu->command_block_agent_address + SBP2_AGENT_RESET, - &zero, sizeof(zero), complete_agent_reset_write, t); + &z, sizeof(z), complete_agent_reset_write_no_wait, t); +} - return 0; +static void sbp2_set_generation(struct sbp2_logical_unit *lu, int generation) +{ + struct fw_card *card = fw_device(lu->tgt->unit->device.parent)->card; + unsigned long flags; + + /* serialize with comparisons of lu->generation and card->generation */ + spin_lock_irqsave(&card->lock, flags); + lu->generation = generation; + spin_unlock_irqrestore(&card->lock, flags); +} + +static inline void sbp2_allow_block(struct sbp2_logical_unit *lu) +{ + /* + * We may access dont_block without taking card->lock here: + * All callers of sbp2_allow_block() and all callers of sbp2_unblock() + * are currently serialized against each other. + * And a wrong result in sbp2_conditionally_block()'s access of + * dont_block is rather harmless, it simply misses its first chance. + */ + --lu->tgt->dont_block; +} + +/* + * Blocks lu->tgt if all of the following conditions are met: + * - Login, INQUIRY, and high-level SCSI setup of all of the target's + * logical units have been finished (indicated by dont_block == 0). + * - lu->generation is stale. + * + * Note, scsi_block_requests() must be called while holding card->lock, + * otherwise it might foil sbp2_[conditionally_]unblock()'s attempt to + * unblock the target. + */ +static void sbp2_conditionally_block(struct sbp2_logical_unit *lu) +{ + struct sbp2_target *tgt = lu->tgt; + struct fw_card *card = fw_device(tgt->unit->device.parent)->card; + struct Scsi_Host *shost = + container_of((void *)tgt, struct Scsi_Host, hostdata[0]); + unsigned long flags; + + spin_lock_irqsave(&card->lock, flags); + if (!tgt->dont_block && !lu->blocked && + lu->generation != card->generation) { + lu->blocked = true; + if (++tgt->blocked == 1) { + scsi_block_requests(shost); + fw_notify("blocked %s\n", lu->tgt->bus_id); + } + } + spin_unlock_irqrestore(&card->lock, flags); +} + +/* + * Unblocks lu->tgt as soon as all its logical units can be unblocked. + * Note, it is harmless to run scsi_unblock_requests() outside the + * card->lock protected section. 
On the other hand, running it inside + * the section might clash with shost->host_lock. + */ +static void sbp2_conditionally_unblock(struct sbp2_logical_unit *lu) +{ + struct sbp2_target *tgt = lu->tgt; + struct fw_card *card = fw_device(tgt->unit->device.parent)->card; + struct Scsi_Host *shost = + container_of((void *)tgt, struct Scsi_Host, hostdata[0]); + unsigned long flags; + bool unblock = false; + + spin_lock_irqsave(&card->lock, flags); + if (lu->blocked && lu->generation == card->generation) { + lu->blocked = false; + unblock = --tgt->blocked == 0; + } + spin_unlock_irqrestore(&card->lock, flags); + + if (unblock) { + scsi_unblock_requests(shost); + fw_notify("unblocked %s\n", lu->tgt->bus_id); + } +} + +/* + * Prevents future blocking of tgt and unblocks it. + * Note, it is harmless to run scsi_unblock_requests() outside the + * card->lock protected section. On the other hand, running it inside + * the section might clash with shost->host_lock. + */ +static void sbp2_unblock(struct sbp2_target *tgt) +{ + struct fw_card *card = fw_device(tgt->unit->device.parent)->card; + struct Scsi_Host *shost = + container_of((void *)tgt, struct Scsi_Host, hostdata[0]); + unsigned long flags; + + spin_lock_irqsave(&card->lock, flags); + ++tgt->dont_block; + spin_unlock_irqrestore(&card->lock, flags); + + scsi_unblock_requests(shost); } static void sbp2_release_target(struct kref *kref) @@ -621,23 +757,24 @@ static void sbp2_release_target(struct kref *kref) struct sbp2_logical_unit *lu, *next; struct Scsi_Host *shost = container_of((void *)tgt, struct Scsi_Host, hostdata[0]); - struct fw_device *device = fw_device(tgt->unit->device.parent); + + /* prevent deadlocks */ + sbp2_unblock(tgt); list_for_each_entry_safe(lu, next, &tgt->lu_list, link) { - if (lu->sdev) + if (lu->sdev) { scsi_remove_device(lu->sdev); - - if (!fw_device_is_shutdown(device)) - sbp2_send_management_orb(lu, tgt->node_id, - lu->generation, SBP2_LOGOUT_REQUEST, - lu->login_id, NULL); + scsi_device_put(lu->sdev); + } + sbp2_send_management_orb(lu, tgt->node_id, lu->generation, + SBP2_LOGOUT_REQUEST, lu->login_id, NULL); fw_core_remove_address_handler(&lu->address_handler); list_del(&lu->link); kfree(lu); } scsi_remove_host(shost); - fw_notify("released %s\n", tgt->unit->device.bus_id); + fw_notify("released %s\n", tgt->bus_id); put_device(&tgt->unit->device); scsi_host_put(shost); @@ -666,33 +803,43 @@ static void sbp2_login(struct work_struct *work) { struct sbp2_logical_unit *lu = container_of(work, struct sbp2_logical_unit, work.work); - struct Scsi_Host *shost = - container_of((void *)lu->tgt, struct Scsi_Host, hostdata[0]); + struct sbp2_target *tgt = lu->tgt; + struct fw_device *device = fw_device(tgt->unit->device.parent); + struct Scsi_Host *shost; struct scsi_device *sdev; struct scsi_lun eight_bytes_lun; - struct fw_unit *unit = lu->tgt->unit; - struct fw_device *device = fw_device(unit->device.parent); struct sbp2_login_response response; int generation, node_id, local_node_id; + if (fw_device_is_shutdown(device)) + goto out; + generation = device->generation; smp_rmb(); /* node_id must not be older than generation */ node_id = device->node_id; local_node_id = device->card->node_id; + /* If this is a re-login attempt, log out, or we might be rejected. 
*/ + if (lu->sdev) + sbp2_send_management_orb(lu, device->node_id, generation, + SBP2_LOGOUT_REQUEST, lu->login_id, NULL); + if (sbp2_send_management_orb(lu, node_id, generation, SBP2_LOGIN_REQUEST, lu->lun, &response) < 0) { - if (lu->retries++ < 5) + if (lu->retries++ < 5) { sbp2_queue_work(lu, DIV_ROUND_UP(HZ, 5)); - else - fw_error("failed to login to %s LUN %04x\n", - unit->device.bus_id, lu->lun); + } else { + fw_error("%s: failed to login to LUN %04x\n", + tgt->bus_id, lu->lun); + /* Let any waiting I/O fail from now on. */ + sbp2_unblock(lu->tgt); + } goto out; } - lu->generation = generation; - lu->tgt->node_id = node_id; - lu->tgt->address_high = local_node_id << 16; + tgt->node_id = node_id; + tgt->address_high = local_node_id << 16; + sbp2_set_generation(lu, generation); /* Get command block agent offset and login id. */ lu->command_block_agent_address = @@ -700,8 +847,8 @@ static void sbp2_login(struct work_struct *work) response.command_block_agent.low; lu->login_id = LOGIN_RESPONSE_GET_LOGIN_ID(response); - fw_notify("logged in to %s LUN %04x (%d retries)\n", - unit->device.bus_id, lu->lun, lu->retries); + fw_notify("%s: logged in to LUN %04x (%d retries)\n", + tgt->bus_id, lu->lun, lu->retries); #if 0 /* FIXME: The linux1394 sbp2 does this last step. */ @@ -711,26 +858,62 @@ static void sbp2_login(struct work_struct *work) PREPARE_DELAYED_WORK(&lu->work, sbp2_reconnect); sbp2_agent_reset(lu); + /* This was a re-login. */ + if (lu->sdev) { + sbp2_cancel_orbs(lu); + sbp2_conditionally_unblock(lu); + goto out; + } + + if (lu->tgt->workarounds & SBP2_WORKAROUND_DELAY_INQUIRY) + ssleep(SBP2_INQUIRY_DELAY); + memset(&eight_bytes_lun, 0, sizeof(eight_bytes_lun)); eight_bytes_lun.scsi_lun[0] = (lu->lun >> 8) & 0xff; eight_bytes_lun.scsi_lun[1] = lu->lun & 0xff; + shost = container_of((void *)tgt, struct Scsi_Host, hostdata[0]); sdev = __scsi_add_device(shost, 0, 0, scsilun_to_int(&eight_bytes_lun), lu); - if (IS_ERR(sdev)) { - sbp2_send_management_orb(lu, node_id, generation, - SBP2_LOGOUT_REQUEST, lu->login_id, NULL); - /* - * Set this back to sbp2_login so we fall back and - * retry login on bus reset. - */ - PREPARE_DELAYED_WORK(&lu->work, sbp2_login); - } else { - lu->sdev = sdev; + /* + * FIXME: We are unable to perform reconnects while in sbp2_login(). + * Therefore __scsi_add_device() will get into trouble if a bus reset + * happens in parallel. It will either fail or leave us with an + * unusable sdev. As a workaround we check for this and retry the + * whole login and SCSI probing. + */ + + /* Reported error during __scsi_add_device() */ + if (IS_ERR(sdev)) + goto out_logout_login; + + /* Unreported error during __scsi_add_device() */ + smp_rmb(); /* get current card generation */ + if (generation != device->card->generation) { + scsi_remove_device(sdev); scsi_device_put(sdev); + goto out_logout_login; } + + /* No error during __scsi_add_device() */ + lu->sdev = sdev; + sbp2_allow_block(lu); + goto out; + + out_logout_login: + smp_rmb(); /* generation may have changed */ + generation = device->generation; + smp_rmb(); /* node_id must not be older than generation */ + + sbp2_send_management_orb(lu, device->node_id, generation, + SBP2_LOGOUT_REQUEST, lu->login_id, NULL); + /* + * If a bus reset happened, sbp2_update will have requeued + * lu->work already. Reset the work from reconnect to login. 
+ */ + PREPARE_DELAYED_WORK(&lu->work, sbp2_login); out: - sbp2_target_put(lu->tgt); + sbp2_target_put(tgt); } static int sbp2_add_logical_unit(struct sbp2_target *tgt, int lun_entry) @@ -755,6 +938,8 @@ static int sbp2_add_logical_unit(struct sbp2_target *tgt, int lun_entry) lu->sdev = NULL; lu->lun = lun_entry & 0xffff; lu->retries = 0; + lu->blocked = false; + ++tgt->dont_block; INIT_LIST_HEAD(&lu->orb_list); INIT_DELAYED_WORK(&lu->work, sbp2_login); @@ -813,7 +998,7 @@ static int sbp2_scan_unit_dir(struct sbp2_target *tgt, u32 *directory, if (timeout > tgt->mgt_orb_timeout) fw_notify("%s: config rom contains %ds " "management ORB timeout, limiting " - "to %ds\n", tgt->unit->device.bus_id, + "to %ds\n", tgt->bus_id, timeout / 1000, tgt->mgt_orb_timeout / 1000); break; @@ -836,12 +1021,12 @@ static void sbp2_init_workarounds(struct sbp2_target *tgt, u32 model, u32 firmware_revision) { int i; - unsigned w = sbp2_param_workarounds; + unsigned int w = sbp2_param_workarounds; if (w) fw_notify("Please notify linux1394-devel@lists.sourceforge.net " "if you need the workarounds parameter for %s\n", - tgt->unit->device.bus_id); + tgt->bus_id); if (w & SBP2_WORKAROUND_OVERRIDE) goto out; @@ -863,8 +1048,7 @@ static void sbp2_init_workarounds(struct sbp2_target *tgt, u32 model, if (w) fw_notify("Workarounds for %s: 0x%x " "(firmware_revision 0x%06x, model_id 0x%06x)\n", - tgt->unit->device.bus_id, - w, firmware_revision, model); + tgt->bus_id, w, firmware_revision, model); tgt->workarounds = w; } @@ -888,6 +1072,7 @@ static int sbp2_probe(struct device *dev) tgt->unit = unit; kref_init(&tgt->kref); INIT_LIST_HEAD(&tgt->lu_list); + tgt->bus_id = unit->device.bus_id; if (fw_device_enable_phys_dma(device) < 0) goto fail_shost_put; @@ -938,10 +1123,13 @@ static void sbp2_reconnect(struct work_struct *work) { struct sbp2_logical_unit *lu = container_of(work, struct sbp2_logical_unit, work.work); - struct fw_unit *unit = lu->tgt->unit; - struct fw_device *device = fw_device(unit->device.parent); + struct sbp2_target *tgt = lu->tgt; + struct fw_device *device = fw_device(tgt->unit->device.parent); int generation, node_id, local_node_id; + if (fw_device_is_shutdown(device)) + goto out; + generation = device->generation; smp_rmb(); /* node_id must not be older than generation */ node_id = device->node_id; @@ -950,10 +1138,17 @@ static void sbp2_reconnect(struct work_struct *work) if (sbp2_send_management_orb(lu, node_id, generation, SBP2_RECONNECT_REQUEST, lu->login_id, NULL) < 0) { - if (lu->retries++ >= 5) { - fw_error("failed to reconnect to %s\n", - unit->device.bus_id); - /* Fall back and try to log in again. */ + /* + * If reconnect was impossible even though we are in the + * current generation, fall back and try to log in again. + * + * We could check for "Function rejected" status, but + * looking at the bus generation as simpler and more general. 
+ */ + smp_rmb(); /* get current card generation */ + if (generation == device->card->generation || + lu->retries++ >= 5) { + fw_error("%s: failed to reconnect\n", tgt->bus_id); lu->retries = 0; PREPARE_DELAYED_WORK(&lu->work, sbp2_login); } @@ -961,17 +1156,18 @@ static void sbp2_reconnect(struct work_struct *work) goto out; } - lu->generation = generation; - lu->tgt->node_id = node_id; - lu->tgt->address_high = local_node_id << 16; + tgt->node_id = node_id; + tgt->address_high = local_node_id << 16; + sbp2_set_generation(lu, generation); - fw_notify("reconnected to %s LUN %04x (%d retries)\n", - unit->device.bus_id, lu->lun, lu->retries); + fw_notify("%s: reconnected to LUN %04x (%d retries)\n", + tgt->bus_id, lu->lun, lu->retries); sbp2_agent_reset(lu); sbp2_cancel_orbs(lu); + sbp2_conditionally_unblock(lu); out: - sbp2_target_put(lu->tgt); + sbp2_target_put(tgt); } static void sbp2_update(struct fw_unit *unit) @@ -986,6 +1182,7 @@ static void sbp2_update(struct fw_unit *unit) * Iteration over tgt->lu_list is therefore safe here. */ list_for_each_entry(lu, &tgt->lu_list, link) { + sbp2_conditionally_block(lu); lu->retries = 0; sbp2_queue_work(lu, 0); } @@ -1063,7 +1260,7 @@ complete_command_orb(struct sbp2_orb *base_orb, struct sbp2_status *status) if (status != NULL) { if (STATUS_GET_DEAD(*status)) - sbp2_agent_reset(orb->lu); + sbp2_agent_reset_no_wait(orb->lu); switch (STATUS_GET_RESPONSE(*status)) { case SBP2_STATUS_REQUEST_COMPLETE: @@ -1089,6 +1286,7 @@ complete_command_orb(struct sbp2_orb *base_orb, struct sbp2_status *status) * or when sending the write (less likely). */ result = DID_BUS_BUSY << 16; + sbp2_conditionally_block(orb->lu); } dma_unmap_single(device->card->device, orb->base.request_bus, @@ -1197,7 +1395,7 @@ static int sbp2_scsi_queuecommand(struct scsi_cmnd *cmd, scsi_done_fn_t done) struct sbp2_logical_unit *lu = cmd->device->hostdata; struct fw_device *device = fw_device(lu->tgt->unit->device.parent); struct sbp2_command_orb *orb; - unsigned max_payload; + unsigned int max_payload; int retval = SCSI_MLQUEUE_HOST_BUSY; /* @@ -1275,6 +1473,10 @@ static int sbp2_scsi_slave_alloc(struct scsi_device *sdev) { struct sbp2_logical_unit *lu = sdev->hostdata; + /* (Re-)Adding logical units via the SCSI stack is not supported. */ + if (!lu) + return -ENOSYS; + sdev->allow_restart = 1; /* @@ -1319,7 +1521,7 @@ static int sbp2_scsi_abort(struct scsi_cmnd *cmd) { struct sbp2_logical_unit *lu = cmd->device->hostdata; - fw_notify("sbp2_scsi_abort\n"); + fw_notify("%s: sbp2_scsi_abort\n", lu->tgt->bus_id); sbp2_agent_reset(lu); sbp2_cancel_orbs(lu); diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c index 28e155a9e2a5..9e2b1964d71a 100644 --- a/drivers/ieee1394/sbp2.c +++ b/drivers/ieee1394/sbp2.c @@ -183,6 +183,9 @@ MODULE_PARM_DESC(exclusive_login, "Exclusive login to sbp2 device " * Avoids access beyond actual disk limits on devices with an off-by-one bug. * Don't use this with devices which don't have this bug. * + * - delay inquiry + * Wait extra SBP2_INQUIRY_DELAY seconds after login before SCSI inquiry. + * * - override internal blacklist * Instead of adding to the built-in blacklist, use only the workarounds * specified in the module load parameter. 
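The sbp2_conditionally_block()/sbp2_conditionally_unblock() pair added to fw-sbp2 above turns SCSI blocking on at the first logical unit whose generation goes stale and off again once the last blocked one has caught up, but never while login/INQUIRY setup is still in flight (dont_block != 0). A simplified stand-alone sketch of just that counting logic (illustration only; the real code runs under card->lock and calls scsi_block_requests()/scsi_unblock_requests() on the counter transitions):

#include <stdbool.h>
#include <stdio.h>

struct tgt { int dont_block; int blocked; };
struct lu  { struct tgt *tgt; bool blocked; int generation; };

static int card_generation;

static void conditionally_block(struct lu *lu)
{
	struct tgt *t = lu->tgt;

	/* block the whole target on the 0 -> 1 transition */
	if (!t->dont_block && !lu->blocked &&
	    lu->generation != card_generation) {
		lu->blocked = true;
		if (++t->blocked == 1)
			printf("scsi_block_requests()\n");
	}
}

static void conditionally_unblock(struct lu *lu)
{
	struct tgt *t = lu->tgt;

	/* unblock only on the 1 -> 0 transition */
	if (lu->blocked && lu->generation == card_generation) {
		lu->blocked = false;
		if (--t->blocked == 0)
			printf("scsi_unblock_requests()\n");
	}
}

int main(void)
{
	struct tgt t = { 0, 0 };
	struct lu  a = { &t, false, 0 };

	card_generation = 1;     /* bus reset: a's generation is stale */
	conditionally_block(&a);   /* blocks the target */
	a.generation = 1;          /* reconnect succeeded */
	conditionally_unblock(&a); /* unblocks the target */
	return 0;
}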
@@ -195,6 +198,7 @@ MODULE_PARM_DESC(workarounds, "Work around device bugs (default = 0" ", 36 byte inquiry = " __stringify(SBP2_WORKAROUND_INQUIRY_36) ", skip mode page 8 = " __stringify(SBP2_WORKAROUND_MODE_SENSE_8) ", fix capacity = " __stringify(SBP2_WORKAROUND_FIX_CAPACITY) + ", delay inquiry = " __stringify(SBP2_WORKAROUND_DELAY_INQUIRY) ", override internal blacklist = " __stringify(SBP2_WORKAROUND_OVERRIDE) ", or a combination)"); @@ -357,6 +361,11 @@ static const struct { .workarounds = SBP2_WORKAROUND_INQUIRY_36 | SBP2_WORKAROUND_MODE_SENSE_8, }, + /* DViCO Momobay FX-3A with TSB42AA9A bridge */ { + .firmware_revision = 0x002800, + .model_id = 0x000000, + .workarounds = SBP2_WORKAROUND_DELAY_INQUIRY, + }, /* Initio bridges, actually only needed for some older ones */ { .firmware_revision = 0x000200, .model_id = SBP2_ROM_VALUE_WILDCARD, @@ -914,6 +923,9 @@ static int sbp2_start_device(struct sbp2_lu *lu) sbp2_agent_reset(lu, 1); sbp2_max_speed_and_size(lu); + if (lu->workarounds & SBP2_WORKAROUND_DELAY_INQUIRY) + ssleep(SBP2_INQUIRY_DELAY); + error = scsi_add_device(lu->shost, 0, lu->ud->id, 0); if (error) { SBP2_ERR("scsi_add_device failed"); @@ -1962,6 +1974,9 @@ static int sbp2scsi_slave_alloc(struct scsi_device *sdev) { struct sbp2_lu *lu = (struct sbp2_lu *)sdev->host->hostdata[0]; + if (sdev->lun != 0 || sdev->id != lu->ud->id || sdev->channel != 0) + return -ENODEV; + lu->sdev = sdev; sdev->allow_restart = 1; diff --git a/drivers/ieee1394/sbp2.h b/drivers/ieee1394/sbp2.h index d2ecb0d8a1bb..80d8e097b065 100644 --- a/drivers/ieee1394/sbp2.h +++ b/drivers/ieee1394/sbp2.h @@ -343,6 +343,8 @@ enum sbp2lu_state_types { #define SBP2_WORKAROUND_INQUIRY_36 0x2 #define SBP2_WORKAROUND_MODE_SENSE_8 0x4 #define SBP2_WORKAROUND_FIX_CAPACITY 0x8 +#define SBP2_WORKAROUND_DELAY_INQUIRY 0x10 +#define SBP2_INQUIRY_DELAY 12 #define SBP2_WORKAROUND_OVERRIDE 0x100 #endif /* SBP2_H */ diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 33888bb58144..2c23bade9aa6 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -46,7 +46,7 @@ const struct file_operations ext4_dir_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif - .fsync = ext4_sync_file, /* BKL held */ + .fsync = ext4_sync_file, .release = ext4_release_dir, }; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bc7081f1fbe8..9ae6e67090cd 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -148,6 +148,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, { struct ext4_inode_info *ei = EXT4_I(inode); ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; ext4_grpblk_t colour; int depth; @@ -169,8 +170,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, /* OK. 
use inode's group */ bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); - colour = (current->pid % 16) * + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); return bg_start + colour + block; } @@ -349,7 +355,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) #define ext4_ext_show_leaf(inode,path) #endif -static void ext4_ext_drop_refs(struct ext4_ext_path *path) +void ext4_ext_drop_refs(struct ext4_ext_path *path) { int depth = path->p_depth; int i; @@ -2168,6 +2174,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, newblock = iblock - ee_block + ext_pblock(ex); ex2 = ex; + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* ex1: ee_block to iblock - 1 : uninitialized */ if (iblock > ee_block) { ex1 = ex; @@ -2200,16 +2210,20 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, newdepth = ext_depth(inode); if (newdepth != depth) { depth = newdepth; - path = ext4_ext_find_extent(inode, iblock, NULL); + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, iblock, path); if (IS_ERR(path)) { err = PTR_ERR(path); - path = NULL; goto out; } eh = path[depth].p_hdr; ex = path[depth].p_ext; if (ex2 != &newex) ex2 = ex; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; } allocated = max_blocks; } @@ -2230,9 +2244,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex2->ee_len = cpu_to_le16(allocated); if (ex2 != ex) goto insert; - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; /* * New (initialized) extent starts from the first block * in the current extent. i.e., ex2 == ex @@ -2276,9 +2287,22 @@ out: } /* + * Block allocation/map/preallocation routine for extents based files + * + * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + * + * return > 0, number of of blocks already mapped/allocated + * if create == 0 and these are pre-allocated blocks + * buffer head is unmapped + * otherwise blocks are mapped + * + * return = 0, if plain look up failed (blocks have not been allocated) + * buffer head is unmapped + * + * return < 0, error case. */ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, @@ -2623,7 +2647,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) * modify 1 super block, 1 block bitmap and 1 group descriptor. 
*/ credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; - down_write((&EXT4_I(inode)->i_data_sem)); + mutex_lock(&inode->i_mutex); retry: while (ret >= 0 && ret < max_blocks) { block = block + ret; @@ -2634,16 +2658,17 @@ retry: break; } - ret = ext4_ext_get_blocks(handle, inode, block, + ret = ext4_get_blocks_wrap(handle, inode, block, max_blocks, &map_bh, EXT4_CREATE_UNINITIALIZED_EXT, 0); - WARN_ON(ret <= 0); if (ret <= 0) { - ext4_error(inode->i_sb, "ext4_fallocate", - "ext4_ext_get_blocks returned error: " - "inode#%lu, block=%u, max_blocks=%lu", +#ifdef EXT4FS_DEBUG + WARN_ON(ret <= 0); + printk(KERN_ERR "%s: ext4_ext_get_blocks " + "returned error inode#%lu, block=%u, " + "max_blocks=%lu", __func__, inode->i_ino, block, max_blocks); - ret = -EIO; +#endif ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); break; @@ -2680,7 +2705,6 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; - up_write((&EXT4_I(inode)->i_data_sem)); /* * Time to update the file size. * Update only when preallocation was requested beyond the file size. @@ -2692,21 +2716,18 @@ retry: * if no error, we assume preallocation succeeded * completely */ - mutex_lock(&inode->i_mutex); i_size_write(inode, offset + len); EXT4_I(inode)->i_disksize = i_size_read(inode); - mutex_unlock(&inode->i_mutex); } else if (ret < 0 && nblocks) { /* Handle partial allocation scenario */ loff_t newsize; - mutex_lock(&inode->i_mutex); newsize = (nblocks << blkbits) + i_size_read(inode); i_size_write(inode, EXT4_BLOCK_ALIGN(newsize, blkbits)); EXT4_I(inode)->i_disksize = i_size_read(inode); - mutex_unlock(&inode->i_mutex); } } + mutex_unlock(&inode->i_mutex); return ret > 0 ? ret2 : ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index da18a74b966a..8036b9b5376b 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -702,7 +702,12 @@ got: ei->i_dir_start_lookup = 0; ei->i_disksize = 0; - ei->i_flags = EXT4_I(dir)->i_flags & ~EXT4_INDEX_FL; + /* + * Don't inherit extent flag from directory. We set extent flag on + * newly created directory and file only if -o extent mount option is + * specified + */ + ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL); if (S_ISLNK(mode)) ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL); /* dirsync only applies to directories */ @@ -745,12 +750,15 @@ got: goto fail_free_drop; } if (test_opt(sb, EXTENTS)) { - EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; - ext4_ext_tree_init(handle, inode); - err = ext4_update_incompat_feature(handle, sb, - EXT4_FEATURE_INCOMPAT_EXTENTS); - if (err) - goto fail; + /* set extent flag only for directory and file */ + if (S_ISDIR(mode) || S_ISREG(mode)) { + EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; + ext4_ext_tree_init(handle, inode); + err = ext4_update_incompat_feature(handle, sb, + EXT4_FEATURE_INCOMPAT_EXTENTS); + if (err) + goto fail; + } } ext4_debug("allocating inode %lu\n", inode->i_ino); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7dd9b50d5ebc..945cbf6cb1fc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -403,6 +403,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; __le32 *p; ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; ext4_grpblk_t colour; /* Try to find previous block */ @@ -420,8 +421,13 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) * into the same cylinder group then. 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7dd9b50d5ebc..945cbf6cb1fc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -403,6 +403,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 	__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
 	__le32 *p;
 	ext4_fsblk_t bg_start;
+	ext4_fsblk_t last_block;
 	ext4_grpblk_t colour;
 
 	/* Try to find previous block */
@@ -420,8 +421,13 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 	 * into the same cylinder group then.
 	 */
 	bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
-	colour = (current->pid % 16) *
+	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+
+	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
+		colour = (current->pid % 16) *
 			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+	else
+		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
 	return bg_start + colour;
 }
 
@@ -768,7 +774,6 @@ err_out:
 *
 * `handle' can be NULL if create == 0.
 *
- * The BKL may not be held on entry here. Be sure to take it early.
 * return > 0, # of blocks mapped or allocated.
 * return = 0, if plain lookup failed.
 * return < 0, error case.
@@ -903,11 +908,38 @@ out:
 */
 #define DIO_CREDITS 25
 
+/*
+ * ext4 get_block() wrapper function.
+ * It does a lookup first, and returns if the blocks are already mapped.
+ * Otherwise it takes the write lock of i_data_sem, allocates blocks,
+ * stores the allocated blocks in the result buffer head and marks it
+ * mapped.
+ *
+ * If the file is extent based, it calls ext4_ext_get_blocks();
+ * otherwise it calls ext4_get_blocks_handle() to handle indirect-mapped
+ * files.
+ *
+ * On success, it returns the number of blocks mapped or allocated.
+ * If create == 0 and the blocks are pre-allocated and uninitialized,
+ * the result buffer head is unmapped. If create == 1, it makes sure
+ * the buffer head is mapped.
+ *
+ * It returns 0 if a plain lookup failed (blocks have not been allocated);
+ * in that case, the buffer head is unmapped.
+ *
+ * It returns the error in case of allocation failure.
+ */
 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 			unsigned long max_blocks, struct buffer_head *bh,
 			int create, int extend_disksize)
 {
 	int retval;
+
+	clear_buffer_mapped(bh);
+
 	/*
 	 * Try to see if we can get the block without requesting
 	 * for new file system block.
@@ -921,12 +953,26 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 				inode, block, max_blocks, bh, 0, 0);
 	}
 	up_read((&EXT4_I(inode)->i_data_sem));
-	if (!create || (retval > 0))
+
+	/* If it is only a block(s) lookup */
+	if (!create)
+		return retval;
+
+	/*
+	 * Return if the blocks have already been allocated.
+	 *
+	 * Note that if blocks have been preallocated,
+	 * ext4_ext_get_blocks() returns with create = 0
+	 * and the buffer head unmapped.
+	 */
+	if (retval > 0 && buffer_mapped(bh))
 		return retval;
 
 	/*
-	 * We need to allocate new blocks which will result
-	 * in i_data update
+	 * New block allocation and/or writing to an uninitialized extent
+	 * may result in updating i_data, so we take the write lock of
+	 * i_data_sem and call get_blocks() with the create == 1 flag.
 	 */
 	down_write((&EXT4_I(inode)->i_data_sem));
 	/*
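The wrapper documented above follows a common optimistic pattern: a cheap lookup under the read side of i_data_sem, and only on a miss with create set, a retry under the write side. A userspace analogue using a pthread rwlock (toy names and a toy block map; the recheck under the write lock mirrors the kernel re-calling get_blocks after dropping the read lock):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_rwlock_t i_data_sem = PTHREAD_RWLOCK_INITIALIZER;
	static int mapped[64];			/* toy "block map" */

	static int get_block(int block, int create)
	{
		int ret;

		pthread_rwlock_rdlock(&i_data_sem);	/* cheap lookup, shared */
		ret = mapped[block];
		pthread_rwlock_unlock(&i_data_sem);

		if (ret || !create)			/* hit, or plain lookup */
			return ret;

		pthread_rwlock_wrlock(&i_data_sem);	/* allocation mutates map */
		if (!mapped[block])			/* recheck: lock was dropped */
			mapped[block] = 1;
		ret = mapped[block];
		pthread_rwlock_unlock(&i_data_sem);
		return ret;
	}

	int main(void)
	{
		printf("lookup: %d\n", get_block(7, 0));	/* 0: not mapped */
		printf("create: %d\n", get_block(7, 1));	/* 1: allocated */
		return 0;
	}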
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index dd0fcfcb35ce..ef97f19c2f9d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -627,21 +627,19 @@ static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
 	return block;
 }
 
+static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
+{
 #if BITS_PER_LONG == 64
-#define mb_correct_addr_and_bit(bit, addr)		\
-{							\
-	bit += ((unsigned long) addr & 7UL) << 3;	\
-	addr = (void *) ((unsigned long) addr & ~7UL);	\
-}
+	*bit += ((unsigned long) addr & 7UL) << 3;
+	addr = (void *) ((unsigned long) addr & ~7UL);
 #elif BITS_PER_LONG == 32
-#define mb_correct_addr_and_bit(bit, addr)		\
-{							\
-	bit += ((unsigned long) addr & 3UL) << 3;	\
-	addr = (void *) ((unsigned long) addr & ~3UL);	\
-}
+	*bit += ((unsigned long) addr & 3UL) << 3;
+	addr = (void *) ((unsigned long) addr & ~3UL);
 #else
 #error "how many bits you are?!"
 #endif
+	return addr;
+}
 
 static inline int mb_test_bit(int bit, void *addr)
 {
@@ -649,34 +647,54 @@ static inline int mb_test_bit(int bit, void *addr)
 	 * ext4_test_bit on architecture like powerpc
 	 * needs unsigned long aligned address
 	 */
-	mb_correct_addr_and_bit(bit, addr);
+	addr = mb_correct_addr_and_bit(&bit, addr);
 	return ext4_test_bit(bit, addr);
 }
 
 static inline void mb_set_bit(int bit, void *addr)
 {
-	mb_correct_addr_and_bit(bit, addr);
+	addr = mb_correct_addr_and_bit(&bit, addr);
 	ext4_set_bit(bit, addr);
 }
 
 static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
 {
-	mb_correct_addr_and_bit(bit, addr);
+	addr = mb_correct_addr_and_bit(&bit, addr);
 	ext4_set_bit_atomic(lock, bit, addr);
 }
 
 static inline void mb_clear_bit(int bit, void *addr)
 {
-	mb_correct_addr_and_bit(bit, addr);
+	addr = mb_correct_addr_and_bit(&bit, addr);
 	ext4_clear_bit(bit, addr);
 }
 
 static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
 {
-	mb_correct_addr_and_bit(bit, addr);
+	addr = mb_correct_addr_and_bit(&bit, addr);
 	ext4_clear_bit_atomic(lock, bit, addr);
 }
 
+static inline int mb_find_next_zero_bit(void *addr, int max, int start)
+{
+	int fix = 0;
+	addr = mb_correct_addr_and_bit(&fix, addr);
+	max += fix;
+	start += fix;
+
+	return ext4_find_next_zero_bit(addr, max, start) - fix;
+}
+
+static inline int mb_find_next_bit(void *addr, int max, int start)
+{
+	int fix = 0;
+	addr = mb_correct_addr_and_bit(&fix, addr);
+	max += fix;
+	start += fix;
+
+	return ext4_find_next_bit(addr, max, start) - fix;
+}
+
 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
 {
 	char *bb;
@@ -906,7 +924,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
 	unsigned short chunk;
 	unsigned short border;
 
-	BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb));
+	BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
 
 	border = 2 << sb->s_blocksize_bits;
 
@@ -946,12 +964,12 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
 
 	/* initialize buddy from bitmap which is aggregation
 	 * of on-disk bitmap and preallocations */
-	i = ext4_find_next_zero_bit(bitmap, max, 0);
+	i = mb_find_next_zero_bit(bitmap, max, 0);
 	grp->bb_first_free = i;
 	while (i < max) {
 		fragments++;
 		first = i;
-		i = ext4_find_next_bit(bitmap, max, i);
+		i = mb_find_next_bit(bitmap, max, i);
 		len = i - first;
 		free += len;
 		if (len > 1)
@@ -959,7 +977,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
 		else
 			grp->bb_counters[0]++;
 		if (i < max)
-			i = ext4_find_next_zero_bit(bitmap, max, i);
+			i = mb_find_next_zero_bit(bitmap, max, i);
 	}
 	grp->bb_fragments = fragments;
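The mb_* helpers above exist because the find-bit primitives on some architectures (powerpc, for instance) require an unsigned-long-aligned address; the fix folds the misaligned low bytes of the pointer into the bit offset instead of passing the raw address through. A userspace re-creation of the 64-bit case (the stack buffer's alignment is not guaranteed, which is why the delta is computed rather than assumed):

	#include <stdio.h>

	/* userspace copy of the 64-bit branch of mb_correct_addr_and_bit() */
	static void *correct_addr_and_bit(int *bit, void *addr)
	{
		*bit += ((unsigned long) addr & 7UL) << 3;	/* 8 bits/byte */
		return (void *) ((unsigned long) addr & ~7UL);	/* round down */
	}

	int main(void)
	{
		char buf[16];
		int bit = 5;
		void *p = correct_addr_and_bit(&bit, buf + 3);

		/* if buf happens to be 8-byte aligned: p == buf, bit == 5 + 3*8 */
		printf("misaligned by %td bytes, folded into bit index %d\n",
		       (char *)(buf + 3) - (char *)p, bit);
		return 0;
	}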
@@ -967,6 +985,10 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
 		ext4_error(sb, __FUNCTION__,
 			"EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n",
 			group, free, grp->bb_free);
+		/*
+		 * If we intend to continue, we consider the group descriptor
+		 * corrupt and update bb_free using the bitmap value.
+		 */
 		grp->bb_free = free;
 	}
 
@@ -1778,7 +1800,7 @@ static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
 		buddy = mb_find_buddy(e4b, i, &max);
 		BUG_ON(buddy == NULL);
 
-		k = ext4_find_next_zero_bit(buddy, max, 0);
+		k = mb_find_next_zero_bit(buddy, max, 0);
 		BUG_ON(k >= max);
 
 		ac->ac_found++;
@@ -1818,11 +1840,11 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 	i = e4b->bd_info->bb_first_free;
 
 	while (free && ac->ac_status == AC_STATUS_CONTINUE) {
-		i = ext4_find_next_zero_bit(bitmap,
+		i = mb_find_next_zero_bit(bitmap,
 						EXT4_BLOCKS_PER_GROUP(sb), i);
 		if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
 			/*
-			 * IF we corrupt the bitmap we won't find any
+			 * If we have a corrupt bitmap, we won't find any
 			 * free blocks even though group info says
 			 * we have free blocks
 			 */
@@ -1838,6 +1860,12 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 			ext4_error(sb, __FUNCTION__, "%d free blocks as per "
 					"group info. But got %d blocks\n",
 					free, ex.fe_len);
+			/*
+			 * The number of free blocks differs. This mostly
+			 * indicates that the bitmap is corrupt, so exit
+			 * without claiming the space.
+			 */
+			break;
 		}
 
 		ext4_mb_measure_extent(ac, &ex, e4b);
@@ -3740,10 +3768,10 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,
 	}
 
 	while (bit < end) {
-		bit = ext4_find_next_zero_bit(bitmap_bh->b_data, end, bit);
+		bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
 		if (bit >= end)
 			break;
-		next = ext4_find_next_bit(bitmap_bh->b_data, end, bit);
+		next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
 		if (next > end)
 			next = end;
 		start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
@@ -3771,6 +3799,10 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,
 				(unsigned long) pa->pa_len);
 		ext4_error(sb, __FUNCTION__, "free %u, pa_free %u\n",
 						free, pa->pa_free);
+		/*
+		 * pa is already deleted, so we use the value obtained
+		 * from the bitmap and continue.
+		 */
 	}
 	atomic_add(free, &sbi->s_mb_discarded);
 	if (ac)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 8c6c685b9d22..5c1e27de7755 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -43,6 +43,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
 
 	if (IS_ERR(path)) {
 		retval = PTR_ERR(path);
+		path = NULL;
 		goto err_out;
 	}
 
@@ -74,6 +75,10 @@ static int finish_range(handle_t *handle, struct inode *inode,
 	}
 	retval = ext4_ext_insert_extent(handle, inode, path, &newext);
 err_out:
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
 	lb->first_pblock = 0;
 	return retval;
 }
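ext4_mb_generate_buddy() above walks the block bitmap as alternating runs: find the next zero bit (the start of a free run), then the next set bit (its end), accumulating free-block and fragment counts as it goes. A toy, runnable version of that scan (find_next_bit_val is a stand-in for the mb_find_next_*_bit helpers, not the kernel API):

	#include <stdio.h>

	static int find_next_bit_val(const unsigned char *bm, int max,
				     int start, int val)
	{
		while (start < max && ((bm[start / 8] >> (start % 8)) & 1) != val)
			start++;
		return start;
	}

	int main(void)
	{
		unsigned char bitmap[2] = { 0x0f, 0xf0 }; /* blocks 0-3, 12-15 used */
		int max = 16, free = 0, fragments = 0;
		int i = find_next_bit_val(bitmap, max, 0, 0); /* first free block */

		while (i < max) {
			int first = i;

			i = find_next_bit_val(bitmap, max, i, 1); /* end of run */
			fragments++;
			free += i - first;
			if (i < max)
				i = find_next_bit_val(bitmap, max, i, 0);
		}
		printf("free=%d fragments=%d\n", free, fragments); /* 8 and 1 */
		return 0;
	}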
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a9347fb43bcc..28aa2ed4297e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1804,12 +1804,8 @@ retry:
 	inode->i_fop = &ext4_dir_operations;
 	inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
 	dir_block = ext4_bread (handle, inode, 0, 1, &err);
-	if (!dir_block) {
-		ext4_dec_count(handle, inode); /* is this nlink == 0? */
-		ext4_mark_inode_dirty(handle, inode);
-		iput (inode);
-		goto out_stop;
-	}
+	if (!dir_block)
+		goto out_clear_inode;
 	BUFFER_TRACE(dir_block, "get_write_access");
 	ext4_journal_get_write_access(handle, dir_block);
 	de = (struct ext4_dir_entry_2 *) dir_block->b_data;
@@ -1832,7 +1828,8 @@ retry:
 	ext4_mark_inode_dirty(handle, inode);
 	err = ext4_add_entry (handle, dentry, inode);
 	if (err) {
-		inode->i_nlink = 0;
+out_clear_inode:
+		clear_nlink(inode);
 		ext4_mark_inode_dirty(handle, inode);
 		iput (inode);
 		goto out_stop;
@@ -2164,7 +2161,7 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
 	dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
 	ext4_update_dx_flag(dir);
 	ext4_mark_inode_dirty(handle, dir);
-	ext4_dec_count(handle, inode);
+	drop_nlink(inode);
 	if (!inode->i_nlink)
 		ext4_orphan_add(handle, inode);
 	inode->i_ctime = ext4_current_time(inode);
@@ -2214,7 +2211,7 @@ retry:
 	err = __page_symlink(inode, symname, l,
 			mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
 	if (err) {
-		ext4_dec_count(handle, inode);
+		clear_nlink(inode);
 		ext4_mark_inode_dirty(handle, inode);
 		iput (inode);
 		goto out_stop;
@@ -2223,7 +2220,6 @@ retry:
 		inode->i_op = &ext4_fast_symlink_inode_operations;
 		memcpy((char*)&EXT4_I(inode)->i_data,symname,l);
 		inode->i_size = l-1;
-		EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
 	}
 	EXT4_I(inode)->i_disksize = inode->i_size;
 	err = ext4_add_nondir(handle, dentry, inode);
@@ -2407,7 +2403,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 		ext4_dec_count(handle, old_dir);
 		if (new_inode) {
 			/* checked empty_dir above, can't have another parent,
-			 * ext3_dec_count() won't work for many-linked dirs */
+			 * ext4_dec_count() won't work for many-linked dirs */
 			new_inode->i_nlink = 0;
 		} else {
 			ext4_inc_count(handle, new_dir);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 9477a2bd6ff2..e29efa0f9d62 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1037,6 +1037,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 		ext4_warning(sb, __FUNCTION__,
 			     "multiple resizers run on filesystem!");
 		unlock_super(sb);
+		ext4_journal_stop(handle);
 		err = -EBUSY;
 		goto exit_put;
 	}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 96ee899d6502..91a1bd67ac1d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -314,9 +314,12 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 static int lstats_show_proc(struct seq_file *m, void *v)
 {
 	int i;
-	struct task_struct *task = m->private;
 
-	seq_puts(m, "Latency Top version : v0.1\n");
+	struct inode *inode = m->private;
+	struct task_struct *task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+	seq_puts(m, "Latency Top version : v0.1\n");
 	for (i = 0; i < 32; i++) {
 		if (task->latency_record[i].backtrace[0]) {
 			int q;
@@ -341,32 +344,24 @@ static int lstats_show_proc(struct seq_file *m, void *v)
 		}
 	}
 
+	put_task_struct(task);
 	return 0;
 }
 
 static int lstats_open(struct inode *inode, struct file *file)
 {
-	int ret;
-	struct seq_file *m;
-	struct task_struct *task = get_proc_task(inode);
-
-	ret = single_open(file, lstats_show_proc, NULL);
-	if (!ret) {
-		m = file->private_data;
-		m->private = task;
-	}
-	return ret;
+	return single_open(file, lstats_show_proc, inode);
 }
 
 static ssize_t lstats_write(struct file *file, const char __user *buf,
 			    size_t count, loff_t *offs)
 {
-	struct seq_file *m;
-	struct task_struct *task;
+	struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
 
-	m = file->private_data;
-	task = m->private;
+	if (!task)
+		return -ESRCH;
 	clear_all_latency_tracing(task);
+	put_task_struct(task);
 
 	return count;
 }
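The /proc latency fix above is an instance of a general seq_file rule: stash the inode in m->private at open time and resolve the task_struct at show time, taking and dropping a reference around each use, so a task that exits between open() and read() is never dereferenced stale. A kernel-style sketch of the pattern (example_open/example_show are illustrative names, and the snippet assumes the fs/proc context where get_proc_task() is visible; it is not buildable standalone):

	/* resolve the task when the file is read, not when it is opened */
	static int example_show(struct seq_file *m, void *v)
	{
		struct inode *inode = m->private;	/* stashed by single_open() */
		struct task_struct *task = get_proc_task(inode);

		if (!task)
			return -ESRCH;			/* task exited meanwhile */
		seq_printf(m, "pid: %d\n", task->pid);
		put_task_struct(task);			/* drop the reference */
		return 0;
	}

	static int example_open(struct inode *inode, struct file *file)
	{
		return single_open(file, example_show, inode);
	}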
diff --git a/include/asm-x86/futex.h b/include/asm-x86/futex.h
index cd9f894dd2d7..c9952ea9f698 100644
--- a/include/asm-x86/futex.h
+++ b/include/asm-x86/futex.h
@@ -102,6 +102,13 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 static inline int
 futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
+	/* Real i386 machines have no cmpxchg instruction */
+	if (boot_cpu_data.x86 == 3)
+		return -ENOSYS;
+#endif
+
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
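The guard above exists because the 80386 (CPU family 3) predates the cmpxchg instruction, so this path must report -ENOSYS instead of faulting at runtime. The compare-and-swap semantics being guarded can be seen from userspace through a GCC builtin (runnable on any CPU that does have the instruction):

	#include <stdio.h>

	int main(void)
	{
		int uval = 42;

		/* swaps only when the current value matches the expected one */
		int old = __sync_val_compare_and_swap(&uval, 42, 99);
		printf("old=%d new=%d\n", old, uval);	/* old=42 new=99 */

		old = __sync_val_compare_and_swap(&uval, 42, 7); /* mismatch */
		printf("old=%d new=%d\n", old, uval);	/* old=99 new=99 */
		return 0;
	}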
".byte 0x8D,0x04,0x05,0,0,0,0\n" +#define K7_NOP8 K7_NOP7 ASM_NOP1 -/* P6 nops */ -/* uses eax dependencies (Intel-recommended choice) */ +/* P6 nops + uses eax dependencies (Intel-recommended choice) + 1: nop + 2: osp nop + 3: nopl (%eax) + 4: nopl 0x00(%eax) + 5: nopl 0x00(%eax,%eax,1) + 6: osp nopl 0x00(%eax,%eax,1) + 7: nopl 0x00000000(%eax) + 8: nopl 0x00000000(%eax,%eax,1) +*/ #define P6_NOP1 GENERIC_NOP1 #define P6_NOP2 ".byte 0x66,0x90\n" #define P6_NOP3 ".byte 0x0f,0x1f,0x00\n" @@ -63,9 +91,7 @@ #define ASM_NOP6 K7_NOP6 #define ASM_NOP7 K7_NOP7 #define ASM_NOP8 K7_NOP8 -#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \ - defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \ - defined(CONFIG_MCORE2) || defined(CONFIG_PENTIUM4) +#elif defined(CONFIG_X86_P6_NOP) #define ASM_NOP1 P6_NOP1 #define ASM_NOP2 P6_NOP2 #define ASM_NOP3 P6_NOP3 diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h index f7393bc516ef..143546073b95 100644 --- a/include/asm-x86/page_64.h +++ b/include/asm-x86/page_64.h @@ -47,8 +47,12 @@ #define __PHYSICAL_MASK_SHIFT 46 #define __VIRTUAL_MASK_SHIFT 48 -#define KERNEL_TEXT_SIZE (40*1024*1024) -#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL) +/* + * Kernel image size is limited to 128 MB (see level2_kernel_pgt in + * arch/x86/kernel/head_64.S), and it is mapped here: + */ +#define KERNEL_IMAGE_SIZE (128*1024*1024) +#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL) #ifndef __ASSEMBLY__ void clear_page(void *page); diff --git a/include/linux/ext4_fs_extents.h b/include/linux/ext4_fs_extents.h index 697da4bce6c5..1285c583b2d8 100644 --- a/include/linux/ext4_fs_extents.h +++ b/include/linux/ext4_fs_extents.h @@ -227,5 +227,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, ext4_lblk_t *, ext4_fsblk_t *); extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, ext4_lblk_t *, ext4_fsblk_t *); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); #endif /* _LINUX_EXT4_EXTENTS */ diff --git a/include/linux/sched.h b/include/linux/sched.h index e217d188a102..2c9621f8bf87 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -242,6 +242,7 @@ struct task_struct; extern void sched_init(void); extern void sched_init_smp(void); +extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); @@ -1189,7 +1190,7 @@ struct task_struct { int softirq_context; #endif #ifdef CONFIG_LOCKDEP -# define MAX_LOCK_DEPTH 30UL +# define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; struct held_lock held_locks[MAX_LOCK_DEPTH]; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3574379f4d62..81a4e4a3f087 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -779,6 +779,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * parallel walking of the hash-list safe: */ list_add_tail_rcu(&class->hash_entry, hash_head); + /* + * Add it to the global list of classes: + */ + list_add_tail_rcu(&class->lock_entry, &all_lock_classes); if (verbose(class)) { graph_unlock(); @@ -2282,10 +2286,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; break; case LOCK_USED: - /* - * Add it to the global list of classes: - */ - list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); debug_atomic_dec(&nr_unused_locks); break; default: diff --git a/kernel/printk.c b/kernel/printk.c index 
diff --git a/kernel/printk.c b/kernel/printk.c
index bee36100f110..9adc2a473e6e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -666,7 +666,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	}
 
 	/* Emit the output into the temporary buffer */
 	printed_len += vscnprintf(printk_buf + printed_len,
-				  sizeof(printk_buf), fmt, args);
+				  sizeof(printk_buf) - printed_len, fmt, args);
 
 	/*
 	 * Copy the output into log_buf. If the caller didn't provide
diff --git a/kernel/sched.c b/kernel/sched.c
index b387a8de26a5..f06950c8a6ce 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -668,6 +668,8 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
 */
 unsigned int sysctl_sched_rt_period = 1000000;
 
+static __read_mostly int scheduler_running;
+
 /*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
@@ -689,14 +691,16 @@ unsigned long long cpu_clock(int cpu)
 	unsigned long flags;
 	struct rq *rq;
 
-	local_irq_save(flags);
-	rq = cpu_rq(cpu);
 	/*
 	 * Only call sched_clock() if the scheduler has already been
 	 * initialized (some code might call cpu_clock() very early):
 	 */
-	if (rq->idle)
-		update_rq_clock(rq);
+	if (unlikely(!scheduler_running))
+		return 0;
+
+	local_irq_save(flags);
+	rq = cpu_rq(cpu);
+	update_rq_clock(rq);
 	now = rq->clock;
 	local_irq_restore(flags);
 
@@ -3885,7 +3889,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
 asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
-	long *switch_count;
+	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
@@ -7284,6 +7288,8 @@ void __init sched_init(void)
 	 * During early bootup we pretend to be a normal task:
 	 */
 	current->sched_class = &fair_sched_class;
+
+	scheduler_running = 1;
 }
 
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6c091d6e159d..c8e6492c5925 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -202,17 +202,12 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 
 static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
-	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
-	struct sched_entity *se = NULL;
-	struct rb_node *parent;
+	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
 
-	while (*link) {
-		parent = *link;
-		se = rb_entry(parent, struct sched_entity, run_node);
-		link = &parent->rb_right;
-	}
+	if (!last)
+		return NULL;
 
-	return se;
+	return rb_entry(last, struct sched_entity, run_node);
 }
 
 /**************************************************************
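The vprintk() fix above corrects a classic append-at-offset bug: once printed_len characters already occupy the buffer, only sizeof(buf) - printed_len bytes remain for the next write. A runnable demonstration of the corrected pattern, with a userspace stand-in for the kernel's vscnprintf() (which returns the characters actually written, not the would-be length):

	#include <stdarg.h>
	#include <stdio.h>

	/* scnprintf-style append: never writes past buf[size-1] and returns
	   the number of characters actually stored (excluding the NUL) */
	static size_t scn(char *buf, size_t size, size_t used, const char *fmt, ...)
	{
		va_list ap;
		int n;

		if (used >= size)
			return 0;
		va_start(ap, fmt);
		n = vsnprintf(buf + used, size - used, fmt, ap); /* corrected form */
		va_end(ap);
		if (n < 0)
			return 0;
		return (size_t)n < size - used ? (size_t)n : size - used - 1;
	}

	int main(void)
	{
		char buf[16];
		size_t used = 0;

		used += scn(buf, sizeof(buf), used, "%s", "hello ");
		used += scn(buf, sizeof(buf), used, "%s", "world and then some");
		printf("%zu: %s\n", used, buf);	/* 15: hello world and */
		return 0;
	}

Passing the full sizeof(buf) on the second call, as the old code did, would have allowed vsnprintf to run printed_len bytes past the end of the buffer.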