Merge branch 'next' (accumulated 3.16 merge window patches) into master

Now that 3.15 is released, this merges the 'next' branch into 'master', bringing us to the normal situation where my 'master' branch is the merge window. * accumulated work in next: (6809 commits) ufs: sb mutex merge + mutex_destroy powerpc: update comments for generic idle conversion cris: update comments for generic idle conversion idle: remove cpu_idle() forward declarations nbd: zero from and len fields in NBD_CMD_DISCONNECT. mm: convert some level-less printks to pr_* MAINTAINERS: adi-buildroot-devel is moderated MAINTAINERS: add linux-api for review of API/ABI changes mm/kmemleak-test.c: use pr_fmt for logging fs/dlm/debug_fs.c: replace seq_printf by seq_puts fs/dlm/lockspace.c: convert simple_str to kstr fs/dlm/config.c: convert simple_str to kstr mm: mark remap_file_pages() syscall as deprecated mm: memcontrol: remove unnecessary memcg argument from soft limit functions mm: memcontrol: clean up memcg zoneinfo lookup mm/memblock.c: call kmemleak directly from memblock_(alloc|free) mm/mempool.c: update the kmemleak stack trace for mempool allocations lib/radix-tree.c: update the kmemleak stack trace for radix tree allocations mm: introduce kmemleak_update_trace() mm/kmemleak.c: use %u to print ->checksum ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2014-06-08 11:31:16 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2014-06-08 11:31:16 -0700
commit: 3f17ea6dea8ba5668873afa54628a91aaa3fb1c0 (patch)
tree: afbeb2accd4c2199ddd705ae943995b143a0af02 /arch/arm64/lib/memcpy.S
parent: 1860e379875dfe7271c649058aeddffe5afd9d0d (diff)
parent: 1a5700bc2d10cd379a795fd2bb377a190af5acd4 (diff)
1 files changed, 170 insertions, 22 deletions
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index 27b5003609b6..8a9a96d3ddae 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -1,5 +1,13 @@
 /*
  * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -16,6 +24,7 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 
 /*
  * Copy a buffer from src to dest (alignment handled by the hardware)
@@ -27,27 +36,166 @@
  * Returns:
  *	x0 - dest
  */
+dstin	.req	x0
+src	.req	x1
+count	.req	x2
+tmp1	.req	x3
+tmp1w	.req	w3
+tmp2	.req	x4
+tmp2w	.req	w4
+tmp3	.req	x5
+tmp3w	.req	w5
+dst	.req	x6
+
+A_l	.req	x7
+A_h	.req	x8
+B_l	.req	x9
+B_h	.req	x10
+C_l	.req	x11
+C_h	.req	x12
+D_l	.req	x13
+D_h	.req	x14
+
 ENTRY(memcpy)
-	mov	x4, x0
-	subs	x2, x2, #8
-	b.mi	2f
-1:	ldr	x3, [x1], #8
-	subs	x2, x2, #8
-	str	x3, [x4], #8
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-	ldr	w3, [x1], #4
-	sub	x2, x2, #4
-	str	w3, [x4], #4
-3:	adds	x2, x2, #2
-	b.mi	4f
-	ldrh	w3, [x1], #2
-	sub	x2, x2, #2
-	strh	w3, [x4], #2
-4:	adds	x2, x2, #1
-	b.mi	5f
-	ldrb	w3, [x1]
-	strb	w3, [x4]
-5:	ret
+	mov	dst, dstin
+	cmp	count, #16
+	/*When memory length is less than 16, the accessed are not aligned.*/
+	b.lo	.Ltiny15
+
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
+	b.eq	.LSrcAligned
+	sub	count, count, tmp2
+	/*
+	* Copy the leading memory data from src to dst in an increasing
+	* address order.By this way,the risk of overwritting the source
+	* memory data is eliminated when the distance between src and
+	* dst is less than 16. The memory accesses here are alignment.
+	*/
+	tbz	tmp2, #0, 1f
+	ldrb	tmp1w, [src], #1
+	strb	tmp1w, [dst], #1
+1:
+	tbz	tmp2, #1, 2f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+2:
+	tbz	tmp2, #2, 3f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+3:
+	tbz	tmp2, #3, .LSrcAligned
+	ldr	tmp1, [src],#8
+	str	tmp1, [dst],#8
+
+.LSrcAligned:
+	cmp	count, #64
+	b.ge	.Lcpy_over64
+	/*
+	* Deal with small copies quickly by dropping straight into the
+	* exit block.
+	*/
+.Ltail63:
+	/*
+	* Copy up to 48 bytes of data. At this point we only need the
+	* bottom 6 bits of count to be accurate.
+	*/
+	ands	tmp1, count, #0x30
+	b.eq	.Ltiny15
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+1:
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+2:
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+.Ltiny15:
+	/*
+	* Prefer to break one ldp/stp into several load/store to access
+	* memory in an increasing address order,rather than to load/store 16
+	* bytes from (src-16) to (dst-16) and to backward the src to aligned
+	* address,which way is used in original cortex memcpy. If keeping
+	* the original memcpy process here, memmove need to satisfy the
+	* precondition that src address is at least 16 bytes bigger than dst
+	* address,otherwise some source data will be overwritten when memove
+	* call memcpy directly. To make memmove simpler and decouple the
+	* memcpy's dependency on memmove, withdrew the original process.
+	*/
+	tbz	count, #3, 1f
+	ldr	tmp1, [src], #8
+	str	tmp1, [dst], #8
+1:
+	tbz	count, #2, 2f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+2:
+	tbz	count, #1, 3f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+3:
+	tbz	count, #0, .Lexitfunc
+	ldrb	tmp1w, [src]
+	strb	tmp1w, [dst]
+
+.Lexitfunc:
+	ret
+
+.Lcpy_over64:
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large
+	/*
+	* Less than 128 bytes to copy, so handle 64 here and then jump
+	* to the tail.
+	*/
+	ldp	A_l, A_h, [src],#16
+	stp	A_l, A_h, [dst],#16
+	ldp	B_l, B_h, [src],#16
+	ldp	C_l, C_h, [src],#16
+	stp	B_l, B_h, [dst],#16
+	stp	C_l, C_h, [dst],#16
+	ldp	D_l, D_h, [src],#16
+	stp	D_l, D_h, [dst],#16
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	ret
+
+	/*
+	* Critical loop.  Start at a new cache line boundary.  Assuming
+	* 64 bytes per line this ensures the entire loop is in one line.
+	*/
+	.p2align	L1_CACHE_SHIFT
+.Lcpy_body_large:
+	/* pre-get 64 bytes data. */
+	ldp	A_l, A_h, [src],#16
+	ldp	B_l, B_h, [src],#16
+	ldp	C_l, C_h, [src],#16
+	ldp	D_l, D_h, [src],#16
+1:
+	/*
+	* interlace the load of next 64 bytes data block with store of the last
+	* loaded 64 bytes data.
+	*/
+	stp	A_l, A_h, [dst],#16
+	ldp	A_l, A_h, [src],#16
+	stp	B_l, B_h, [dst],#16
+	ldp	B_l, B_h, [src],#16
+	stp	C_l, C_h, [dst],#16
+	ldp	C_l, C_h, [src],#16
+	stp	D_l, D_h, [dst],#16
+	ldp	D_l, D_h, [src],#16
+	subs	count, count, #64
+	b.ge	1b
+	stp	A_l, A_h, [dst],#16
+	stp	B_l, B_h, [dst],#16
+	stp	C_l, C_h, [dst],#16
+	stp	D_l, D_h, [dst],#16
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	ret
 ENDPROC(memcpy)
author	Linus Torvalds <torvalds@linux-foundation.org>	2014-06-08 11:31:16 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-06-08 11:31:16 -0700
commit	3f17ea6dea8ba5668873afa54628a91aaa3fb1c0 (patch)
tree	afbeb2accd4c2199ddd705ae943995b143a0af02 /arch/arm64/lib/memcpy.S
parent	1860e379875dfe7271c649058aeddffe5afd9d0d (diff)
parent	1a5700bc2d10cd379a795fd2bb377a190af5acd4 (diff)