/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
dst	.req	x6

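/* The A..D register pairs each hold 16 bytes of in-flight copy data. */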
A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

ENTRY(memcpy)
	mov	dst, dstin
	cmp	count, #16
	/* When the length is less than 16 bytes, the accesses are not aligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	* Copy the leading bytes from src to dst in increasing address
	* order. This eliminates the risk of overwriting the source data
	* when the distance between src and dst is less than 16. The
	* memory accesses here are aligned.
	*/
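	/* Consume 1, 2, 4 and then 8 bytes, as selected by bits 0-3 of tmp2. */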
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	* Deal with small copies quickly by dropping straight into the
	* exit block.
	*/
.Ltail63:
	/*
	* Copy up to 48 bytes of data. At this point we only need the
	* bottom 6 bits of count to be accurate.
	*/
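	/*
	* Bits 5:4 of count select the size: fall through for 48 bytes,
	* branch to 1f for 32 bytes or to 2f for 16 bytes.
	*/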
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src], #16
	stp	A_l, A_h, [dst], #16
1:
	ldp	A_l, A_h, [src], #16
	stp	A_l, A_h, [dst], #16
2:
	ldp	A_l, A_h, [src], #16
	stp	A_l, A_h, [dst], #16
.Ltiny15:
	/*
	* Prefer to break the final ldp/stp into several smaller loads and
	* stores in increasing address order, rather than loading/storing 16
	* bytes from (src-16) to (dst-16) after winding src back to an
	* aligned address, as the original cortex memcpy does. Keeping the
	* original scheme here would force memmove to guarantee that src is
	* at least 16 bytes above dst, otherwise some source data would be
	* overwritten when memmove calls memcpy directly. Dropping it keeps
	* memmove simpler and decouples memcpy from memmove.
	*/
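	/*
	* Copy the remaining 0-15 bytes: 8, 4, 2 and then 1 byte, as
	* selected by bits 3-0 of count.
	*/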
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 2f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
2:
	tbz	count, #1, 3f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]

.Lexitfunc:
	ret

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	* Less than 128 bytes to copy, so handle 64 here and then jump
	* to the tail.
	*/
	ldp	A_l, A_h, [src], #16
	stp	A_l, A_h, [dst], #16
	ldp	B_l, B_h, [src], #16
	ldp	C_l, C_h, [src], #16
	stp	B_l, B_h, [dst], #16
	stp	C_l, C_h, [dst], #16
	ldp	D_l, D_h, [src], #16
	stp	D_l, D_h, [dst], #16

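	/*
	* count went negative in the subs above, but its low 6 bits still
	* hold the 0-63 byte tail length.
	*/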
	tst	count, #0x3f
	b.ne	.Ltail63
	ret

	/*
	* Critical loop.  Start at a new cache line boundary.  Assuming
	* 64 bytes per line this ensures the entire loop is in one line.
	*/
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Preload the first 64 bytes of data. */
	ldp	A_l, A_h, [src], #16
	ldp	B_l, B_h, [src], #16
	ldp	C_l, C_h, [src], #16
	ldp	D_l, D_h, [src], #16
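	/*
	* count was biased by 128 above: 64 bytes are now in flight in
	* A..D, and each loop iteration loads a further 64 bytes ahead
	* of the exit check.
	*/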
1:
	/*
	* Interleave the load of the next 64-byte block with the store of
	* the previously loaded 64 bytes.
	*/
	stp	A_l, A_h, [dst], #16
	ldp	A_l, A_h, [src], #16
	stp	B_l, B_h, [dst], #16
	ldp	B_l, B_h, [src], #16
	stp	C_l, C_h, [dst], #16
	ldp	C_l, C_h, [src], #16
	stp	D_l, D_h, [dst], #16
	ldp	D_l, D_h, [src], #16
	subs	count, count, #64
	b.ge	1b
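	/* Drain the pipeline: store the final 64 bytes loaded above. */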
	stp	A_l, A_h, [dst], #16
	stp	B_l, B_h, [dst], #16
	stp	C_l, C_h, [dst], #16
	stp	D_l, D_h, [dst], #16

	tst	count, #0x3f
	b.ne	.Ltail63
	ret
ENDPROC(memcpy)