/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Move a buffer from src to dst (alignment handled by the hardware).
 * If dst < src, or the buffers do not overlap, call memcpy; otherwise
 * copy in reverse order, from the end of the buffers downwards.
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

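/*
 * When dst lies below src, or when the buffers do not overlap at all,
 * plain memcpy is safe and is used directly. Otherwise dst overlaps the
 * top of src, so the copy is done backwards, from the highest address
 * down, ensuring each byte is read before it is overwritten.
 */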
ENTRY(memmove)
	cmp	dstin, src
	b.lo	memcpy
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy		/* No overlap.  */

	add	dst, dstin, count
	add	src, src, count
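	/*
	* dst and src now point one byte past the end of their buffers;
	* every access below works downwards from there using negative
	* offsets.
	*/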
	cmp	count, #16
	b.lo	.Ltail15  /* Probably unaligned accesses. */

	ands	tmp2, src, #15     /* Bytes to reach alignment.  */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	* Copy the unaligned head first so that src becomes aligned.
	* The cost of these few extra instructions is acceptable, and it
	* ensures that the accesses which follow use aligned addresses.
	*/
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!

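	/* src is now 16-byte aligned. */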
.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64

	/*
	* Deal with small copies quickly by dropping straight into the
	* exit block.
	*/
.Ltail63:
	/*
	* Copy up to 48 bytes of data. At this point we only need the
	* bottom 6 bits of count to be accurate.
	*/
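	/*
	* Bits 5:4 of count select 48, 32 or 16 bytes: enter at the right
	* point and fall through the ldp/stp pairs below.
	*/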
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
1:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
2:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

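	/*
	* Copy the remaining 0-15 bytes: 8, 4, 2 and finally 1 byte, as
	* selected by the low bits of count.
	*/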
.Ltail15:
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 2f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
2:
	tbz	count, #1, 3f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
3:
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]

.Lexitfunc:
	ret

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	* Less than 128 bytes to copy, so handle 64 bytes here and then jump
	* to the tail.
	*/
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!

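	/*
	* count went negative above, but its low six bits still hold the
	* number of tail bytes left to copy.
	*/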
	tst	count, #0x3f
	b.ne	.Ltail63
	ret

	/*
	* Critical loop. Start at a new cache line boundary. Assuming
	* 64 bytes per line this ensures the entire loop is in one line.
	*/
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load 64 bytes of data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
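	/*
	* The stores trail the loads by one 64-byte block, which is why
	* count was decremented by 128 rather than 64 up front: when the
	* b.ge below fails, one final block is still held in registers and
	* is stored after the loop.
	*/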
1:
	/*
	* Interleave the load of the next 64-byte block with the stores of
	* the previous 64 bytes of loaded data.
	*/
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
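	/* Store the final 64 bytes still held in registers. */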
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f
	b.ne	.Ltail63
	ret
ENDPROC(memmove)