/*
 * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
 * Copyright (C) 2007 ARC International (UK) LTD
 *
 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
 */

#include <sysdep.h>

/*
 * memcpy for ARC: copy r2 bytes from the buffer at r1 to the buffer at r0.
 *
 * In:   r0 = destination, r1 = source, r2 = length in bytes
 * Out:  r0 = destination.  No variant ever writes r0; each one advances a
 *       private copy of it (r5 on ARC700, r3 on ARCHS and ARC64).
 *
 * Exactly one of three implementations is assembled, chosen by the
 * compiler-provided CPU macro (__ARC700__, __ARCHS__, __ARC64_ARCH32__);
 * any other CPU is rejected with #error.
 *
 * Reading notes that apply to every variant:
 *   - A ".d" suffix on a branch or jump means the NEXT instruction sits in
 *     its delay slot and is executed whether or not the branch is taken.
 *     Several length/flag computations below live in delay slots.
 *   - "lpnz"/"lppnz LABEL" open a zero-overhead loop that runs lp_count
 *     times and ends at LABEL; the condition is evaluated on the flags left
 *     by the most recent ".f" instruction, so no flag-setting instruction
 *     may be inserted between that ".f" instruction and the loop opener.
 *   - ".ab" = access, then add the offset to the base register;
 *     ".a"  = add the offset to the base register, then access.
 */
ENTRY(memcpy)

#if defined(__ARC700__)
/* This memcpy implementation does not support objects of 1GB or larger -
   the check for alignment does not work then.  */
/* We assume that most sources and destinations are aligned, and
   that also lengths are mostly a multiple of four, although to a lesser
   extent.  */
/*
 * Registers: r5 = destination cursor, r1 = source cursor,
 *            r12 = datum loaded but not yet stored, r3 = scratch.
 * Writes:    r1, r2, r3, r5, r12, lp_count, flags.
 *
 * Alignment test: (dst | src) << 30 is zero only when both pointers are
 * word aligned, and is at least 0x40000000 otherwise.  An unsigned
 * "len <= that value" therefore selects the bytewise path for misaligned
 * pointers (for lengths up to 1GB) and for len == 0.
 */
	or	r3,r0,r1		; low two bits set if either pointer is misaligned
	asl_s	r3,r3,30		; move those two bits to the top of r3
	mov_s	r5,r0			; r5 = destination cursor, r0 stays intact
	brls.d	r2,r3,.Lcopy_bytewise	; misaligned or zero length
	sub.f	r3,r2,1			; (delay slot) r3 = len - 1, borrow if len == 0
	/*
	 * Word path.  ceil(len / 4) words are stored in total:
	 * one optional word (bit 2 of len - 1), two per loop iteration
	 * ((len - 1) >> 3 iterations) and one final word that is merged with
	 * the existing destination contents when len is not a multiple of 4.
	 */
	ld_s	r12,[r1,0]		; preload the first source word
	asr.f	lp_count,r3,3		; loop count; flags consumed by lppnz below
	bbit0.d	r3,2,.Lnox4		; odd number of leading words?
	bmsk_s	r2,r2,1			; (delay slot) r2 = len & 3, the tail byte count
	st.ab	r12,[r5,4]		; copy the single odd word
	ld.a	r12,[r1,4]		; advance the source and preload the next word
.Lnox4:
	lppnz	.Lendloop
	ld_s	r3,[r1,4]		; word following the one held in r12
	st.ab	r12,[r5,4]
	ld.a	r12,[r1,8]		; source += 8, preload for the next round
	st.ab	r3,[r5,4]
.Lendloop:
	breq	r2,0,.Last_store	; length is a multiple of 4: store r12 whole
	ld	r3,[r5,0]		; destination word that must be partly preserved
	/*
	 * Merge: keep the (len & 3) leading bytes of r12 and take the
	 * remaining bytes from the destination word in r3, using the
	 * xor / mask / xor idiom.
	 */
#ifdef __LITTLE_ENDIAN__
	add3	r2,-1,r2		; r2 = 8 * (len & 3) - 1, top bit index to keep
	; uses long immediate
	xor_s	r12,r12,r3
	bmsk	r12,r12,r2		; keep the difference in the low source bytes only
	xor_s	r12,r12,r3		; low bytes from source, the rest from destination
#else /* BIG ENDIAN */
	sub3	r2,31,r2		; r2 = 31 - 8 * (len & 3)
	; uses long immediate
	xor_s	r3,r3,r12
	bmsk	r3,r3,r2		; keep the difference in the low destination bytes
	xor_s	r12,r12,r3		; high bytes from source, the rest from destination
#endif /* ENDIAN */
.Last_store:
	j_s.d	[blink]
	st	r12,[r5,0]		; (delay slot) store the final word

	.balign	4
	/*
	 * Byte path.  Same software-pipelined shape as the word path:
	 * one optional byte, two bytes per loop iteration, one final byte.
	 * On entry r3 = len - 1 and the carry flag is the borrow of that
	 * subtraction (both produced in the delay slot of the branch here).
	 */
.Lcopy_bytewise:
	jcs	[blink]			; len == 0: nothing to copy
	ldb_s	r12,[r1,0]		; preload the first byte
	lsr.f	lp_count,r3		; (len - 1) >> 1, carry = bit 0 of len - 1
	bhs_s	.Lnox1			; carry clear: no odd leading byte
	stb.ab	r12,[r5,1]
	ldb.a	r12,[r1,1]
.Lnox1:
	lppnz	.Lendbloop		; uses the flags left by lsr.f above
	ldb_s	r3,[r1,1]
	stb.ab	r12,[r5,1]
	ldb.a	r12,[r1,2]
	stb.ab	r3,[r5,1]
.Lendbloop:
	j_s.d	[blink]
	stb	r12,[r5,0]		; (delay slot) store the final byte

#elif defined(__ARCHS__)

/*
 * Helpers for the paths where the destination is word aligned but the
 * source is not.  Source data is read as aligned words and re-assembled
 * with shifts; the direction of every shift depends on byte order.
 *
 *   SHIFT_1   positions the leading bytes of a freshly loaded word so that
 *             they complete the output word held in the carry register.
 *   SHIFT_2   moves the left-over bytes of that word into the carry.
 *   MERGE_1/2 build the initial 3-byte carry of CASE 1 from a halfword and
 *             a byte (MERGE_2 expands to nothing on little endian).
 *   EXTRACT_1 yields the 16 bits, EXTRACT_2 the 8 bits, that are flushed
 *             from the 3-byte carry after the loop.  Note that each macro
 *             ignores IMM in one of the two byte orders.
 */
#ifdef __LITTLE_ENDIAN__
# define SHIFT_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define SHIFT_2(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM
# define MERGE_2(RX,RY,IMM)
# define EXTRACT_1(RX,RY,IMM)	and	RX, RY, 0xFFFF
# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, IMM
#else
# define SHIFT_1(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
# define SHIFT_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define MERGE_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define EXTRACT_1(RX,RY,IMM)	lsr	RX, RY, IMM
# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, 0x08
#endif

/*
 * Width of one transfer in the aligned main loop.  With 64-bit load/store
 * support each LOADX/STOREX moves 8 bytes, so the four-way unrolled loop
 * handles 32 bytes per iteration (shift 5, tail mask 0x1F); otherwise it
 * moves 4 bytes per transfer and 16 bytes per iteration (shift 4, tail
 * mask 0xF).  The prefetch distances scale the same way.
 */
#if defined(__LL64__) || defined(__ARC_LL64__)
# define PREFETCH_READ(RX)	prefetch [RX, 56]
# define PREFETCH_WRITE(RX)	prefetchw [RX, 64]
# define LOADX(DST,RX)		ldd.ab	DST, [RX, 8]
# define STOREX(SRC,RX)		std.ab	SRC, [RX, 8]
# define ZOLSHFT		5
# define ZOLAND			0x1F
#else
# define PREFETCH_READ(RX)	prefetch [RX, 28]
# define PREFETCH_WRITE(RX)	prefetchw [RX, 32]
# define LOADX(DST,RX)		ld.ab	DST, [RX, 4]
# define STOREX(SRC,RX)		st.ab	SRC, [RX, 4]
# define ZOLSHFT		4
# define ZOLAND			0xF
#endif

/*
 * Registers: r3 = destination cursor, r1 = source cursor,
 *            r2 = bytes still to copy, r4 = alignment scratch,
 *            r5 = carry of pending bytes on the unaligned-source paths.
 * Writes:    r1-r11, lp_count, flags.
 *
 * Plan: lengths up to 8 are copied bytewise.  Otherwise the destination
 * is first brought to a word boundary bytewise, then one of four paths is
 * taken according to the source address modulo 4.
 */
	prefetch  [r1]		; Prefetch the read location
	prefetchw [r0]		; Prefetch the write location
	mov.f	0, r2		; test the length only, the result is discarded
;;; if size is zero
	jz.d	[blink]
	mov	r3, r0		; (delay slot) do not clobber ret val

;;; if size <= 8
	cmp	r2, 8
	bls.d	@.Lsmallchunk
	mov.f	lp_count, r2	; (delay slot) byte count and flags for .Lsmallchunk

	;; Copy 4 - (dst & 3) bytes so that the destination becomes word
	;; aligned.  The loop is skipped when it already is (Z from and.f).
	and.f	r4, r0, 0x03
	rsub	lp_count, r4, 4
	lpnz	@.Laligndestination
	;; LOOP BEGIN
	ldb.ab	r5, [r1,1]
	sub	r2, r2, 1
	stb.ab	r5, [r3,1]
.Laligndestination:

;;; Check the alignment of the source
	and.f	r4, r1, 0x03
	bnz.d	@.Lsourceunaligned

;;; CASE 0: Both source and destination are 32bit aligned
;;; Convert len to Dwords, unfold x4
	/*
	 * The next instruction is the delay slot of the branch above, so it
	 * also runs on the unaligned path; that path recomputes lp_count and
	 * the flags before using them.
	 */
	lsr.f	lp_count, r2, ZOLSHFT
	lpnz	@.Lcopy32_64bytes
	;; LOOP START
	LOADX (r6, r1)
	PREFETCH_READ (r1)
	PREFETCH_WRITE (r3)
	LOADX (r8, r1)
	LOADX (r10, r1)
	LOADX (r4, r1)
	STOREX (r6, r3)
	STOREX (r8, r3)
	STOREX (r10, r3)
	STOREX (r4, r3)
.Lcopy32_64bytes:

	and.f	lp_count, r2, ZOLAND ; tail: up to 31 bytes with LL64, else up to 15
.Lsmallchunk:
	;; Bytewise copy of lp_count bytes.  Reached by fall-through with the
	;; tail count, or directly from the size check with the whole length.
	lpnz	@.Lcopyremainingbytes
	;; LOOP START
	ldb.ab	r5, [r1,1]
	stb.ab	r5, [r3,1]
.Lcopyremainingbytes:

	j	[blink]
;;; END CASE 0

	/*
	 * Source misaligned, destination aligned.  r4 = src & 3 (1, 2 or 3).
	 * Both delay-slot instructions below are shared between cases:
	 * "sub r2, r2, 1" runs for all three cases and the byte load into r5
	 * runs for cases 1 and 3.  The flags of the cmp survive the sub
	 * because the sub has no ".f".
	 */
.Lsourceunaligned:
	cmp	r4, 2
	beq.d	@.LunalignedOffby2
	sub	r2, r2, 1	; (delay slot) account for one consumed byte

	bhi.d	@.LunalignedOffby3
	ldb.ab	r5, [r1, 1]	; (delay slot) first pending byte

;;; CASE 1: The source is unaligned, off by 1
	;; Hence I need to read 1 byte for a 16bit alignment
	;; and 2bytes to reach 32bit alignment
	ldh.ab	r6, [r1, 2]
	sub	r2, r2, 2	; three bytes are now pending in total
	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
	MERGE_1 (r6, r6, 8)
	MERGE_2 (r5, r5, 24)
	or	r5, r5, r6	; r5 = carry holding the three pending bytes

	;; Both src and dst are aligned
	;; Each iteration reads two words, combines each with the carry and
	;; writes two words; three bytes always remain pending in r5.
	lpnz	@.Lcopy8bytes_1
	;; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	;Prefetch the next read location
	ld.ab	r8, [r1,4]
	prefetchw [r3, 32]	;Prefetch the next write location

	SHIFT_1	(r7, r6, 24)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 8)

	SHIFT_1	(r9, r8, 24)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 8)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_1:

	;; Write back the remaining 16bits
	EXTRACT_1 (r6, r5, 16)
	sth.ab	r6, [r3, 2]
	;; Write back the remaining 8bits
	EXTRACT_2 (r5, r5, 16)
	stb.ab	r5, [r3, 1]

	and.f	lp_count, r2, 0x07 ;Last 8bytes
	lpnz	@.Lcopybytewise_1
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_1:
	j	[blink]

.LunalignedOffby2:
;;; CASE 2: The source is unaligned, off by 2
	ldh.ab	r5, [r1, 2]	; two pending bytes, source is now word aligned
	sub	r2, r2, 1	; second of the two consumed bytes

	;; Both src and dst are aligned
	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
	/*
	 * Big endian only: when the loop is going to run, the carry has to
	 * sit in the upper half of r5.  It is moved back after the loop under
	 * the same condition; nothing between here and there changes the
	 * flags, so both conditional shifts see the result of lsr.f above.
	 */
#ifdef __BIG_ENDIAN__
	asl.nz	r5, r5, 16
#endif
	lpnz	@.Lcopy8bytes_2
	;; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	;Prefetch the next read location
	ld.ab	r8, [r1,4]
	prefetchw [r3, 32]	;Prefetch the next write location

	SHIFT_1	(r7, r6, 16)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 16)

	SHIFT_1	(r9, r8, 16)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 16)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_2:

#ifdef __BIG_ENDIAN__
	lsr.nz	r5, r5, 16
#endif
	sth.ab	r5, [r3, 2]	; flush the two pending bytes

	and.f	lp_count, r2, 0x07 ;Last 8bytes
	lpnz	@.Lcopybytewise_2
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_2:
	j	[blink]

.LunalignedOffby3:
;;; CASE 3: The source is unaligned, off by 3
;;; Hence, I need to read 1byte for achieve the 32bit alignment
	;; That byte is already in r5 and already subtracted from r2: both
	;; were done in the delay slots on the way here.

	;; Both src and dst are aligned
	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
	/*
	 * Big endian only: same conditional pre-shift / post-shift of the
	 * carry as in CASE 2, here for a single pending byte.
	 */
#ifdef __BIG_ENDIAN__
	asl.ne	r5, r5, 24
#endif
	lpnz	@.Lcopy8bytes_3
	;; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	;Prefetch the next read location
	ld.ab	r8, [r1,4]
	prefetchw [r3, 32]	;Prefetch the next write location

	SHIFT_1	(r7, r6, 8)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 24)

	SHIFT_1	(r9, r8, 8)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 24)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_3:

#ifdef __BIG_ENDIAN__
	lsr.nz	r5, r5, 24
#endif
	stb.ab	r5, [r3, 1]	; flush the pending byte

	and.f	lp_count, r2, 0x07 ;Last 8bytes
	lpnz	@.Lcopybytewise_3
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_3:
	j	[blink]

#elif defined(__ARC64_ARCH32__)
	;; Based on Synopsys code from newlib's arc64/memcpy.S
	/*
	 * Registers: r3 = destination cursor, r1 = source cursor,
	 *            r11 = 16-byte chunk counter, later the tail word count.
	 * Writes:    r1-r7, r11, flags.
	 *
	 * No alignment handling is done here: words and halfwords are
	 * accessed at whatever address the caller passed.
	 * NOTE(review): presumably this relies on the core handling
	 * unaligned accesses - confirm for the targeted configurations.
	 */
	lsr.f	r11, r2, 4		; counter for 16-byte chunks
	beq.d	@.L_write_15_bytes
	mov	r3, r0			; work on a copy of "r0"

.L_write_16_bytes:
#if defined(__ARC64_LL64__)
	ldd.ab	r4, [r1, 8]
	ldd.ab	r6, [r1, 8]
	std.ab	r4, [r3, 8]
	std.ab	r6, [r3, 8]
	dbnz	r11, @.L_write_16_bytes
#else
	ld.ab	r4, [r1, 4]
	ld.ab	r5, [r1, 4]
	ld.ab	r6, [r1, 4]
	ld.ab	r7, [r1, 4]
	st.ab	r4, [r3, 4]
	st.ab	r5, [r3, 4]
	st.ab	r6, [r3, 4]
	dbnz.d	r11, @.L_write_16_bytes
	st.ab	r7, [r3, 4]		; (delay slot) last store of the chunk
#endif
	bmsk_s	r2, r2, 3		; r2 = len & 15, the tail byte count

	/*
	 * Tail of 0..15 bytes: bit 1 of r2 selects a halfword copy, bit 0 a
	 * byte copy, and bits 3:2 give the number of words (0..3), which are
	 * copied by jumping into the unrolled load/store sequence below.
	 */
.L_write_15_bytes:
	bbit0.d	r2, 1, @1f
	lsr	r11, r2, 2		; (delay slot) r11 = number of tail words
	ldh.ab	r4, [r1, 2]
	sth.ab	r4, [r3, 2]
1:
	bbit0.d	r2, 0, @1f
	xor	r11, r11, 3		; (delay slot) r11 = 3 - words = pairs to skip
	ldb.ab	r4, [r1, 1]
	stb.ab	r4, [r3, 1]
1:
	/*
	 * Computed jump over the load/store pairs that are not needed.
	 * r11 is doubled because a pair consists of two instructions.
	 * NOTE(review): this assumes "bi" scales its index by one 32-bit
	 * instruction and that none of the six instructions below is given
	 * a 16-bit encoding - confirm against the ISA manual and keep their
	 * order and sizes unchanged.
	 */
	asl	r11, r11, 1
	bi	[r11]
	ld.ab	r4,[r1, 4]
	st.ab	r4,[r3, 4]
	ld.ab	r4,[r1, 4]
	st.ab	r4,[r3, 4]
	ld	r4,[r1]
	st	r4,[r3]

	j_s	[blink]

#else
#error "Unsupported ARC CPU type"
#endif

END(memcpy)
libc_hidden_def(memcpy)
353