/*
 * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
 * Copyright (C) 2007 ARC International (UK) LTD
 *
 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
 */

#include <sysdep.h>

/*
 * memcpy - copy a block of bytes.
 *
 * Register usage, as visible in the code below:
 *   r0     destination pointer; it is only ever read, so it is still intact
 *          when the routine returns
 *   r1     source pointer (every load goes through it)
 *   r2     number of bytes to copy
 *   blink  return address (every exit is a jump through blink)
 *
 * One of three bodies is selected at build time by __ARC700__, __ARCHS__ or
 * __ARC64_ARCH32__; any other configuration stops the build with an error.
 * Stores go through a scratch copy of r0 (r5 in the ARC700 body, r3 in the
 * other two).
 *
 * Several instructions carry the .d suffix: the instruction that follows
 * such a branch or jump sits in its delay slot and is executed whether or
 * not the branch is taken.  Keep that in mind before reordering anything.
 */
ENTRY(memcpy)

#if defined(__ARC700__)
/* This memcpy implementation does not support objects of 1GB or larger -
   the check for alignment does not work then.  */
/* We assume that most sources and destinations are aligned, and
   that also lengths are mostly a multiple of four, although to a lesser
   extent.  */
	or	r3,r0,r1		; r3 = dst | src
	asl_s	r3,r3,30		; only the two low address bits survive, in bits 31:30
	mov_s	r5,r0			; r5 = store pointer, r0 is left alone
	brls.d	r2,r3,.Lcopy_bytewise	; taken when a pointer is misaligned or len is 0
	sub.f	r3,r2,1			; delay slot: r3 = len - 1, flags used by jcs below
	;; Word path: r12 always holds the next word to be stored.
	ld_s	r12,[r1,0]
	asr.f	lp_count,r3,3		; loop count = (len - 1) / 8
	bbit0.d	r3,2,.Lnox4		; bit 2 of len - 1 clear: no single word to copy first
	bmsk_s	r2,r2,1			; delay slot: r2 = len & 3
	st.ab	r12,[r5,4]
	ld.a	r12,[r1,4]
.Lnox4:
	;; Two words (8 bytes) per iteration; skipped when the count is zero.
	lppnz	.Lendloop
	ld_s	r3,[r1,4]
	st.ab	r12,[r5,4]
	ld.a	r12,[r1,8]
	st.ab	r3,[r5,4]
.Lendloop:
	breq	r2,0,.Last_store	; len is a multiple of 4: store the last word whole
	ld	r3,[r5,0]		; r3 = word currently at the destination
	/* Partial last word: combine r12 (source) and r3 (destination) so that
	   the first r2 bytes in memory come from the source and the remaining
	   bytes of the word keep the destination contents.  */
#ifdef __LITTLE_ENDIAN__
	add3	r2,-1,r2		; r2 = 8 * r2 - 1, top bit index to take from the source
	; uses long immediate
	xor_s	r12,r12,r3
	bmsk	r12,r12,r2
	xor_s	r12,r12,r3
#else /* BIG ENDIAN */
	sub3	r2,31,r2		; r2 = 31 - 8 * r2, top bit index to keep from the destination
	; uses long immediate
	xor_s	r3,r3,r12
	bmsk	r3,r3,r2
	xor_s	r12,r12,r3
#endif /* ENDIAN */
.Last_store:
	j_s.d	[blink]
	st	r12,[r5,0]		; delay slot: final word store

	.balign	4
.Lcopy_bytewise:
	;; Byte path: same structure as the word path, one byte at a time.
	;; NOTE(review): the carry tested here comes from the sub.f in the
	;; delay slot of brls.d, presumably set only when len is 0 - confirm.
	jcs	[blink]
	ldb_s	r12,[r1,0]		; r12 always holds the next byte to be stored
	lsr.f	lp_count,r3		; loop count = (len - 1) / 2
	bhs_s	.Lnox1			; skips the single-byte step below
	stb.ab	r12,[r5,1]
	ldb.a	r12,[r1,1]
.Lnox1:
	;; Two bytes per iteration; skipped when the count is zero.
	lppnz	.Lendbloop
	ldb_s	r3,[r1,1]
	stb.ab	r12,[r5,1]
	ldb.a	r12,[r1,2]
	stb.ab	r3,[r5,1]
.Lendbloop:
	j_s.d	[blink]
	stb	r12,[r5,0]		; delay slot: final byte store

#elif defined(__ARCHS__)

/*
 * Endian-dependent helpers for the paths where the source is not word
 * aligned.  SHIFT_1 and SHIFT_2 split each loaded source word into the part
 * that completes the current output word and the part carried over to the
 * next one.  MERGE_1 and MERGE_2 build the initial carried-over value in
 * case 1, and EXTRACT_1 and EXTRACT_2 pull the left-over bytes out of it
 * once the loop is done.
 */
#ifdef __LITTLE_ENDIAN__
# define SHIFT_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define SHIFT_2(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM
# define MERGE_2(RX,RY,IMM)
# define EXTRACT_1(RX,RY,IMM)	and	RX, RY, 0xFFFF
# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, IMM
#else
# define SHIFT_1(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
# define SHIFT_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define MERGE_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define EXTRACT_1(RX,RY,IMM)	lsr	RX, RY, IMM
# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, 0x08
#endif

/*
 * Transfer unit of the aligned copy loop: 8-byte ldd/std when one of the
 * LL64 macros is defined, 4-byte ld/st otherwise.  One loop pass moves four
 * units, so ZOLSHFT (log2 of the bytes per pass) and ZOLAND (mask for the
 * bytes left over) are 5 and 0x1F for 32 bytes, or 4 and 0xF for 16 bytes.
 */
#if defined(__LL64__) || defined(__ARC_LL64__)
# define PREFETCH_READ(RX)	prefetch	[RX, 56]
# define PREFETCH_WRITE(RX)	prefetchw	[RX, 64]
# define LOADX(DST,RX)		ldd.ab	DST, [RX, 8]
# define STOREX(SRC,RX)		std.ab	SRC, [RX, 8]
# define ZOLSHFT		5
# define ZOLAND			0x1F
#else
# define PREFETCH_READ(RX)	prefetch	[RX, 28]
# define PREFETCH_WRITE(RX)	prefetchw	[RX, 32]
# define LOADX(DST,RX)		ld.ab	DST, [RX, 4]
# define STOREX(SRC,RX)		st.ab	SRC, [RX, 4]
# define ZOLSHFT		4
# define ZOLAND			0xF
#endif

	prefetch  [r1]		; Prefetch the read location
	prefetchw [r0]		; Prefetch the write location
	mov.f	0, r2
;;; if size is zero
	jz.d	[blink]
	mov	r3, r0		; don't clobber ret val

;;; if size <= 8
	;; The byte loop at .Lsmallchunk copies everything; its loop count is
	;; loaded in the delay slot of the branch.
	cmp	r2, 8
	bls.d	@.Lsmallchunk
	mov.f	lp_count, r2

;;; Copy 4 - (dst & 3) single bytes so that the destination becomes 32bit
;;; aligned; the loop is skipped when dst & 3 is already zero
	and.f	r4, r0, 0x03
	rsub	lp_count, r4, 4
	lpnz	@.Laligndestination
	;; LOOP BEGIN
	ldb.ab	r5, [r1,1]
	sub	r2, r2, 1
	stb.ab	r5, [r3,1]
.Laligndestination:

;;; Check the alignment of the source
	;; r4 = src & 3 selects one of the three unaligned cases.  The lsr.f
	;; further down is the delay slot of this branch, so it also runs
	;; when the branch is taken.
	and.f	r4, r1, 0x03
	bnz.d	@.Lsourceunaligned

;;; CASE 0: Both source and destination are 32bit aligned
;;; Convert len to Dwords, unfold x4
	lsr.f	lp_count, r2, ZOLSHFT
	lpnz	@.Lcopy32_64bytes
	;; LOOP START
	LOADX (r6, r1)
	PREFETCH_READ (r1)
	PREFETCH_WRITE (r3)
	LOADX (r8, r1)
	LOADX (r10, r1)
	LOADX (r4, r1)
	STOREX (r6, r3)
	STOREX (r8, r3)
	STOREX (r10, r3)
	STOREX (r4, r3)
.Lcopy32_64bytes:

	and.f	lp_count, r2, ZOLAND ;Last remaining 31 bytes
.Lsmallchunk:
	;; Byte-by-byte copy of lp_count bytes; also the entry point for
	;; sizes of 8 bytes or less.
	lpnz	@.Lcopyremainingbytes
	;; LOOP START
	ldb.ab	r5, [r1,1]
	stb.ab	r5, [r3,1]
.Lcopyremainingbytes:

	j	[blink]
;;; END CASE 0

.Lsourceunaligned:
	;; Dispatch on r4 = src & 3.  Both delay-slot instructions below run
	;; before the case bodies: r2 is reduced by one on every path, and
	;; the first source byte is loaded into r5 for cases 1 and 3.
	cmp	r4, 2
	beq.d	@.LunalignedOffby2
	sub	r2, r2, 1

	bhi.d	@.LunalignedOffby3
	ldb.ab	r5, [r1, 1]

;;; CASE 1: The source is unaligned, off by 1
	;; Hence I need to read 1 byte for a 16bit alignment
	;; and 2bytes to reach 32bit alignment
	ldh.ab	r6, [r1, 2]
	sub	r2, r2, 2
	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
	;; r5 = the three bytes read so far, combined into one register
	MERGE_1 (r6, r6, 8)
	MERGE_2 (r5, r5, 24)
	or	r5, r5, r6

	;; Both src and dst are aligned
	lpnz	@.Lcopy8bytes_1
	;; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	;Prefetch the next read location
	ld.ab	r8, [r1,4]
	prefetchw [r3, 32]	;Prefetch the next write location

	;; Each output word is the carried-over bytes in r5 plus part of the
	;; freshly loaded word; the rest of that word becomes the new r5.
	SHIFT_1	(r7, r6, 24)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 8)

	SHIFT_1	(r9, r8, 24)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 8)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_1:

	;; Write back the remaining 16bits
	EXTRACT_1 (r6, r5, 16)
	sth.ab	r6, [r3, 2]
	;; Write back the remaining 8bits
	EXTRACT_2 (r5, r5, 16)
	stb.ab	r5, [r3, 1]

	and.f	lp_count, r2, 0x07 ;Last 8bytes
	lpnz	@.Lcopybytewise_1
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_1:
	j	[blink]

.LunalignedOffby2:
;;; CASE 2: The source is unaligned, off by 2
	;; Read 2 bytes to reach 32bit alignment of the source; together
	;; with the delay slot at the dispatch, r2 has now dropped by two.
	ldh.ab	r5, [r1, 2]
	sub	r2, r2, 1

	;; Both src and dst are aligned
	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
	;; Only when the loop will run (count not zero): move the halfword
	;; into the upper half, where the loop expects the carried-over bytes.
	asl.nz	r5, r5, 16
#endif
	lpnz	@.Lcopy8bytes_2
	;; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	;Prefetch the next read location
	ld.ab	r8, [r1,4]
	prefetchw [r3, 32]	;Prefetch the next write location

	SHIFT_1	(r7, r6, 16)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 16)

	SHIFT_1	(r9, r8, 16)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 16)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_2:

#ifdef __BIG_ENDIAN__
	;; Undo the shift above; nothing in the loop body writes the flags,
	;; so this tests the same condition as the asl.nz did.
	lsr.nz	r5, r5, 16
#endif
	;; Write back the 16 bits still held in r5
	sth.ab	r5, [r3, 2]

	and.f	lp_count, r2, 0x07 ;Last 8bytes
	lpnz	@.Lcopybytewise_2
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_2:
	j	[blink]

.LunalignedOffby3:
;;; CASE 3: The source is unaligned, off by 3
;;; Hence, I need to read 1 byte to achieve the 32bit alignment
	;; That byte is already in r5: it was loaded in the delay slot of
	;; the bhi.d at the dispatch.

	;; Both src and dst are aligned
	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
	;; Only when the loop will run: move the byte into the top byte
	asl.ne	r5, r5, 24
#endif
	lpnz	@.Lcopy8bytes_3
	;; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	;Prefetch the next read location
	ld.ab	r8, [r1,4]
	prefetchw [r3, 32]	;Prefetch the next write location

	SHIFT_1	(r7, r6, 8)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 24)

	SHIFT_1	(r9, r8, 8)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 24)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_3:

#ifdef __BIG_ENDIAN__
	;; Undo the shift above (flags are unchanged since the lsr.f)
	lsr.nz	r5, r5, 24
#endif
	;; Write back the 8 bits still held in r5
	stb.ab	r5, [r3, 1]

	and.f	lp_count, r2, 0x07 ;Last 8bytes
	lpnz	@.Lcopybytewise_3
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_3:
	j	[blink]

#elif defined(__ARC64_ARCH32__)
	;; Based on Synopsys code from newlib's arc64/memcpy.S
	;; No alignment handling here: full 16-byte chunks first, then a tail
	;; of at most 15 bytes.
	lsr.f	r11, r2, 4		; counter for 16-byte chunks
	beq.d	@.L_write_15_bytes
	mov	r3, r0			; work on a copy of "r0"

.L_write_16_bytes:
#if defined(__ARC64_LL64__)
	;; Two 8-byte loads followed by two 8-byte stores per pass
	ldd.ab	r4, [r1, 8]
	ldd.ab	r6, [r1, 8]
	std.ab	r4, [r3, 8]
	std.ab	r6, [r3, 8]
	dbnz	r11, @.L_write_16_bytes
#else
	;; Four 4-byte loads followed by four 4-byte stores per pass; the
	;; last store sits in the delay slot of dbnz.d
	ld.ab	r4, [r1, 4]
	ld.ab	r5, [r1, 4]
	ld.ab	r6, [r1, 4]
	ld.ab	r7, [r1, 4]
	st.ab	r4, [r3, 4]
	st.ab	r5, [r3, 4]
	st.ab	r6, [r3, 4]
	dbnz.d	r11, @.L_write_16_bytes
	st.ab	r7, [r3, 4]
#endif
	bmsk_s	r2, r2, 3		; r2 = bytes left over, 0 to 15

.L_write_15_bytes:
	;; Tail: bit 1 of r2 selects a halfword copy, bit 0 a byte copy, and
	;; r2 / 4 whole words follow.  The two delay-slot instructions turn
	;; r11 into 3 minus the number of whole words on every path.
	bbit0.d	r2, 1, @1f
	lsr	r11, r2, 2
	ldh.ab	r4, [r1, 2]
	sth.ab	r4, [r3, 2]
1:
	bbit0.d	r2, 0, @1f
	xor	r11, r11, 3
	ldb.ab	r4, [r1, 1]
	stb.ab	r4, [r3, 1]
1:
	;; r11 = 2 * (3 - whole words left).
	;; NOTE(review): bi presumably branches forward by r11 four-byte
	;; instruction slots, so each word that is not needed skips one
	;; ld/st pair below - confirm against the ISA manual.  The sequence
	;; must stay exactly three pairs of full-size instructions.
	asl	r11, r11, 1
	bi	[r11]
	ld.ab	r4,[r1, 4]
	st.ab	r4,[r3, 4]
	ld.ab	r4,[r1, 4]
	st.ab	r4,[r3, 4]
	ld	r4,[r1]
	st	r4,[r3]

	j_s	[blink]

#else
#error "Unsupported ARC CPU type"
#endif

END(memcpy)
libc_hidden_def(memcpy)