/* Optimized version of the standard memmove() function.
   This file is part of the GNU C Library.
   Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
   Contributed by Dan Pop <Dan.Pop@cern.ch>.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* Return: dest

   Inputs:
        in0:    dest
        in1:    src
        in2:    byte count

   The core of the function is the memcpy implementation used in memcpy.S.
   When bytes have to be copied backwards, only the easy case, when
   all arguments are multiples of 8, is optimised.

   Copy direction:
     - dest <= src, or dest >= src + len:  copy forward  (.forward)
     - src < dest < src + len:             copy backward (.backward)
   Both tests compare addresses as UNSIGNED values.

   Forward copy strategies:
     - dest, src and len all multiples of 8:  two interleaved ld8/st8
       streams, 16 bytes per iteration (.l0).
     - len <= OP_T_THRES:  byte loop (.cpyfew).
     - otherwise:  align dest with a byte loop (.l1), then copy words
       either directly (.l3, src also aligned) or by merging pairs of
       aligned source words with shrp (.loop8 ... .loop56), and finish
       the last len % 8 bytes in .cpyfew.

   In this form, it assumes little endian mode.  For big endian mode,
   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
   or the UM.be bit should be cleared at the beginning and set at the end.  */

#include <sysdep.h>
#undef ret

#define OP_T_THRES	16	/* at or below this length, copy byte by byte */
#define OPSIZ		 8	/* word size in bytes used by the word loops */

/* Register aliases.  */
#define adest		r15	/* second store pointer (dest + 8) in .l0 */
#define saved_pr	r17	/* caller's predicate registers */
#define saved_lc	r18	/* caller's ar.lc */
#define dest		r19	/* running destination pointer */
#define src		r20	/* running source pointer */
#define len		r21	/* bytes still to be copied */
#define asrc		r22	/* second/aligned source pointer */
#define tmp2		r23	/* scratch */
#define tmp3		r24	/* scratch */
#define tmp4		r25	/* scratch; holds (dest | src | len) & 7 on entry to
				   .forward and .backward */
#define ptable		r26	/* address of .table */
#define ploop56		r27	/* address of .loop56 */
#define loopaddr	r28	/* address of the selected shift loop */
#define sh1		r29	/* src % 8, later 8 * (src % 8) */
#define loopcnt		r30	/* iteration count destined for ar.lc */
#define value		r31	/* data in flight outside the rotating registers */

#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
# define ALIGN(n)	{ nop 0 }
#else
# define ALIGN(n)	.align n
#endif

/* LOOP(shift) expands to a software-pipelined loop labelled .loop<shift>
   that copies to an 8-byte aligned dest from a misaligned src.

   Each iteration loads one aligned word from asrc (stage p[0]); MEMLAT
   stages later shrp merges that word with the previously loaded one,
   shifting by `shift' bits, into `value' (stage p[MEMLAT]); one stage
   after that `value' is stored to dest (stage p[MEMLAT+1]).  The word
   before the first one loaded here (w0) must already be in r[1] when
   the loop is entered.  When the loop finishes, control goes to .cpyfew
   for the remaining len % 8 bytes.  */
#define LOOP(shift)							\
		ALIGN(32);						\
.loop##shift :								\
(p[0])		ld8	r[0] = [asrc], 8 ;	/* w1 */		\
(p[MEMLAT+1])	st8	[dest] = value, 8 ;				\
(p[MEMLAT])	shrp	value = r[MEMLAT], r[MEMLAT+1], shift ;		\
		nop.b	0 ;						\
		nop.b	0 ;						\
		br.ctop.sptk .loop##shift ;				\
		br.cond.sptk .cpyfew ; /* deal with the remaining bytes */

/* MEMLAT is the number of pipeline stages between a load and the
   instruction consuming the loaded value.  Nrot is the size of the
   rotating register area: r[MEMLAT + 2] plus q[MEMLAT + 1] registers,
   i.e. 2 * MEMLAT + 3, rounded up to a multiple of 8.  */
#define MEMLAT	21
#define Nrot	(((2*MEMLAT+3) + 7) & ~7)

ENTRY(memmove)
	.prologue
	alloc	r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
	.rotr	r[MEMLAT + 2], q[MEMLAT + 1]
	.rotp	p[MEMLAT + 2]
	mov	ret0 = in0		/* return value = dest */
	.save pr, saved_pr
	mov	saved_pr = pr		/* save the predicate registers */
	.save ar.lc, saved_lc
	mov	saved_lc = ar.lc	/* save the loop counter */
	.body
	or	tmp3 = in0, in1 ;;	/* tmp3 = dest | src */
	or	tmp3 = tmp3, in2	/* tmp3 = dest | src | len */
	mov	dest = in0		/* dest */
	mov	src = in1		/* src */
	mov	len = in2		/* len */
	sub	tmp2 = r0, in0		/* tmp2 = -dest */
	cmp.eq	p6, p0 = in2, r0	/* if (len == 0) */
(p6)	br.cond.spnt .restore_and_exit;;/*	return dest; */
	and	tmp4 = 7, tmp3		/* tmp4 = (dest | src | len) & 7 */
	/* Addresses are unsigned quantities, so the overlap test uses the
	   unsigned relations.  A signed compare mis-orders operands whose
	   bit 63 differs (e.g. when src + len carries into bit 63) and
	   could select the forward path for an overlapping dest > src.  */
	cmp.leu	p6, p0 = dest, src	/* if dest <= src it's always safe */
(p6)	br.cond.spnt .forward		/* to copy forward */
	add	tmp3 = src, len;;
	cmp.ltu	p6, p0 = dest, tmp3	/* if dest > src && dest < src + len */
(p6)	br.cond.spnt .backward		/* we have to copy backward */

.forward:
	shr.u	loopcnt = len, 4 ;;	/* loopcnt = len / 16 */
	cmp.ne	p6, p0 = tmp4, r0	/* if ((dest | src | len) & 7 != 0) */
(p6)	br.cond.sptk .next		/*	goto next; */

/* The optimal case, when dest, src and len are all multiples of 8 */

	and	tmp3 = 0xf, len		/* tmp3 = len % 16: 0 or 8 here */
	mov	pr.rot = 1 << 16	/* set rotating predicates */
	mov	ar.ec = MEMLAT + 1 ;;	/* set the epilog counter */
	cmp.ne	p6, p0 = tmp3, r0	/* do we have to copy an extra word? */
	adds	loopcnt = -1, loopcnt;;	/* --loopcnt */
(p6)	ld8	value = [src], 8;;
(p6)	st8	[dest] = value, 8	/* copy the "odd" word */
	mov	ar.lc = loopcnt		/* set the loop counter */
	cmp.eq	p6, p0 = 8, len
(p6)	br.cond.spnt .restore_and_exit;;/* the one-word special case */
	adds	adest = 8, dest		/* set adest one word ahead of dest */
	adds	asrc = 8, src ;;	/* set asrc one word ahead of src */
	nop.b	0			/* get the "golden" alignment for */
	nop.b	0			/* the next loop */
	/* Two interleaved streams, each advancing by 16 bytes: src/dest
	   and asrc/adest.  Stores trail loads by MEMLAT stages.  */
.l0:
(p[0])		ld8	r[0] = [src], 16
(p[0])		ld8	q[0] = [asrc], 16
(p[MEMLAT])	st8	[dest] = r[MEMLAT], 16
(p[MEMLAT])	st8	[adest] = q[MEMLAT], 16
		br.ctop.dptk .l0 ;;

	mov	pr = saved_pr, -1	/* restore the predicate registers */
	mov	ar.lc = saved_lc	/* restore the loop counter */
	br.ret.sptk.many b0

/* Forward copy where dest, src or len is not a multiple of 8.  */
.next:
	cmp.ge	p6, p0 = OP_T_THRES, len	/* is len <= OP_T_THRES */
	and	loopcnt = 7, tmp2		/* loopcnt = -dest % 8 */
(p6)	br.cond.spnt	.cpyfew			/* copy byte by byte */
	;;
	cmp.eq	p6, p0 = loopcnt, r0
(p6)	br.cond.sptk	.dest_aligned
	sub	len = len, loopcnt	/* len -= -dest % 8 */
	adds	loopcnt = -1, loopcnt	/* --loopcnt */
	;;
	mov	ar.lc = loopcnt
.l1:					/* copy -dest % 8 bytes */
	ld1	value = [src], 1	/* value = *src++ */
	;;
	st1	[dest] = value, 1	/* *dest++ = value */
	br.cloop.dptk .l1
.dest_aligned:
	and	sh1 = 7, src		/* sh1 = src % 8 */
	and	tmp2 = -8, len		/* tmp2 = len & -OPSIZ */
	and	asrc = -8, src		/* asrc = src & -OPSIZ  -- align src */
	shr.u	loopcnt = len, 3	/* loopcnt = len / 8 */
	and	len = 7, len;;		/* len = len % 8 */
	adds	loopcnt = -1, loopcnt	/* --loopcnt */
	addl	tmp4 = @ltoff(.table), gp
	addl	tmp3 = @ltoff(.loop56), gp
	mov	ar.ec = MEMLAT + 1	/* set EC */
	mov	pr.rot = 1 << 16;;	/* set rotating predicates */
	mov	ar.lc = loopcnt		/* set LC */
	cmp.eq	p6, p0 = sh1, r0	/* is the src aligned? */
(p6)	br.cond.sptk .src_aligned
	/* src is misaligned: pick the shrp loop matching src % 8.  The
	   shift loops read through asrc, so src is advanced here past the
	   whole words and is ready for .cpyfew afterwards.  */
	add	src = src, tmp2		/* src += len & -OPSIZ */
	shl	sh1 = sh1, 3		/* sh1 = 8 * (src % 8) */
	ld8	ploop56 = [tmp3]	/* ploop56 = &loop56 */
	ld8	ptable = [tmp4];;	/* ptable = &table */
	add	tmp3 = ptable, sh1;;	/* tmp3 = &table + sh1 */
	mov	ar.ec = MEMLAT + 1 + 1	/* one more pass needed */
	ld8	tmp4 = [tmp3];;		/* tmp4 = loop offset */
	sub	loopaddr = ploop56,tmp4	/* loopadd = &loop56 - loop offset */
	ld8	r[1] = [asrc], 8;;	/* w0 */
	mov	b6 = loopaddr;;
	br	b6			/* jump to the appropriate loop */

	LOOP(8)
	LOOP(16)
	LOOP(24)
	LOOP(32)
	LOOP(40)
	LOOP(48)
	LOOP(56)

/* dest and src are both 8-byte aligned: plain pipelined word copy.  */
.src_aligned:
.l3:
(p[0])		ld8	r[0] = [src], 8
(p[MEMLAT])	st8	[dest] = r[MEMLAT], 8
		br.ctop.dptk .l3
/* Copy the remaining len bytes (possibly zero) one at a time.  */
.cpyfew:
	cmp.eq	p6, p0 = len, r0	/* is len == 0 ? */
	adds	len = -1, len		/* --len; */
(p6)	br.cond.spnt	.restore_and_exit ;;
	mov	ar.lc = len
.l4:
	ld1	value = [src], 1
	;;
	st1	[dest] = value, 1
	br.cloop.dptk	.l4 ;;
.restore_and_exit:
	mov	pr = saved_pr, -1	/* restore the predicate registers */
	mov	ar.lc = saved_lc	/* restore the loop counter */
	br.ret.sptk.many b0

/* In the case of a backward copy, optimise only the case when everything
   is a multiple of 8, otherwise copy byte by byte.  The backward copy is
   used only when the blocks are overlapping and dest > src.
*/
.backward:
	shr.u	loopcnt = len, 3	/* loopcnt = len / 8 */
	add	src = src, len		/* src points one byte past the end */
	add	dest = dest, len ;;	/* dest points one byte past the end */
	mov	ar.ec = MEMLAT + 1	/* set the epilog counter */
	mov	pr.rot = 1 << 16	/* set rotating predicates */
	adds	loopcnt = -1, loopcnt	/* --loopcnt */
	cmp.ne	p6, p0 = tmp4, r0	/* if ((dest | src | len) & 7 != 0) */
(p6)	br.cond.sptk .bytecopy ;;	/* copy byte by byte backward */
	adds	src = -8, src		/* src points to the last word */
	adds	dest = -8, dest		/* dest points to the last word */
	mov	ar.lc = loopcnt;;	/* set the loop counter */
.l5:
(p[0])		ld8	r[0] = [src], -8
(p[MEMLAT])	st8	[dest] = r[MEMLAT], -8
		br.ctop.dptk .l5
		br.cond.sptk .restore_and_exit
.bytecopy:
	adds	src = -1, src		/* src points to the last byte */
	adds	dest = -1, dest		/* dest points to the last byte */
	adds	loopcnt = -1, len;;	/* loopcnt = len - 1 */
	mov	ar.lc = loopcnt;;	/* set the loop counter */
.l6:
(p[0])		ld1	r[0] = [src], -1
(p[MEMLAT])	st1	[dest] = r[MEMLAT], -1
		br.ctop.dptk .l6
		br.cond.sptk .restore_and_exit
END(memmove)

/* Jump table for the misaligned-source case, indexed by src % 8 (eight
   bytes per entry).  Each entry is the distance from the matching shift
   loop to .loop56, so that &.loop56 - entry is the address of the loop.
   Entry 0 is never used: an aligned src goes to .src_aligned.  */
	.rodata
	.align 8
.table:
	data8	0			/* dummy entry */
	data8	.loop56 - .loop8
	data8	.loop56 - .loop16
	data8	.loop56 - .loop24
	data8	.loop56 - .loop32
	data8	.loop56 - .loop40
	data8	.loop56 - .loop48
	data8	.loop56 - .loop56

libc_hidden_def (memmove)