/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Move a buffer from src to dest (alignment handled by the hardware).
 * If dest <= src, call memcpy, otherwise copy in reverse order.
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

ENTRY(memmove)
	cmp	dstin, src
	b.lo	memcpy
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy			/* No overlap. */

	add	dst, dstin, count
	add	src, src, count
	cmp	count, #16
	b.lo	.Ltail15		/* Probably unaligned accesses. */

	ands	tmp2, src, #15		/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the misaligned leading bytes first so that src becomes
	 * aligned. The cost of these extra instructions is acceptable,
	 * and it means the subsequent accesses use aligned addresses.
	 */
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64

	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
1:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
2:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

.Ltail15:
	/* Copy the remaining 0-15 bytes: 8, 4, 2, then 1 byte as needed. */
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 2f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
2:
	tbz	count, #1, 3f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
3:
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]

.Lexitfunc:
	ret

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then jump
	 * to the tail.
	 */
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f
	b.ne	.Ltail63
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load 64 bytes of data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	/*
	 * Interleave the load of the next 64-byte block with the store of
	 * the previously loaded 64 bytes.
	 */
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f
	b.ne	.Ltail63
	ret
ENDPROC(memmove)