/*
 * Copyright (C) 2019 Kalray Inc.
 *
 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB
 * in this tarball.
 */

#define REPLICATE_BYTE_MASK	0x0101010101010101
#define MIN_SIZE_FOR_ALIGN	128

/*
 * Optimized memset for kvx architecture
 *
 * In order to optimize memset on kvx, we can use various things:
 * - conditional stores, which avoid branch penalties
 * - store half/word/double/quad/octuple to store up to 32 bytes at a time
 * - hardware loops for the steady case.
 *
 * First, we check whether the size is below a minimum size. If so, we skip
 * the alignment part. Indeed, the kvx supports misaligned accesses, and the
 * penalty for letting it do unaligned accesses is lower than the cost of
 * realigning. So for small sizes, we don't even bother to realign.
 * In order to create the 64-bit pattern, we use sbmm to replicate the pattern
 * byte on all bytes of a register in one call.
 * Once alignment has been reached, we run the hardware loop using store
 * octuple in order to maximize throughput. Care must be taken to align
 * hardware loops on at least 8 bytes for performance.
 * Once the main loop is done, we finish by checking the length and issuing
 * the stores needed for the remaining bytes.
 *
 * A rough C-level sketch of this flow is given in a comment at the end of
 * this file.
 */

#include <sysdep.h>

.align 16
ENTRY(memset)
	/* Preserve return value */
	copyd $r3 = $r0
	/* Replicate the first pattern byte on all bytes */
	sbmm8 $r32 = $r1, REPLICATE_BYTE_MASK
	/* Check if length < MIN_SIZE_FOR_ALIGN */
	compd.geu $r7 = $r2, MIN_SIZE_FOR_ALIGN
	/* Negate address to compute what we need to copy to be aligned on 32 bytes */
	negd $r5 = $r0
	;;
	/* Check if we are aligned on 32 bytes */
	andw $r9 = $r0, 0x1F
	/* Compute the length that will be copied to align on a 32 bytes boundary */
	andw $r6 = $r5, 0x1F
	/*
	 * If size < MIN_SIZE_FOR_ALIGN bytes, go directly to "so"; it will be
	 * done unaligned, but that is still better than what we can do with sb
	 */
	cb.deqz $r7? .Laligned_32
	;;
	/* Remove unaligned part from length */
	sbfd $r2 = $r6, $r2
	/* If we are already aligned on 32 bytes, jump to main "so" loop */
	cb.deqz $r9? .Laligned_32
	/* Check if we need to copy 1 byte */
	andw $r4 = $r5, (1 << 0)
	;;
	/* If we are not aligned, store byte */
	sb.dnez $r4? [$r0] = $r32
	/* Check if we need to copy 2 bytes */
	andw $r4 = $r5, (1 << 1)
	/* Add potentially copied part for next store offset */
	addd $r0 = $r0, $r4
	;;
	sh.dnez $r4? [$r0] = $r32
	/* Check if we need to copy 4 bytes */
	andw $r4 = $r5, (1 << 2)
	addd $r0 = $r0, $r4
	;;
	sw.dnez $r4? [$r0] = $r32
	/* Check if we need to copy 8 bytes */
	andw $r4 = $r5, (1 << 3)
	addd $r0 = $r0, $r4
	/* Copy second part of pattern for sq */
	copyd $r33 = $r32
	;;
	sd.dnez $r4? [$r0] = $r32
	/* Check if we need to copy 16 bytes */
	andw $r4 = $r5, (1 << 4)
	addd $r0 = $r0, $r4
	;;
	sq.dnez $r4? [$r0] = $r32r33
	addd $r0 = $r0, $r4
	;;
.Laligned_32:
	/* Copy second part of pattern for sq */
	copyd $r33 = $r32
	/* Prepare amount of data for 32 bytes stores */
	srld $r10 = $r2, 5
	nop
	nop
	;;
	copyq $r34r35 = $r32, $r33
	/* Remaining bytes for 16 bytes store */
	andw $r8 = $r2, (1 << 4)
	make $r11 = 32
	/* Check if there is enough data for a 32 bytes store */
	cb.deqz $r10? .Laligned_32_done
	;;
	loopdo $r10, .Laligned_32_done
		;;
		so 0[$r0] = $r32r33r34r35
		addd $r0 = $r0, $r11
		;;
	.Laligned_32_done:
	/*
	 * Now that we have handled all the aligned data using 'so', we can
	 * handle the remainder of the length using stores of decreasing size.
	 * We also exploit the fact that we are aligned to simply check the
	 * remaining size.
	 */
	sq.dnez $r8? [$r0] = $r32r33
	addd $r0 = $r0, $r8
	/* Remaining bytes for 8 bytes store */
	andw $r8 = $r2, (1 << 3)
	cb.deqz $r2? .Lmemset_done
	;;
	sd.dnez $r8? [$r0] = $r32
	addd $r0 = $r0, $r8
	/* Remaining bytes for 4 bytes store */
	andw $r8 = $r2, (1 << 2)
	;;
	sw.dnez $r8? [$r0] = $r32
	addd $r0 = $r0, $r8
	/* Remaining bytes for 2 bytes store */
	andw $r8 = $r2, (1 << 1)
	;;
	sh.dnez $r8? [$r0] = $r32
	addd $r0 = $r0, $r8
	;;
	sb.odd $r2? [$r0] = $r32
	/* Restore original value */
	copyd $r0 = $r3
	ret
	;;
.Lmemset_done:
	/* Restore original value */
	copyd $r0 = $r3
	ret
	;;
END(memset)

libc_hidden_def(memset)
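
/*
 * Illustrative C-level sketch of the strategy implemented above. This is an
 * added note, not part of the original Kalray code: the names memset_sketch
 * and store_pattern are made up for illustration, and it uses plain 8-byte
 * stores where the assembly uses conditional 1/2/4/8/16-byte stores and the
 * 32-byte "so" hardware loop.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	// Hypothetical helper: store 'len' bytes of the replicated pattern.
 *	static unsigned char *store_pattern(unsigned char *p, uint64_t pat, size_t len)
 *	{
 *		while (len >= 8) { memcpy(p, &pat, 8); p += 8; len -= 8; }
 *		if (len) { memcpy(p, &pat, len); p += len; }
 *		return p;
 *	}
 *
 *	void *memset_sketch(void *dst, int c, size_t n)
 *	{
 *		unsigned char *p = dst;
 *		// Replicate the pattern byte on all 8 bytes (sbmm8 does this in one insn).
 *		uint64_t pat = (unsigned char)c * 0x0101010101010101ULL;
 *
 *		// For n >= MIN_SIZE_FOR_ALIGN, align to a 32-byte boundary first;
 *		// below that, unaligned stores are cheaper than realigning.
 *		if (n >= 128) {
 *			size_t head = (0 - (uintptr_t)p) & 0x1F;
 *			p = store_pattern(p, pat, head);
 *			n -= head;
 *		}
 *
 *		// Main loop: 32 bytes per iteration (the "so" hardware loop).
 *		for (size_t i = n >> 5; i != 0; i--)
 *			p = store_pattern(p, pat, 32);
 *
 *		// Tail: at most 31 remaining bytes.
 *		store_pattern(p, pat, n & 0x1F);
 *		return dst;
 *	}
 */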