/*
 * Copyright (C) 2019 Kalray Inc.
 *
 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB
 * in this tarball.
 */

#define REPLICATE_BYTE_MASK	0x0101010101010101
#define MIN_SIZE_FOR_ALIGN	128

/*
 * Optimized memset for kvx architecture
 *
 * In order to optimize memset on kvx, we can use various things:
 * - conditional stores, which avoid branch penalties
 * - store half/word/double/quad/octuple to store up to 32 bytes at a time
 * - hardware loops for the steady-state case.
 *
 * First, we check if the size is below a minimum size. If so, we skip the
 * alignment part. Indeed, the kvx supports misaligned accesses and the
 * penalty for letting it do unaligned accesses is lower than the cost of
 * realigning, so for small sizes we do not even bother to realign.
 * In order to create the 64-bit pattern, we use sbmm to replicate the pattern
 * byte across all bytes of a register in one call.
 * Once alignment has been reached, we can run the hardware loop using store
 * octuple in order to maximize throughput. Care must be taken to align
 * hardware loops on at least 8 bytes for performance.
 * Once the main loop is done, we finish by checking the remaining length and
 * issuing the stores needed for the remaining bytes.
 */
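/*
 * The structure described above can be illustrated with the following rough
 * C sketch. This is purely illustrative and not the actual implementation:
 * memset_sketch is a made-up name, and plain byte loops / memcpy stand in
 * for the conditional sb/sh/sw/sd/sq stores and the 'so' hardware loop used
 * in the assembly below.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void *memset_sketch(void *dst, int c, size_t len)
 *	{
 *		unsigned char *p = dst;
 *		// sbmm8 equivalent: replicate the byte into a 64-bit pattern
 *		uint64_t pat = (uint64_t)(unsigned char)c * 0x0101010101010101ULL;
 *
 *		if (len >= 128) {	// MIN_SIZE_FOR_ALIGN
 *			// Head: bytes needed to reach a 32-byte boundary,
 *			// one conditional 1/2/4/8/16-byte store each
 *			size_t head = -(uintptr_t)p & 0x1F;
 *			len -= head;
 *			for (size_t sz = 1; sz <= 16; sz <<= 1)
 *				if (head & sz) {
 *					for (size_t i = 0; i < sz; i++)
 *						p[i] = (unsigned char)pat;
 *					p += sz;
 *				}
 *		}
 *		// Main loop: 32 bytes per iteration, like the 'so' hardware loop
 *		for (size_t n = len >> 5; n != 0; n--) {
 *			for (size_t i = 0; i < 32; i += 8)
 *				memcpy(p + i, &pat, 8);
 *			p += 32;
 *		}
 *		// Tail: one conditional 16/8/4/2/1-byte store each
 *		for (size_t sz = 16; sz != 0; sz >>= 1)
 *			if (len & sz) {
 *				for (size_t i = 0; i < sz; i++)
 *					p[i] = (unsigned char)pat;
 *				p += sz;
 *			}
 *		return dst;
 *	}
 */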

#include <sysdep.h>

.align 16
ENTRY(memset)
	/* Preserve return value */
	copyd $r3 = $r0
	/* Replicate the first pattern byte on all bytes */
	sbmm8 $r32 = $r1, REPLICATE_BYTE_MASK
	/* Check if length < MIN_SIZE_FOR_ALIGN */
	compd.geu $r7 = $r2, MIN_SIZE_FOR_ALIGN
	/* Negate address to compute what we need to store to be aligned on 32 bytes */
	negd $r5 = $r0
	;;
	/* Check if we are aligned on 32 bytes */
	andw $r9 = $r0, 0x1F
	/* Compute the length that will be stored to reach a 32-byte boundary */
	andw $r6 = $r5, 0x1F
	/*
	 * If size < MIN_SIZE_FOR_ALIGN bytes, go directly to 'so': it will be
	 * done unaligned, but that is still better than what we can do with sb
	 */
	cb.deqz $r7? .Laligned_32
	;;
	/* Remove unaligned part from length */
	sbfd $r2 = $r6, $r2
	/* If we are already aligned on 32 bytes, jump to main "so" loop */
	cb.deqz $r9? .Laligned_32
	/* Check if we need to store 1 byte */
	andw $r4 = $r5, (1 << 0)
	;;
	/* If we are not aligned, store byte */
	sb.dnez $r4? [$r0] = $r32
	/* Check if we need to store 2 bytes */
	andw $r4 = $r5, (1 << 1)
	/* Add the potentially stored length to get the next store offset */
	addd $r0 = $r0, $r4
	;;
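	/* Store 2 bytes if needed to reach alignment */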
	sh.dnez $r4? [$r0] = $r32
	/* Check if we need to store 4 bytes */
	andw $r4 = $r5, (1 << 2)
	addd $r0 = $r0, $r4
	;;
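	/* Store 4 bytes if needed to reach alignment */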
	sw.dnez $r4? [$r0] = $r32
	/* Check if we need to store 8 bytes */
	andw $r4 = $r5, (1 << 3)
	addd $r0 = $r0, $r4
	/* Copy second part of pattern for sq */
	copyd $r33 = $r32
	;;
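	/* Store 8 bytes if needed to reach alignment */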
	sd.dnez $r4? [$r0] = $r32
	/* Check if we need to store 16 bytes */
	andw $r4 = $r5, (1 << 4)
	addd $r0 = $r0, $r4
	;;
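	/* Store 16 bytes if needed to reach alignment */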
	sq.dnez $r4? [$r0] = $r32r33
	addd $r0 = $r0, $r4
	;;
.Laligned_32:
	/* Copy second part of pattern for sq */
	copyd $r33 = $r32
	/* Compute the number of 32-byte stores for the hardware loop */
	srld $r10 = $r2, 5
	nop
	nop
	;;
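	/* Duplicate the pattern pair into the two extra registers used by 'so' */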
	copyq $r34r35 = $r32, $r33
	/* Remaining bytes for a 16-byte store */
	andw $r8 = $r2, (1 << 4)
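	/* Pointer increment: 'so' stores 32 bytes per iteration */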
	make $r11 = 32
	/* Check if there is enough data for at least one 32-byte store */
	cb.deqz $r10? .Laligned_32_done
	;;
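	/* Hardware loop: one 32-byte 'so' store per iteration */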
	loopdo $r10, .Laligned_32_done
		;;
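		/* Store 32 bytes of pattern and advance the destination pointer */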
		so 0[$r0] = $r32r33r34r35
		addd $r0 = $r0, $r11
		;;
	.Laligned_32_done:
	/*
	 * Now that we have handled all the aligned data using 'so', we can
	 * handle the remainder of the length using stores of decreasing size.
	 * We also exploit the fact that we are aligned to simply test the
	 * remaining size bits.
	 */
	sq.dnez $r8? [$r0] = $r32r33
	addd $r0 = $r0, $r8
	/* Remaining bytes for an 8-byte store */
	andw $r8 = $r2, (1 << 3)
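	/* If no bytes remain, we are done */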
	cb.deqz $r2? .Lmemset_done
	;;
	sd.dnez $r8? [$r0] = $r32
	addd $r0 = $r0, $r8
	/* Remaining bytes for a 4-byte store */
	andw $r8 = $r2, (1 << 2)
	;;
	sw.dnez $r8? [$r0] = $r32
	addd $r0 = $r0, $r8
	/* Remaining bytes for a 2-byte store */
	andw $r8 = $r2, (1 << 1)
	;;
	sh.dnez $r8? [$r0] = $r32
	addd $r0 = $r0, $r8
	;;
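	/* Store the last byte if the remaining length is odd */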
	sb.odd $r2? [$r0] = $r32
	/* Restore original value */
	copyd $r0 = $r3
	ret
	;;
.Lmemset_done:
	/* Restore original value */
	copyd $r0 = $r3
	ret
	;;
END(memset)

libc_hidden_def(memset)