/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using the hardware-provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ was introduced by Intel alongside SSE4.2; the reference can be
 * found at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *          Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>


.section .rodata
.align 16
/*
 * [(x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
 * #define CONSTANT_R1 0x154442bd4LL
 *
 * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
 * #define CONSTANT_R2 0x1c6e41596LL
 */
.Lconstant_R2R1:
        .octa 0x00000001c6e415960000000154442bd4
/*
 * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
 * #define CONSTANT_R3 0x1751997d0LL
 *
 * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
 * #define CONSTANT_R4 0x0ccaa009eLL
 */
.Lconstant_R4R3:
        .octa 0x00000000ccaa009e00000001751997d0
/*
 * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
 * #define CONSTANT_R5 0x163cd6124LL
 */
.Lconstant_R5:
        .octa 0x00000000000000000000000163cd6124
.Lconstant_mask32:
        .octa 0x000000000000000000000000FFFFFFFF
/*
 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 *
 * Barrett reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
 * #define CONSTANT_RU 0x1F7011641LL
 */
.Lconstant_RUpoly:
        .octa 0x00000001F701164100000001DB710641

#define CONSTANT %xmm0

#ifdef __x86_64__
#define BUF %rdi
#define LEN %rsi
#define CRC %edx
#else
#define BUF %eax
#define LEN %edx
#define CRC %ecx
#endif



.text
/**
 * Calculate crc32
 * BUF - buffer (16 bytes aligned)
 * LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63
 * CRC - initial crc32
 * return %eax crc32
 * uint crc32_pclmul_le_16(unsigned char const *buffer,
 *                         size_t len, uint crc32)
 */

SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
        movdqa  (BUF), %xmm1
        movdqa  0x10(BUF), %xmm2
        movdqa  0x20(BUF), %xmm3
        movdqa  0x30(BUF), %xmm4
        movd    CRC, CONSTANT
        pxor    CONSTANT, %xmm1
        sub     $0x40, LEN
        add     $0x40, BUF
        cmp     $0x40, LEN
        jb      less_64

#ifdef __x86_64__
        movdqa  .Lconstant_R2R1(%rip), CONSTANT
#else
        movdqa  .Lconstant_R2R1, CONSTANT
#endif

loop_64:/* 64 bytes full cache line folding */
        prefetchnta 0x40(BUF)
        movdqa  %xmm1, %xmm5
        movdqa  %xmm2, %xmm6
        movdqa  %xmm3, %xmm7
#ifdef __x86_64__
        movdqa  %xmm4, %xmm8
#endif
        pclmulqdq $0x00, CONSTANT, %xmm1
        pclmulqdq $0x00, CONSTANT, %xmm2
        pclmulqdq $0x00, CONSTANT, %xmm3
#ifdef __x86_64__
        pclmulqdq $0x00, CONSTANT, %xmm4
#endif
        pclmulqdq $0x11, CONSTANT, %xmm5
        pclmulqdq $0x11, CONSTANT, %xmm6
        pclmulqdq $0x11, CONSTANT, %xmm7
#ifdef __x86_64__
        pclmulqdq $0x11, CONSTANT, %xmm8
#endif
        pxor    %xmm5, %xmm1
        pxor    %xmm6, %xmm2
        pxor    %xmm7, %xmm3
#ifdef __x86_64__
        pxor    %xmm8, %xmm4
#else
        /* xmm8 is not available on 32-bit x86 */
        movdqa  %xmm4, %xmm5
        pclmulqdq $0x00, CONSTANT, %xmm4
        pclmulqdq $0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm4
#endif

        pxor    (BUF), %xmm1
        pxor    0x10(BUF), %xmm2
        pxor    0x20(BUF), %xmm3
        pxor    0x30(BUF), %xmm4

        sub     $0x40, LEN
        add     $0x40, BUF
        cmp     $0x40, LEN
        jge     loop_64
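/*
 * Each loop_64 iteration above folds the 64-byte accumulator (xmm1..xmm4)
 * forward over the next cache line: per 128-bit lane,
 *
 *      new = (old.low64 * R1) xor (old.high64 * R2) xor next_16_bytes
 *
 * where the multiplications are carry-less and the (R1, R2) pair advances
 * the old remainder by 512 bits (4 * 128) modulo P(x).  The less_64 code
 * below applies the same step with the R4/R3 constants to collapse the
 * four lanes into a single 128-bit value and to absorb any remaining
 * 16-byte blocks.
 *
 * A minimal C model of one such fold step, for illustration only: the type
 * u128, the helper clmul64() (128-bit carry-less product of two 64-bit
 * values) and fold_128() are hypothetical and not part of this file.
 *
 *      struct u128 { u64 lo, hi; };
 *
 *      static struct u128 fold_128(struct u128 acc, struct u128 data,
 *                                  u64 k_lo, u64 k_hi)
 *      {
 *              struct u128 a = clmul64(acc.lo, k_lo);  // pclmulqdq $0x00
 *              struct u128 b = clmul64(acc.hi, k_hi);  // pclmulqdq $0x11
 *
 *              return (struct u128){ .lo = a.lo ^ b.lo ^ data.lo,
 *                                    .hi = a.hi ^ b.hi ^ data.hi };
 *      }
 *
 * For loop_64 (k_lo, k_hi) = (R1, R2); for the folds below it is (R3, R4).
 */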
less_64:/* Folding cache lines into 128 bit */
#ifdef __x86_64__
        movdqa  .Lconstant_R4R3(%rip), CONSTANT
#else
        movdqa  .Lconstant_R4R3, CONSTANT
#endif
        prefetchnta (BUF)

        movdqa  %xmm1, %xmm5
        pclmulqdq $0x00, CONSTANT, %xmm1
        pclmulqdq $0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm1
        pxor    %xmm2, %xmm1

        movdqa  %xmm1, %xmm5
        pclmulqdq $0x00, CONSTANT, %xmm1
        pclmulqdq $0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm1
        pxor    %xmm3, %xmm1

        movdqa  %xmm1, %xmm5
        pclmulqdq $0x00, CONSTANT, %xmm1
        pclmulqdq $0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm1
        pxor    %xmm4, %xmm1

        cmp     $0x10, LEN
        jb      fold_64
loop_16:/* Folding the rest of the buffer into 128 bit */
        movdqa  %xmm1, %xmm5
        pclmulqdq $0x00, CONSTANT, %xmm1
        pclmulqdq $0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm1
        pxor    (BUF), %xmm1
        sub     $0x10, LEN
        add     $0x10, BUF
        cmp     $0x10, LEN
        jge     loop_16

fold_64:
        /* Perform the last 64-bit fold; this also appends 32 zero bits
         * to the input stream. */
        pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
        psrldq  $0x08, %xmm1
        pxor    CONSTANT, %xmm1

        /* final 32-bit fold */
        movdqa  %xmm1, %xmm2
#ifdef __x86_64__
        movdqa  .Lconstant_R5(%rip), CONSTANT
        movdqa  .Lconstant_mask32(%rip), %xmm3
#else
        movdqa  .Lconstant_R5, CONSTANT
        movdqa  .Lconstant_mask32, %xmm3
#endif
        psrldq  $0x04, %xmm2
        pand    %xmm3, %xmm1
        pclmulqdq $0x00, CONSTANT, %xmm1
        pxor    %xmm2, %xmm1

        /* Finish up with the bit-reversed Barrett reduction 64 ==> 32 bits */
#ifdef __x86_64__
        movdqa  .Lconstant_RUpoly(%rip), CONSTANT
#else
        movdqa  .Lconstant_RUpoly, CONSTANT
#endif
        movdqa  %xmm1, %xmm2
        pand    %xmm3, %xmm1
        pclmulqdq $0x10, CONSTANT, %xmm1
        pand    %xmm3, %xmm1
        pclmulqdq $0x00, CONSTANT, %xmm1
        pxor    %xmm2, %xmm1
        pextrd  $0x01, %xmm1, %eax

        RET
SYM_FUNC_END(crc32_pclmul_le_16)
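/*
 * Usage note (illustrative, not part of the assembled code): this routine
 * only processes a 16-byte-aligned buffer whose length is a multiple of 16
 * and greater than 63 bytes; a C-level wrapper is expected to deal with FPU
 * context and with any unaligned head/tail.  The sketch below shows one
 * possible wrapper under those assumptions, using the generic crc32_le()
 * helper and kernel_fpu_begin()/kernel_fpu_end(); the name crc32_pclmul()
 * and the length threshold are hypothetical and are not a definition of the
 * real glue code (see crc32-pclmul_glue.c for that).
 *
 *      uint crc32_pclmul_le_16(unsigned char const *buffer, size_t len,
 *                              uint crc32);
 *
 *      static u32 crc32_pclmul(u32 crc, const u8 *p, size_t len)
 *      {
 *              size_t prealign, tail;
 *
 *              // Too short to fold: fall back to the byte-wise helper.
 *              if (len < 64 + 15)
 *                      return crc32_le(crc, p, len);
 *
 *              // Consume bytes up to the next 16-byte boundary.
 *              prealign = -(unsigned long)p & 15;
 *              if (prealign) {
 *                      crc = crc32_le(crc, p, prealign);
 *                      p += prealign;
 *                      len -= prealign;
 *              }
 *
 *              // Keep the folded length a multiple of 16 bytes.
 *              tail = len & 15;
 *
 *              kernel_fpu_begin();
 *              crc = crc32_pclmul_le_16(p, len - tail, crc);
 *              kernel_fpu_end();
 *
 *              if (tail)
 *                      crc = crc32_le(crc, p + len - tail, tail);
 *              return crc;
 *      }
 */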