/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <arm32_macros.S>
#include <asm.S>

#define CPU_LE(x...)	x

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	.text
	.fpu		crypto-neon-fp-armv8

	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
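	//
	// The multiplication step in ghash_update leaves a 256-bit product
	// in XH:XL, with the middle Karatsuba term in XM. The reduction
	// first folds XM into XH:XL, then reduces the result modulo the
	// GHASH field polynomial x^128 + x^7 + x^2 + x + 1, using MASK
	// (initialised from the 0xe1 constant of that polynomial in
	// pmull_ghash_update_p64 below) as the vmull.p64 operand.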
	//
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.8		{T1}, [ip]
	teq		r0, #0
	b		1f

0:	vld1.8		{T1}, [r2]!
	subs		r0, r0, #1

1:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
FUNC pmull_ghash_update_p64 , :
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p64, SHASH_L, SHASH_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
END_FUNC pmull_ghash_update_p64

FUNC pmull_ghash_update_p8 , :
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
END_FUNC pmull_ghash_update_p8
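
/*
 * Illustrative caller-side sketch (hypothetical C code; the layout of
 * struct ghash_key and the byte order of the key it holds are defined
 * by the C caller, not by this file):
 *
 *	u64 dg[2] = { 0, 0 };	// running GHASH state, updated in place
 *	pmull_ghash_update_p64(nblocks, dg, src, key, NULL);
 *
 * A NULL 'head' skips the optional head block (the ldr/teq test at the
 * top of ghash_update). pmull_ghash_update_p8 has the same signature
 * and is the fallback for cores without the 64x64->128 vmull.p64
 * instruction.
 */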