/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <arm32_macros.S>
#include <asm.S>

#define CPU_LE(x...)	x
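/*
 * Note: CPU_LE() is hardwired to emit its argument unconditionally here;
 * byte-order differences are instead handled explicitly via
 * CONFIG_CPU_BIG_ENDIAN in ghash_update below.
 */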

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31
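
	/*
	 * d28 (MASK/SHASH2_p8) and d31 (k48/SHASH2_p64) each carry two
	 * aliases; this is safe because MASK and SHASH2_p64 are only live
	 * on the p64 code path, while SHASH2_p8 and k48 are only live on
	 * the p8 one.
	 */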

	.text
	.fpu		crypto-neon-fp-armv8

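	/*
	 * The b1..b4 arguments are unused by the p64 variant; they exist
	 * only so that __pmull_p64 and __pmull_p8 can be invoked from
	 * ghash_update with an identical argument list.
	 */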
	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm
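
	/*
	 * As a sketch of the combination performed above: writing P for
	 * the 128-bit carry-less product of the 64-bit inputs A and B,
	 * and using the partial products annotated in the code,
	 *
	 *	P = D ^ (L << 8) ^ (M << 16) ^ (N << 24) ^ (K << 32)
	 *
	 * The vand instructions mask off the bits of each partial sum
	 * that would wrap around the 128-bit lane, and the vext.8
	 * rotations by #15/#14/#13/#12 implement the byte shifts.
	 */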

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm
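
	/*
	 * MASK is loaded as 0xe1 << 57 (0xc200000000000000) by the p64
	 * entry point below: 0xe1 is the familiar top byte of the GHASH
	 * reduction constant derived from g(x) = x^128 + x^7 + x^2 + x + 1,
	 * so each vmull.p64 by MASK folds 64 bits of the 256-bit product
	 * back into the lower half.
	 */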

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm
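
	/*
	 * The vshl.i64 steps by #57, #62 and #63, together with the
	 * trailing vshr.u64 sequence, open-code the same fold that the
	 * p64 path obtains from its two vmull.p64 multiplications by MASK.
	 */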

	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.8		{T1}, [ip]
	teq		r0, #0		@ preset Z: 'bne 0b' below must fall
	b		1f		@ through if there are no full blocks

0:	vld1.8		{T1}, [r2]!	@ load next block, advancing r2
	subs		r0, r0, #1	@ Z is set when this was the last one

1:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm
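
	/*
	 * A minimal C sketch of what ghash_update computes; gf128_mul_h()
	 * and load_be128() are illustrative helpers, not part of this
	 * file. The running digest is folded with each 16-byte block and
	 * multiplied by the hash key H in GF(2^128):
	 *
	 *	if (head)
	 *		dg = gf128_mul_h(dg ^ load_be128(head));
	 *	for (; blocks > 0; blocks--, src += 16)
	 *		dg = gf128_mul_h(dg ^ load_be128(src));
	 */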

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
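	/*
	 * Per the AAPCS the arguments arrive as r0 = blocks, r1 = dg,
	 * r2 = src and r3 = k, with head spilled to the stack (picked up
	 * into ip by ghash_update above).
	 */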
FUNC pmull_ghash_update_p64 , :
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p64, SHASH_L, SHASH_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
END_FUNC pmull_ghash_update_p64

FUNC pmull_ghash_update_p8 , :
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

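	/*
	 * Precompute the byte-rotated copies of H that __pmull_p8 uses as
	 * B1..B4, so the rotations are hoisted out of the per-block loop.
	 */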
	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

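	/* byte masks used by __pmull_p8 to trim the partial products */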
	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
END_FUNC pmull_ghash_update_p8