1#! /usr/bin/env perl
2# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9# ====================================================================
10# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
11# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
12# the OpenSSL project.
13# ====================================================================
14
15#
16# Fixed length (n=6), unrolled PPC Montgomery Multiplication
17#
18
19# 2021
20#
21# Although this is a generic implementation for unrolling Montgomery
22# Multiplication for arbitrary values of n, this is currently only
23# used for n = 6 to improve the performance of ECC p384.
24#
25# Unrolling allows intermediate results to be stored in registers,
26# rather than on the stack, improving performance by ~7% compared to
27# the existing PPC assembly code.
28#
29# The ISA 3.0 implementation uses combination multiply/add
30# instructions (maddld, maddhdu) to improve performance by an
31# additional ~10% on Power 9.
32#
33# Finally, saving non-volatile registers into volatile vector
34# registers instead of onto the stack saves a little more.
35#
36# On a Power 9 machine we see an overall improvement of ~18%.
37#
38
39use strict;
40use warnings;
41
42my ($flavour, $output, $dir, $xlate);
43
44# $output is the last argument if it looks like a file (it has an extension)
45# $flavour is the first argument if it doesn't look like a file
46$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
47$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
51( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
52die "can't locate ppc-xlate.pl";
53
54open STDOUT,"| $^X $xlate $flavour \"$output\""
55    or die "can't call $xlate: $!";
56
57if ($flavour !~ /64/) {
58	die "bad flavour ($flavour) - only ppc64 permitted";
59}
60
61my $SIZE_T= 8;
62
63# Registers are global so the code is remotely readable
64
65# Parameters for Montgomery multiplication
66my $sp	= "r1";
67my $toc	= "r2";
68my $rp	= "r3";
69my $ap	= "r4";
70my $bp	= "r5";
71my $np	= "r6";
72my $n0	= "r7";
73my $num	= "r8";
74
75my $i	= "r9";
76my $c0	= "r10";
77my $bp0	= "r11";
78my $bpi	= "r11";
79my $bpj	= "r11";
80my $tj	= "r12";
81my $apj	= "r12";
82my $npj	= "r12";
83my $lo	= "r14";
84my $c1	= "r14";
85
86# Non-volatile registers used for tp[i]
87#
88# 12 registers are available but the limit on unrolling is 10,
89# since registers from $tp[0] to $tp[$n+1] are used.
90my @tp = ("r20" .. "r31");
91
92# volatile VSRs for saving non-volatile GPRs - faster than stack
93my @vsrs = ("v32" .. "v46");
94
95package Mont;
96
97sub new($$)
98{
99	my ($class, $n) = @_;
100
101	if ($n > 10) {
102		die "Can't unroll for BN length ${n} (maximum 10)"
103	}
104
105	my $self = {
106		code => "",
107		n => $n,
108	};
109	bless $self, $class;
110
111	return $self;
112}
113
114sub add_code($$)
115{
116	my ($self, $c) = @_;
117
118	$self->{code} .= $c;
119}
120
121sub get_code($)
122{
123	my ($self) = @_;
124
125	return $self->{code};
126}
127
128sub get_function_name($)
129{
130	my ($self) = @_;
131
132	return "bn_mul_mont_fixed_n" . $self->{n};
133}
134
135sub get_label($$)
136{
137	my ($self, $l) = @_;
138
139	return "L" . $l . "_" . $self->{n};
140}
141
142sub get_labels($@)
143{
144	my ($self, @labels) = @_;
145
146	my %out = ();
147
148	foreach my $l (@labels) {
149		$out{"$l"} = $self->get_label("$l");
150	}
151
152	return \%out;
153}
154
155sub nl($)
156{
157	my ($self) = @_;
158
159	$self->add_code("\n");
160}
161
162sub copy_result($)
163{
164	my ($self) = @_;
165
166	my ($n) = $self->{n};
167
168	for (my $j = 0; $j < $n; $j++) {
169		$self->add_code(<<___);
170	std		$tp[$j],`$j*$SIZE_T`($rp)
171___
172	}
173
174}
175
176sub mul_mont_fixed($)
177{
178	my ($self) = @_;
179
180	my ($n) = $self->{n};
181	my $fname = $self->get_function_name();
182	my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");
183
184	$self->add_code(<<___);
185
186.globl	.${fname}
187.align	5
188.${fname}:
189
190___
191
192	$self->save_registers();
193
194	$self->add_code(<<___);
195	ld		$n0,0($n0)
196
197	ld		$bp0,0($bp)
198
199	ld		$apj,0($ap)
200___
201
202	$self->mul_c_0($tp[0], $apj, $bp0, $c0);
203
204	for (my $j = 1; $j < $n - 1; $j++) {
205		$self->add_code(<<___);
206	ld		$apj,`$j*$SIZE_T`($ap)
207___
208		$self->mul($tp[$j], $apj, $bp0, $c0);
209	}
210
211	$self->add_code(<<___);
212	ld		$apj,`($n-1)*$SIZE_T`($ap)
213___
214
215	$self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);
216
217	$self->add_code(<<___);
218	li		$tp[$n+1],0
219
220___
221
222	$self->add_code(<<___);
223	li		$i,0
224	mtctr		$num
225	b		$label->{"enter"}
226
227.align	4
228$label->{"outer"}:
229	ldx		$bpi,$bp,$i
230
231	ld		$apj,0($ap)
232___
233
234	$self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);
235
236	for (my $j = 1; $j < $n; $j++) {
237		$self->add_code(<<___);
238	ld		$apj,`$j*$SIZE_T`($ap)
239___
240		$self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
241	}
242
243	$self->add_code(<<___);
244	addc		$tp[$n],$tp[$n],$c0
245	addze		$tp[$n+1],$tp[$n+1]
246___
247
248	$self->add_code(<<___);
249.align	4
250$label->{"enter"}:
251	mulld		$bpi,$tp[0],$n0
252
253	ld		$npj,0($np)
254___
255
256	$self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);
257
258	for (my $j = 1; $j < $n; $j++) {
259		$self->add_code(<<___);
260	ld		$npj,`$j*$SIZE_T`($np)
261___
262		$self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
263	}
264
265	$self->add_code(<<___);
266	addc		$tp[$n-1],$tp[$n],$c0
267	addze		$tp[$n],$tp[$n+1]
268
269	addi		$i,$i,$SIZE_T
270	bdnz		$label->{"outer"}
271
272	and.		$tp[$n],$tp[$n],$tp[$n]
273	bne		$label->{"sub"}
274
275	cmpld	$tp[$n-1],$npj
276	blt		$label->{"copy"}
277
278$label->{"sub"}:
279___
280
281	#
282	# Reduction
283	#
284
285		$self->add_code(<<___);
286	ld		$bpj,`0*$SIZE_T`($np)
287	subfc		$c1,$bpj,$tp[0]
288	std		$c1,`0*$SIZE_T`($rp)
289
290___
291	for (my $j = 1; $j < $n - 1; $j++) {
292		$self->add_code(<<___);
293	ld		$bpj,`$j*$SIZE_T`($np)
294	subfe		$c1,$bpj,$tp[$j]
295	std		$c1,`$j*$SIZE_T`($rp)
296
297___
298	}
299
300		$self->add_code(<<___);
301	subfe		$c1,$npj,$tp[$n-1]
302	std		$c1,`($n-1)*$SIZE_T`($rp)
303
304___
305
306	$self->add_code(<<___);
307	addme.		$tp[$n],$tp[$n]
308	beq		$label->{"end"}
309
310$label->{"copy"}:
311___
312
313	$self->copy_result();
314
315	$self->add_code(<<___);
316
317$label->{"end"}:
318___
319
320	$self->restore_registers();
321
322	$self->add_code(<<___);
323	li		r3,1
324	blr
325.size .${fname},.-.${fname}
326___
327
328}
329
330package Mont::GPR;
331
332our @ISA = ('Mont');
333
334sub new($$)
335{
336    my ($class, $n) = @_;
337
338    return $class->SUPER::new($n);
339}
340
341sub save_registers($)
342{
343	my ($self) = @_;
344
345	my $n = $self->{n};
346
347	$self->add_code(<<___);
348	std	$lo,-8($sp)
349___
350
351	for (my $j = 0; $j <= $n+1; $j++) {
352		$self->{code}.=<<___;
353	std	$tp[$j],-`($j+2)*8`($sp)
354___
355	}
356
357	$self->add_code(<<___);
358
359___
360}
361
362sub restore_registers($)
363{
364	my ($self) = @_;
365
366	my $n = $self->{n};
367
368	$self->add_code(<<___);
369	ld	$lo,-8($sp)
370___
371
372	for (my $j = 0; $j <= $n+1; $j++) {
373		$self->{code}.=<<___;
374	ld	$tp[$j],-`($j+2)*8`($sp)
375___
376	}
377
378	$self->{code} .=<<___;
379
380___
381}
382
383# Direct translation of C mul()
384sub mul($$$$$)
385{
386	my ($self, $r, $a, $w, $c) = @_;
387
388	$self->add_code(<<___);
389	mulld		$lo,$a,$w
390	addc		$r,$lo,$c
391	mulhdu		$c,$a,$w
392	addze		$c,$c
393
394___
395}
396
397# Like mul() but $c is ignored as an input - an optimisation to save a
398# preliminary instruction that would set input $c to 0
399sub mul_c_0($$$$$)
400{
401	my ($self, $r, $a, $w, $c) = @_;
402
403	$self->add_code(<<___);
404	mulld		$r,$a,$w
405	mulhdu		$c,$a,$w
406
407___
408}
409
410# Like mul() but does not to the final addition of CA into $c - an
411# optimisation to save an instruction
412sub mul_last($$$$$$)
413{
414	my ($self, $r1, $r2, $a, $w, $c) = @_;
415
416	$self->add_code(<<___);
417	mulld		$lo,$a,$w
418	addc		$r1,$lo,$c
419	mulhdu		$c,$a,$w
420
421	addze		$r2,$c
422___
423}
424
425# Like C mul_add() but allow $r_out and $r_in to be different
426sub mul_add($$$$$$)
427{
428	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
429
430	$self->add_code(<<___);
431	mulld		$lo,$a,$w
432	addc		$lo,$lo,$c
433	mulhdu		$c,$a,$w
434	addze		$c,$c
435	addc		$r_out,$r_in,$lo
436	addze		$c,$c
437
438___
439}
440
441# Like mul_add() but $c is ignored as an input - an optimisation to save a
442# preliminary instruction that would set input $c to 0
443sub mul_add_c_0($$$$$$)
444{
445	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
446
447	$self->add_code(<<___);
448	mulld		$lo,$a,$w
449	addc		$r_out,$r_in,$lo
450	mulhdu		$c,$a,$w
451	addze		$c,$c
452
453___
454}
455
456package Mont::GPR_300;
457
458our @ISA = ('Mont::GPR');
459
460sub new($$)
461{
462	my ($class, $n) = @_;
463
464	my $mont = $class->SUPER::new($n);
465
466	return $mont;
467}
468
469sub get_function_name($)
470{
471	my ($self) = @_;
472
473	return "bn_mul_mont_300_fixed_n" . $self->{n};
474}
475
476sub get_label($$)
477{
478	my ($self, $l) = @_;
479
480	return "L" . $l . "_300_" . $self->{n};
481}
482
483# Direct translation of C mul()
484sub mul($$$$$)
485{
486	my ($self, $r, $a, $w, $c, $last) = @_;
487
488	$self->add_code(<<___);
489	maddld		$r,$a,$w,$c
490	maddhdu		$c,$a,$w,$c
491
492___
493}
494
495# Save the last carry as the final entry
496sub mul_last($$$$$)
497{
498	my ($self, $r1, $r2, $a, $w, $c) = @_;
499
500	$self->add_code(<<___);
501	maddld		$r1,$a,$w,$c
502	maddhdu		$r2,$a,$w,$c
503
504___
505}
506
507# Like mul() but $c is ignored as an input - an optimisation to save a
508# preliminary instruction that would set input $c to 0
509sub mul_c_0($$$$$)
510{
511	my ($self, $r, $a, $w, $c) = @_;
512
513	$self->add_code(<<___);
514	mulld          $r,$a,$w
515	mulhdu          $c,$a,$w
516
517___
518}
519
520# Like C mul_add() but allow $r_out and $r_in to be different
521sub mul_add($$$$$$)
522{
523	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
524
525	$self->add_code(<<___);
526	maddld		$lo,$a,$w,$c
527	maddhdu		$c,$a,$w,$c
528	addc		$r_out,$r_in,$lo
529	addze		$c,$c
530
531___
532}
533
534# Like mul_add() but $c is ignored as an input - an optimisation to save a
535# preliminary instruction that would set input $c to 0
536sub mul_add_c_0($$$$$$)
537{
538	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
539
540	$self->add_code(<<___);
541	maddld		$lo,$a,$w,$r_in
542	maddhdu		$c,$a,$w,$r_in
543___
544
545	if ($r_out ne $lo) {
546		$self->add_code(<<___);
547	mr			$r_out,$lo
548___
549	}
550
551	$self->nl();
552}
553
554
555package main;
556
557my $code;
558
559$code.=<<___;
560.machine "any"
561.text
562___
563
564my $mont;
565
566$mont = new Mont::GPR(6);
567$mont->mul_mont_fixed();
568$code .= $mont->get_code();
569
570$mont = new Mont::GPR_300(6);
571$mont->mul_mont_fixed();
572$code .= $mont->get_code();
573
574$code =~ s/\`([^\`]*)\`/eval $1/gem;
575
576$code.=<<___;
577.asciz  "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
578___
579
580print $code;
581close STDOUT or die "error closing STDOUT: $!";
582