1#! /usr/bin/env perl 2# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# ==================================================================== 10# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke 11# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for 12# the OpenSSL project. 13# ==================================================================== 14 15# 16# Fixed length (n=6), unrolled PPC Montgomery Multiplication 17# 18 19# 2021 20# 21# Although this is a generic implementation for unrolling Montgomery 22# Multiplication for arbitrary values of n, this is currently only 23# used for n = 6 to improve the performance of ECC p384. 24# 25# Unrolling allows intermediate results to be stored in registers, 26# rather than on the stack, improving performance by ~7% compared to 27# the existing PPC assembly code. 28# 29# The ISA 3.0 implementation uses combination multiply/add 30# instructions (maddld, maddhdu) to improve performance by an 31# additional ~10% on Power 9. 32# 33# Finally, saving non-volatile registers into volatile vector 34# registers instead of onto the stack saves a little more. 35# 36# On a Power 9 machine we see an overall improvement of ~18%. 37# 38 39use strict; 40use warnings; 41 42my ($flavour, $output, $dir, $xlate); 43 44# $output is the last argument if it looks like a file (it has an extension) 45# $flavour is the first argument if it doesn't look like a file 46$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 47$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 48 49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 50( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or 51( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or 52die "can't locate ppc-xlate.pl"; 53 54open STDOUT,"| $^X $xlate $flavour \"$output\"" 55 or die "can't call $xlate: $!"; 56 57if ($flavour !~ /64/) { 58 die "bad flavour ($flavour) - only ppc64 permitted"; 59} 60 61my $SIZE_T= 8; 62 63# Registers are global so the code is remotely readable 64 65# Parameters for Montgomery multiplication 66my $sp = "r1"; 67my $toc = "r2"; 68my $rp = "r3"; 69my $ap = "r4"; 70my $bp = "r5"; 71my $np = "r6"; 72my $n0 = "r7"; 73my $num = "r8"; 74 75my $i = "r9"; 76my $c0 = "r10"; 77my $bp0 = "r11"; 78my $bpi = "r11"; 79my $bpj = "r11"; 80my $tj = "r12"; 81my $apj = "r12"; 82my $npj = "r12"; 83my $lo = "r14"; 84my $c1 = "r14"; 85 86# Non-volatile registers used for tp[i] 87# 88# 12 registers are available but the limit on unrolling is 10, 89# since registers from $tp[0] to $tp[$n+1] are used. 90my @tp = ("r20" .. "r31"); 91 92# volatile VSRs for saving non-volatile GPRs - faster than stack 93my @vsrs = ("v32" .. "v46"); 94 95package Mont; 96 97sub new($$) 98{ 99 my ($class, $n) = @_; 100 101 if ($n > 10) { 102 die "Can't unroll for BN length ${n} (maximum 10)" 103 } 104 105 my $self = { 106 code => "", 107 n => $n, 108 }; 109 bless $self, $class; 110 111 return $self; 112} 113 114sub add_code($$) 115{ 116 my ($self, $c) = @_; 117 118 $self->{code} .= $c; 119} 120 121sub get_code($) 122{ 123 my ($self) = @_; 124 125 return $self->{code}; 126} 127 128sub get_function_name($) 129{ 130 my ($self) = @_; 131 132 return "bn_mul_mont_fixed_n" . $self->{n}; 133} 134 135sub get_label($$) 136{ 137 my ($self, $l) = @_; 138 139 return "L" . $l . "_" . $self->{n}; 140} 141 142sub get_labels($@) 143{ 144 my ($self, @labels) = @_; 145 146 my %out = (); 147 148 foreach my $l (@labels) { 149 $out{"$l"} = $self->get_label("$l"); 150 } 151 152 return \%out; 153} 154 155sub nl($) 156{ 157 my ($self) = @_; 158 159 $self->add_code("\n"); 160} 161 162sub copy_result($) 163{ 164 my ($self) = @_; 165 166 my ($n) = $self->{n}; 167 168 for (my $j = 0; $j < $n; $j++) { 169 $self->add_code(<<___); 170 std $tp[$j],`$j*$SIZE_T`($rp) 171___ 172 } 173 174} 175 176sub mul_mont_fixed($) 177{ 178 my ($self) = @_; 179 180 my ($n) = $self->{n}; 181 my $fname = $self->get_function_name(); 182 my $label = $self->get_labels("outer", "enter", "sub", "copy", "end"); 183 184 $self->add_code(<<___); 185 186.globl .${fname} 187.align 5 188.${fname}: 189 190___ 191 192 $self->save_registers(); 193 194 $self->add_code(<<___); 195 ld $n0,0($n0) 196 197 ld $bp0,0($bp) 198 199 ld $apj,0($ap) 200___ 201 202 $self->mul_c_0($tp[0], $apj, $bp0, $c0); 203 204 for (my $j = 1; $j < $n - 1; $j++) { 205 $self->add_code(<<___); 206 ld $apj,`$j*$SIZE_T`($ap) 207___ 208 $self->mul($tp[$j], $apj, $bp0, $c0); 209 } 210 211 $self->add_code(<<___); 212 ld $apj,`($n-1)*$SIZE_T`($ap) 213___ 214 215 $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0); 216 217 $self->add_code(<<___); 218 li $tp[$n+1],0 219 220___ 221 222 $self->add_code(<<___); 223 li $i,0 224 mtctr $num 225 b $label->{"enter"} 226 227.align 4 228$label->{"outer"}: 229 ldx $bpi,$bp,$i 230 231 ld $apj,0($ap) 232___ 233 234 $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0); 235 236 for (my $j = 1; $j < $n; $j++) { 237 $self->add_code(<<___); 238 ld $apj,`$j*$SIZE_T`($ap) 239___ 240 $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0); 241 } 242 243 $self->add_code(<<___); 244 addc $tp[$n],$tp[$n],$c0 245 addze $tp[$n+1],$tp[$n+1] 246___ 247 248 $self->add_code(<<___); 249.align 4 250$label->{"enter"}: 251 mulld $bpi,$tp[0],$n0 252 253 ld $npj,0($np) 254___ 255 256 $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0); 257 258 for (my $j = 1; $j < $n; $j++) { 259 $self->add_code(<<___); 260 ld $npj,`$j*$SIZE_T`($np) 261___ 262 $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0); 263 } 264 265 $self->add_code(<<___); 266 addc $tp[$n-1],$tp[$n],$c0 267 addze $tp[$n],$tp[$n+1] 268 269 addi $i,$i,$SIZE_T 270 bdnz $label->{"outer"} 271 272 and. $tp[$n],$tp[$n],$tp[$n] 273 bne $label->{"sub"} 274 275 cmpld $tp[$n-1],$npj 276 blt $label->{"copy"} 277 278$label->{"sub"}: 279___ 280 281 # 282 # Reduction 283 # 284 285 $self->add_code(<<___); 286 ld $bpj,`0*$SIZE_T`($np) 287 subfc $c1,$bpj,$tp[0] 288 std $c1,`0*$SIZE_T`($rp) 289 290___ 291 for (my $j = 1; $j < $n - 1; $j++) { 292 $self->add_code(<<___); 293 ld $bpj,`$j*$SIZE_T`($np) 294 subfe $c1,$bpj,$tp[$j] 295 std $c1,`$j*$SIZE_T`($rp) 296 297___ 298 } 299 300 $self->add_code(<<___); 301 subfe $c1,$npj,$tp[$n-1] 302 std $c1,`($n-1)*$SIZE_T`($rp) 303 304___ 305 306 $self->add_code(<<___); 307 addme. $tp[$n],$tp[$n] 308 beq $label->{"end"} 309 310$label->{"copy"}: 311___ 312 313 $self->copy_result(); 314 315 $self->add_code(<<___); 316 317$label->{"end"}: 318___ 319 320 $self->restore_registers(); 321 322 $self->add_code(<<___); 323 li r3,1 324 blr 325.size .${fname},.-.${fname} 326___ 327 328} 329 330package Mont::GPR; 331 332our @ISA = ('Mont'); 333 334sub new($$) 335{ 336 my ($class, $n) = @_; 337 338 return $class->SUPER::new($n); 339} 340 341sub save_registers($) 342{ 343 my ($self) = @_; 344 345 my $n = $self->{n}; 346 347 $self->add_code(<<___); 348 std $lo,-8($sp) 349___ 350 351 for (my $j = 0; $j <= $n+1; $j++) { 352 $self->{code}.=<<___; 353 std $tp[$j],-`($j+2)*8`($sp) 354___ 355 } 356 357 $self->add_code(<<___); 358 359___ 360} 361 362sub restore_registers($) 363{ 364 my ($self) = @_; 365 366 my $n = $self->{n}; 367 368 $self->add_code(<<___); 369 ld $lo,-8($sp) 370___ 371 372 for (my $j = 0; $j <= $n+1; $j++) { 373 $self->{code}.=<<___; 374 ld $tp[$j],-`($j+2)*8`($sp) 375___ 376 } 377 378 $self->{code} .=<<___; 379 380___ 381} 382 383# Direct translation of C mul() 384sub mul($$$$$) 385{ 386 my ($self, $r, $a, $w, $c) = @_; 387 388 $self->add_code(<<___); 389 mulld $lo,$a,$w 390 addc $r,$lo,$c 391 mulhdu $c,$a,$w 392 addze $c,$c 393 394___ 395} 396 397# Like mul() but $c is ignored as an input - an optimisation to save a 398# preliminary instruction that would set input $c to 0 399sub mul_c_0($$$$$) 400{ 401 my ($self, $r, $a, $w, $c) = @_; 402 403 $self->add_code(<<___); 404 mulld $r,$a,$w 405 mulhdu $c,$a,$w 406 407___ 408} 409 410# Like mul() but does not to the final addition of CA into $c - an 411# optimisation to save an instruction 412sub mul_last($$$$$$) 413{ 414 my ($self, $r1, $r2, $a, $w, $c) = @_; 415 416 $self->add_code(<<___); 417 mulld $lo,$a,$w 418 addc $r1,$lo,$c 419 mulhdu $c,$a,$w 420 421 addze $r2,$c 422___ 423} 424 425# Like C mul_add() but allow $r_out and $r_in to be different 426sub mul_add($$$$$$) 427{ 428 my ($self, $r_out, $r_in, $a, $w, $c) = @_; 429 430 $self->add_code(<<___); 431 mulld $lo,$a,$w 432 addc $lo,$lo,$c 433 mulhdu $c,$a,$w 434 addze $c,$c 435 addc $r_out,$r_in,$lo 436 addze $c,$c 437 438___ 439} 440 441# Like mul_add() but $c is ignored as an input - an optimisation to save a 442# preliminary instruction that would set input $c to 0 443sub mul_add_c_0($$$$$$) 444{ 445 my ($self, $r_out, $r_in, $a, $w, $c) = @_; 446 447 $self->add_code(<<___); 448 mulld $lo,$a,$w 449 addc $r_out,$r_in,$lo 450 mulhdu $c,$a,$w 451 addze $c,$c 452 453___ 454} 455 456package Mont::GPR_300; 457 458our @ISA = ('Mont::GPR'); 459 460sub new($$) 461{ 462 my ($class, $n) = @_; 463 464 my $mont = $class->SUPER::new($n); 465 466 return $mont; 467} 468 469sub get_function_name($) 470{ 471 my ($self) = @_; 472 473 return "bn_mul_mont_300_fixed_n" . $self->{n}; 474} 475 476sub get_label($$) 477{ 478 my ($self, $l) = @_; 479 480 return "L" . $l . "_300_" . $self->{n}; 481} 482 483# Direct translation of C mul() 484sub mul($$$$$) 485{ 486 my ($self, $r, $a, $w, $c, $last) = @_; 487 488 $self->add_code(<<___); 489 maddld $r,$a,$w,$c 490 maddhdu $c,$a,$w,$c 491 492___ 493} 494 495# Save the last carry as the final entry 496sub mul_last($$$$$) 497{ 498 my ($self, $r1, $r2, $a, $w, $c) = @_; 499 500 $self->add_code(<<___); 501 maddld $r1,$a,$w,$c 502 maddhdu $r2,$a,$w,$c 503 504___ 505} 506 507# Like mul() but $c is ignored as an input - an optimisation to save a 508# preliminary instruction that would set input $c to 0 509sub mul_c_0($$$$$) 510{ 511 my ($self, $r, $a, $w, $c) = @_; 512 513 $self->add_code(<<___); 514 mulld $r,$a,$w 515 mulhdu $c,$a,$w 516 517___ 518} 519 520# Like C mul_add() but allow $r_out and $r_in to be different 521sub mul_add($$$$$$) 522{ 523 my ($self, $r_out, $r_in, $a, $w, $c) = @_; 524 525 $self->add_code(<<___); 526 maddld $lo,$a,$w,$c 527 maddhdu $c,$a,$w,$c 528 addc $r_out,$r_in,$lo 529 addze $c,$c 530 531___ 532} 533 534# Like mul_add() but $c is ignored as an input - an optimisation to save a 535# preliminary instruction that would set input $c to 0 536sub mul_add_c_0($$$$$$) 537{ 538 my ($self, $r_out, $r_in, $a, $w, $c) = @_; 539 540 $self->add_code(<<___); 541 maddld $lo,$a,$w,$r_in 542 maddhdu $c,$a,$w,$r_in 543___ 544 545 if ($r_out ne $lo) { 546 $self->add_code(<<___); 547 mr $r_out,$lo 548___ 549 } 550 551 $self->nl(); 552} 553 554 555package main; 556 557my $code; 558 559$code.=<<___; 560.machine "any" 561.text 562___ 563 564my $mont; 565 566$mont = new Mont::GPR(6); 567$mont->mul_mont_fixed(); 568$code .= $mont->get_code(); 569 570$mont = new Mont::GPR_300(6); 571$mont->mul_mont_fixed(); 572$code .= $mont->get_code(); 573 574$code =~ s/\`([^\`]*)\`/eval $1/gem; 575 576$code.=<<___; 577.asciz "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>" 578___ 579 580print $code; 581close STDOUT or die "error closing STDOUT: $!"; 582