#! /usr/bin/env perl
# Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096 bit key lengths. The baseline is code generated
# by the vendor compiler with '-tune host' and using in-line
# assembler. Other benchmarks improve by 15-20%. To anchor it to
# something else, the code delivers approximately the same performance
# per GHz as AMD64, i.e. comparing a 1GHz 21264 against a 2GHz Opteron
# you will observe roughly a 2x difference.

$output=pop;
open STDOUT,">$output";

# int bn_mul_mont(
$rp="a0";	# BN_ULONG *rp,
$ap="a1";	# const BN_ULONG *ap,
$bp="a2";	# const BN_ULONG *bp,
$np="a3";	# const BN_ULONG *np,
$n0="a4";	# const BN_ULONG *n0,
$num="a5";	# int num);

$lo0="t0";
$hi0="t1";
$lo1="t2";
$hi1="t3";
$aj="t4";
$bi="t5";
$nj="t6";
$tp="t7";
$alo="t8";
$ahi="t9";
$nlo="t10";
$nhi="t11";
$tj="t12";
$i="s3";
$j="s4";
$m1="s5";
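
# The generated code implements plain word-serial Montgomery
# multiplication. A rough sketch of what the loops below compute
# (not of the actual instruction scheduling), with n0 = -np[0]^-1
# mod 2^64 precomputed by the caller:
#
#	tp = 0;
#	for (i=0; i<num; i++) {
#		m1 = (tp[0] + ap[0]*bp[i])*n0 mod 2^64;
#		tp = (tp + ap*bp[i] + np*m1) / 2^64;	# low word cancels
#	}
#	rp = (tp >= np) ? tp - np : tp;		# final conditional subtraction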
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-48
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	$num,$num
	mov	0,v0
	cmplt	$num,4,AT
	bne	AT,.Lexit

	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT
	ldq	$aj,8($ap)
	subq	sp,AT,sp
	ldq	$bi,0($bp)	# bp[0]
	lda	AT,-4096(zero)	# mov	-4096,AT
	ldq	$n0,0($n0)
	and	sp,AT,sp

	mulq	$hi0,$bi,$lo0
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0
	ldq	$nj,8($np)
	mulq	$lo0,$n0,$m1
	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1
	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp
	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj
	.align	4
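# 1st outer iteration (i=0): multiply-accumulate ap[j]*bp[0] and np[j]*m1
# into tp[j], carrying the two products' high halves in $hi0 and $hi1.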
.L1st:
	.set	noreorder
	ldq	$aj,0($aj)
	addl	$j,1,$j
	ldq	$nj,0($nj)
	lda	$tp,8($tp)
	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1
	mulq	$nj,$m1,$nlo
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj
	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj
	stq	$lo1,-8($tp)
	nop
	unop
	bne	$tj,.L1st
	.set	reorder

	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	stq	$lo1,0($tp)
	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)
	stq	AT,16($tp)

	mov	1,$i
	.align	4
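# Outer loop, i=1..num-1: tp[] = (tp[] + ap[]*bp[i] + np[]*m1)/2^64; the
# word-level shift is implicit in storing each result word one slot lower.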
.Louter:
	s8addq	$i,$bp,$bi
	ldq	$hi0,0($ap)
	ldq	$aj,8($ap)
	ldq	$bi,0($bi)
	ldq	$hi1,0($np)
	ldq	$nj,8($np)
	ldq	$tj,0(sp)

	mulq	$hi0,$bi,$lo0
	umulh	$hi0,$bi,$hi0
	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0
	mulq	$lo0,$n0,$m1
	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1
	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1
	mulq	$aj,$bi,$alo
	mov	sp,$tp
	umulh	$aj,$bi,$ahi
	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	.align	4
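# Inner loop over j for the current bp[i]; the #U0/#U1/#L0/#L1 comments
# appear to mark intended 21264 issue slots.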
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)	#L0
	nop			#U1
	ldq	$aj,0($aj)	#L1
	s8addq	$j,$np,$nj	#U0
	ldq	$nj,0($nj)	#L0
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)
	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j
	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1
	cmpult	$lo1,$hi1,v0	#U0
	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0
	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj
	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1
	bne	$tj,.Linner	#U0
	.set	reorder

	ldq	$tj,8($tp)
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1
	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0
	ldq	$tj,16($tp)
	addq	$lo1,$lo0,$j
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1
	addq	$hi1,$hi0,$lo1
	stq	$j,0($tp)
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)
	cmplt	$i,$num,$tj	# borrow $tj
	stq	$hi1,16($tp)
	bne	$tj,.Louter
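
# Post-processing: compute rp[] = tp[] - np[] with borrow propagation
# (.Lsub), then conditionally keep either tp or tp-np, whichever is the
# properly reduced result, wiping tp on the way (.Lcopy).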
	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0		# clear borrow bit

	.align	4
.Lsub:	ldq	$lo0,0($tp)
	ldq	$lo1,0($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0
	stq	$lo0,0($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
	mov	sp,$tp
	mov	$bp,$rp		# restore rp

	.align	4
.Lcopy:	ldq	$aj,0($tp)	# conditional copy
	ldq	$nj,0($rp)
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	cmoveq	$hi0,$nj,$aj
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy

	mov	1,v0

.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont

.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

print $code;
close STDOUT or die "error closing STDOUT: $!";