#! /usr/bin/env perl
# Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# X25519 lower-level primitives for PPC64.
#
# July 2018.
#
# Base 2^64 is faster than base 2^51 on pre-POWER8, most notably ~15%
# faster on PPC970/G5. POWER8, on the other hand, seems to trip on its
# own shoelaces when handling longer carry chains. As base 2^51 has
# only single-carry pairs, it's 25% faster there than base 2^64. Since
# PPC970 is pretty old, the base 2^64 implementation is not engaged.
# Comparison to compiler-generated code is complicated by the fact
# that not all compilers support 128-bit integers. When the compiler
# doesn't, like xlc, this module delivers more than a 2x improvement,
# and when it does, improvements from 12% to 30% were measured...
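#
# For reference, in the base 2^51 representation a 255-bit element is
# held in five 51-bit limbs, a = a0 + a1*2^51 + a2*2^102 + a3*2^153 +
# a4*2^204, leaving 13 spare bits per 64-bit register to absorb
# carries. Because the modulus is p = 2^255-19, a carry out of the top
# limb wraps into the bottom one multiplied by 19, as 2^255 = 19 (mod p).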
$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
my $sp = "r1";
my ($rp,$ap,$bp) = map("r$_",3..5);

####################################################### base 2^64
if (0) {
my ($bi,$a0,$a1,$a2,$a3,$t0,$t1, $t2,$t3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) =
    map("r$_",(6..12,22..31));
my $zero = "r0";
my $FRAME = 16*8;

$code.=<<___;
.text

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@function
.align	5
x25519_fe64_mul:
	stdu	$sp,-$FRAME($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$bi,0($bp)
	ld	$a0,0($ap)
	xor	$zero,$zero,$zero
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi
	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi
	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi
	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$t3,$a3,$bi
___
for(my @acc=($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7),
    my $i=1; $i<4; shift(@acc), $i++) {
my $acc4 = $i==1? $zero : @acc[4];

$code.=<<___;
	ld	$bi,`8*$i`($bp)
	addc	@acc[1],@acc[1],$t0	# accumulate high parts
	mulld	$t0,$a0,$bi
	adde	@acc[2],@acc[2],$t1
	mulld	$t1,$a1,$bi
	adde	@acc[3],@acc[3],$t2
	mulld	$t2,$a2,$bi
	adde	@acc[4],$acc4,$t3
	mulld	$t3,$a3,$bi
	addc	@acc[1],@acc[1],$t0	# accumulate low parts
	mulhdu	$t0,$a0,$bi
	adde	@acc[2],@acc[2],$t1
	mulhdu	$t1,$a1,$bi
	adde	@acc[3],@acc[3],$t2
	mulhdu	$t2,$a2,$bi
	adde	@acc[4],@acc[4],$t3
	mulhdu	$t3,$a3,$bi
	adde	@acc[5],$zero,$zero
___
}
$code.=<<___;
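	################################################################
	# Accumulate the remaining high parts, then reduce the 512-bit
	# result acc[0-7] modulo p = 2^255-19 using 2^256 = 38 (mod p):
	# acc[0-3] += 38*acc[4-7]. A carry out of that chain stands for
	# one more 2^256, i.e. a further +38, which the subfe/andc pair
	# below selects in constant time: "subfe rX,rX,rX" evaluates to
	# CA-1, i.e. 0 if the carry was set and -1 otherwise, so andc
	# with 38 keeps the 38 exactly when the carry was set.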
	li	$bi,38
	addc	$acc4,$acc4,$t0
	mulld	$t0,$acc4,$bi
	adde	$acc5,$acc5,$t1
	mulld	$t1,$acc5,$bi
	adde	$acc6,$acc6,$t2
	mulld	$t2,$acc6,$bi
	adde	$acc7,$acc7,$t3
	mulld	$t3,$acc7,$bi
	addc	$acc0,$acc0,$t0
	mulhdu	$t0,$acc4,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$acc5,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$acc6,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$acc7,$bi
	adde	$acc4,$zero,$zero
	addc	$acc1,$acc1,$t0
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3

	mulld	$acc4,$acc4,$bi
	addc	$acc0,$acc0,$acc4
	addze	$acc1,$acc1
	addze	$acc2,$acc2
	addze	$acc3,$acc3
	subfe	$acc4,$acc4,$acc4	# carry -> ~mask
	std	$acc1,8($rp)
	andc	$acc4,$bi,$acc4
	std	$acc2,16($rp)
	add	$acc0,$acc0,$acc4
	std	$acc3,24($rp)
	std	$acc0,0($rp)

	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,3,0
	.long	0
.size	x25519_fe64_mul,.-x25519_fe64_mul

.globl	x25519_fe64_sqr
.type	x25519_fe64_sqr,\@function
.align	5
x25519_fe64_sqr:
	stdu	$sp,-$FRAME($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$a0,0($ap)
	xor	$zero,$zero,$zero
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	# "can't overflow" below marks carries into the high part of a
	# multiplication result; these can't overflow, because the high
	# part of an unsigned 64x64-bit product can never be all ones.
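	#
	# The doubling row in the diagram stems from the identity
	# (a0 + a1*2^64 + a2*2^128 + a3*2^192)^2 =
	#     sum(ai^2 * 2^(128*i)) + 2*sum(ai*aj * 2^(64*(i+j))), i < j,
	# so only the six distinct cross products are computed before
	# the accumulator is doubled and the four squares are added in.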
	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	mulld	$t0,$a2,$a1		# a[2]*a[1]
	mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	mulld	$t2,$a3,$a1		# a[3]*a[1]
	mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	mulhdu	$a0,$a0,$a0
	adde	$acc4,$acc4,$t1
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	adde	$acc5,$acc5,$t2
	mulhdu	$a1,$a1,$a1
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	adde	$acc2,$acc2,$acc2
	mulhdu	$a2,$a2,$a2
	adde	$acc3,$acc3,$acc3
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	adde	$acc4,$acc4,$acc4
	mulhdu	$a3,$a3,$a3
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	addze	$acc7,$zero

	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	li	$bi,38
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3

	mulld	$t0,$acc4,$bi
	mulld	$t1,$acc5,$bi
	mulld	$t2,$acc6,$bi
	mulld	$t3,$acc7,$bi
	addc	$acc0,$acc0,$t0
	mulhdu	$t0,$acc4,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$acc5,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$acc6,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$acc7,$bi
	addze	$acc4,$zero
	addc	$acc1,$acc1,$t0
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3

	mulld	$acc4,$acc4,$bi
	addc	$acc0,$acc0,$acc4
	addze	$acc1,$acc1
	addze	$acc2,$acc2
	addze	$acc3,$acc3
	subfe	$acc4,$acc4,$acc4	# carry -> ~mask
	std	$acc1,8($rp)
	andc	$acc4,$bi,$acc4
	std	$acc2,16($rp)
	add	$acc0,$acc0,$acc4
	std	$acc3,24($rp)
	std	$acc0,0($rp)

	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,2,0
	.long	0
.size	x25519_fe64_sqr,.-x25519_fe64_sqr

.globl	x25519_fe64_mul121666
.type	x25519_fe64_mul121666,\@function
.align	5
x25519_fe64_mul121666:
	lis	$bi,`65536>>16`
	ori	$bi,$bi,`121666-65536`

	ld	$t0,0($ap)
	ld	$t1,8($ap)
	ld	$bp,16($ap)
	ld	$ap,24($ap)

	mulld	$a0,$t0,$bi
	mulhdu	$t0,$t0,$bi
	mulld	$a1,$t1,$bi
	mulhdu	$t1,$t1,$bi
	mulld	$a2,$bp,$bi
	mulhdu	$bp,$bp,$bi
	mulld	$a3,$ap,$bi
	mulhdu	$ap,$ap,$bi

	addc	$a1,$a1,$t0
	adde	$a2,$a2,$t1
	adde	$a3,$a3,$bp
	addze	$ap,$ap

	mulli	$ap,$ap,38
	li	$t0,38			# restored: the andc below needs 38, and t0 is free here

	addc	$a0,$a0,$ap
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3
	subfe	$t1,$t1,$t1		# carry -> ~mask
	std	$a1,8($rp)
	andc	$t0,$t0,$t1
	std	$a2,16($rp)
	add	$a0,$a0,$t0
	std	$a3,24($rp)
	std	$a0,0($rp)
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	x25519_fe64_mul121666,.-x25519_fe64_mul121666

.globl	x25519_fe64_add
.type	x25519_fe64_add,\@function
.align	5
x25519_fe64_add:
	ld	$a0,0($ap)
	ld	$t0,0($bp)
	ld	$a1,8($ap)
	ld	$t1,8($bp)
	ld	$a2,16($ap)
	ld	$bi,16($bp)
	ld	$a3,24($ap)
	ld	$bp,24($bp)
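	################################################################
	# 256-bit addition; a carry out represents 2^256 = 38 (mod p),
	# so 38 is added back conditionally. If that addition carries
	# out again, one more 38 lands in the bottom limb alone, where
	# it can no longer overflow.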
	addc	$a0,$a0,$t0
	adde	$a1,$a1,$t1
	adde	$a2,$a2,$bi
	adde	$a3,$a3,$bp

	li	$t0,38
	subfe	$t1,$t1,$t1		# carry -> ~mask
	andc	$t1,$t0,$t1
	addc	$a0,$a0,$t1
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3
	subfe	$t1,$t1,$t1		# carry -> ~mask
	std	$a1,8($rp)
	andc	$t0,$t0,$t1
	std	$a2,16($rp)
	add	$a0,$a0,$t0
	std	$a3,24($rp)
	std	$a0,0($rp)
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	x25519_fe64_add,.-x25519_fe64_add

.globl	x25519_fe64_sub
.type	x25519_fe64_sub,\@function
.align	5
x25519_fe64_sub:
	ld	$a0,0($ap)
	ld	$t0,0($bp)
	ld	$a1,8($ap)
	ld	$t1,8($bp)
	ld	$a2,16($ap)
	ld	$bi,16($bp)
	ld	$a3,24($ap)
	ld	$bp,24($bp)
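	################################################################
	# 256-bit subtraction; a borrow means the result wrapped by
	# +2^256 = +38 (mod p), so 38 is subtracted back conditionally,
	# and once more should the second subtraction borrow again.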
	subfc	$a0,$t0,$a0
	subfe	$a1,$t1,$a1
	subfe	$a2,$bi,$a2
	subfe	$a3,$bp,$a3

	li	$t0,38
	subfe	$t1,$t1,$t1		# borrow -> mask
	xor	$zero,$zero,$zero
	and	$t1,$t0,$t1
	subfc	$a0,$t1,$a0
	subfe	$a1,$zero,$a1
	subfe	$a2,$zero,$a2
	subfe	$a3,$zero,$a3
	subfe	$t1,$t1,$t1		# borrow -> mask
	std	$a1,8($rp)
	and	$t0,$t0,$t1
	std	$a2,16($rp)
	subf	$a0,$t0,$a0
	std	$a3,24($rp)
	std	$a0,0($rp)
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	x25519_fe64_sub,.-x25519_fe64_sub

.globl	x25519_fe64_tobytes
.type	x25519_fe64_tobytes,\@function
.align	5
x25519_fe64_tobytes:
	ld	$a3,24($ap)
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
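	################################################################
	# Convert to the canonical representative: fold the top bit as
	# 2^255 = 19 (mod p), then add 19 once more so that bit 255 of
	# the sum indicates value >= p = 2^255-19. If it is set, keep
	# the +19 and clear the bit, which amounts to subtracting p;
	# otherwise take the 19 back.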
	sradi	$t0,$a3,63		# most significant bit -> mask
	li	$t1,19
	and	$t0,$t0,$t1
	sldi	$a3,$a3,1
	add	$t0,$t0,$t1		# compare to modulus in the same go
	srdi	$a3,$a3,1		# most significant bit cleared
	addc	$a0,$a0,$t0
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	xor	$zero,$zero,$zero
	sradi	$t0,$a3,63		# most significant bit -> mask
	sldi	$a3,$a3,1
	andc	$t0,$t1,$t0
	srdi	$a3,$a3,1		# most significant bit cleared
	subi	$rp,$rp,1
	subfc	$a0,$t0,$a0
	subfe	$a1,$zero,$a1
	subfe	$a2,$zero,$a2
	subfe	$a3,$zero,$a3
___
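
# Store the four limbs in little-endian byte order regardless of host
# endianness, one byte at a time with update-form stores; $rp was
# pre-decremented above so that each stbu writes the next address.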
for (my @a=($a0,$a1,$a2,$a3), my $i=0; $i<4; shift(@a), $i++) {
$code.=<<___;
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	stbu	$t0,1($rp)
___
}
$code.=<<___;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
___
}

####################################################### base 2^51
{
my ($bi,$a0,$a1,$a2,$a3,$a4,$t0, $t1,
    $h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,$h4lo,$h4hi) =
    map("r$_",(6..12,21..31));
my $mask = "r0";
my $FRAME = 18*8;

$code.=<<___;
.text

.globl	x25519_fe51_mul
.type	x25519_fe51_mul,\@function
.align	5
x25519_fe51_mul:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$bi,0($bp)
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	mulld	$h0lo,$a0,$bi		# a[0]*b[0]
	mulhdu	$h0hi,$a0,$bi
	mulld	$h1lo,$a1,$bi		# a[1]*b[0]
	mulhdu	$h1hi,$a1,$bi
	mulld	$h4lo,$a4,$bi		# a[4]*b[0]
	mulhdu	$h4hi,$a4,$bi
	ld	$ap,8($bp)
	mulli	$a4,$a4,19
	mulld	$h2lo,$a2,$bi		# a[2]*b[0]
	mulhdu	$h2hi,$a2,$bi
	mulld	$h3lo,$a3,$bi		# a[3]*b[0]
	mulhdu	$h3hi,$a3,$bi
___
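
# Multiply in the remaining limbs of b. Each iteration pre-multiplies
# one more limb of a by 19 (products that land above 2^255 wrap into
# the bottom columns scaled by 19) and rotates @a so that the scaled
# limbs line up with the columns they now belong to.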
for(my @a=($a0,$a1,$a2,$a3,$a4),
    my $i=1; $i<4; $i++) {
	($ap,$bi) = ($bi,$ap);
$code.=<<___;
	mulld	$t0,@a[4],$bi
	mulhdu	$t1,@a[4],$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1
	mulld	$t0,@a[0],$bi
	mulhdu	$t1,@a[0],$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1
	mulld	$t0,@a[3],$bi
	mulhdu	$t1,@a[3],$bi
	ld	$ap,`8*($i+1)`($bp)
	mulli	@a[3],@a[3],19
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1
	mulld	$t0,@a[1],$bi
	mulhdu	$t1,@a[1],$bi
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1
	mulld	$t0,@a[2],$bi
	mulhdu	$t1,@a[2],$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1
___
	unshift(@a,pop(@a));
}
	($ap,$bi) = ($bi,$ap);
$code.=<<___;
	mulld	$t0,$a1,$bi
	mulhdu	$t1,$a1,$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1
	mulld	$t0,$a2,$bi
	mulhdu	$t1,$a2,$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1
	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1
	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1
	mulld	$t0,$a0,$bi
	mulhdu	$t1,$a0,$bi
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1

.Lfe51_reduce:
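	################################################################
	# Shared reduction tail: split each 128-bit accumulator h0-h4
	# into a 51-bit limb plus carry, propagate the carries upward,
	# and fold the carry out of h4 back into the bottom limb
	# multiplied by 19, since 2^255 = 19 (mod p).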
	li	$mask,-1
	srdi	$mask,$mask,13		# 0x7ffffffffffff

	srdi	$t0,$h2lo,51
	and	$a2,$h2lo,$mask
	insrdi	$t0,$h2hi,51,0		# h2>>51
	srdi	$t1,$h0lo,51
	and	$a0,$h0lo,$mask
	insrdi	$t1,$h0hi,51,0		# h0>>51
	addc	$h3lo,$h3lo,$t0
	addze	$h3hi,$h3hi
	addc	$h1lo,$h1lo,$t1
	addze	$h1hi,$h1hi

	srdi	$t0,$h3lo,51
	and	$a3,$h3lo,$mask
	insrdi	$t0,$h3hi,51,0		# h3>>51
	srdi	$t1,$h1lo,51
	and	$a1,$h1lo,$mask
	insrdi	$t1,$h1hi,51,0		# h1>>51
	addc	$h4lo,$h4lo,$t0
	addze	$h4hi,$h4hi
	add	$a2,$a2,$t1

	srdi	$t0,$h4lo,51
	and	$a4,$h4lo,$mask
	insrdi	$t0,$h4hi,51,0
	mulli	$t0,$t0,19		# (h4 >> 51) * 19
	add	$a0,$a0,$t0

	srdi	$t1,$a2,51
	and	$a2,$a2,$mask
	add	$a3,$a3,$t1

	srdi	$t0,$a0,51
	and	$a0,$a0,$mask
	add	$a1,$a1,$t0

	std	$a2,16($rp)
	std	$a3,24($rp)
	std	$a4,32($rp)
	std	$a0,0($rp)
	std	$a1,8($rp)

	ld	r21,`$FRAME-8*11`($sp)
	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,11,3,0
	.long	0
.size	x25519_fe51_mul,.-x25519_fe51_mul
___
{
my ($a0,$a1,$a2,$a3,$a4,$t0,$t1) = ($a0,$a1,$a2,$a3,$a4,$t0,$t1);
$code.=<<___;
.globl	x25519_fe51_sqr
.type	x25519_fe51_sqr,\@function
.align	5
x25519_fe51_sqr:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)
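	################################################################
	# Squaring needs only 15 limb products: five squares and ten
	# cross products, the latter taken twice via pre-doubled limbs
	# (and pre-multiplied by 19 where a column wraps past 2^255).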
	add	$bi,$a0,$a0		# a[0]*2
	mulli	$t1,$a4,19		# a[4]*19

	mulld	$h0lo,$a0,$a0
	mulhdu	$h0hi,$a0,$a0
	mulld	$h1lo,$a1,$bi
	mulhdu	$h1hi,$a1,$bi
	mulld	$h2lo,$a2,$bi
	mulhdu	$h2hi,$a2,$bi
	mulld	$h3lo,$a3,$bi
	mulhdu	$h3hi,$a3,$bi
	mulld	$h4lo,$a4,$bi
	mulhdu	$h4hi,$a4,$bi
	add	$bi,$a1,$a1		# a[1]*2
___
	($a4,$t1) = ($t1,$a4);
$code.=<<___;
	mulld	$t0,$t1,$a4
	mulhdu	$t1,$t1,$a4
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1
	mulli	$bp,$a3,19		# a[3]*19
	mulld	$t0,$a1,$a1
	mulhdu	$t1,$a1,$a1
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1
	mulld	$t0,$a2,$bi
	mulhdu	$t1,$a2,$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1
	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1
	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	add	$bi,$a3,$a3		# a[3]*2
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1
___
	($a3,$t1) = ($bp,$a3);
$code.=<<___;
	mulld	$t0,$t1,$a3
	mulhdu	$t1,$t1,$a3
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1
	mulld	$t0,$bi,$a4
	mulhdu	$t1,$bi,$a4
	add	$bi,$a2,$a2		# a[2]*2
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1
	mulld	$t0,$a2,$a2
	mulhdu	$t1,$a2,$a2
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1
	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1
	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1
	b	.Lfe51_reduce
	.long	0
	.byte	0,12,4,0,0x80,11,2,0
	.long	0
.size	x25519_fe51_sqr,.-x25519_fe51_sqr
___
}
$code.=<<___;
.globl	x25519_fe51_mul121666
.type	x25519_fe51_mul121666,\@function
.align	5
x25519_fe51_mul121666:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)
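	# load the 17-bit constant 121666 = 0x1db42: lis sets the 2^16
	# bit, ori supplies the low 16 bits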
	lis	$bi,`65536>>16`
	ori	$bi,$bi,`121666-65536`

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	mulld	$h0lo,$a0,$bi		# a[0]*121666
	mulhdu	$h0hi,$a0,$bi
	mulld	$h1lo,$a1,$bi		# a[1]*121666
	mulhdu	$h1hi,$a1,$bi
	mulld	$h2lo,$a2,$bi		# a[2]*121666
	mulhdu	$h2hi,$a2,$bi
	mulld	$h3lo,$a3,$bi		# a[3]*121666
	mulhdu	$h3hi,$a3,$bi
	mulld	$h4lo,$a4,$bi		# a[4]*121666
	mulhdu	$h4hi,$a4,$bi

	b	.Lfe51_reduce
	.long	0
	.byte	0,12,4,0,0x80,11,2,0
	.long	0
.size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";