ecp_nistz256-ppc64.pl

  1. #! /usr/bin/env perl
  2. # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # ECP_NISTZ256 module for PPC64.
  17. #
  18. # August 2016.
  19. #
  20. # Original ECP_NISTZ256 submission targeting x86_64 is detailed in
  21. # http://eprint.iacr.org/2013/816.
  22. #
  23. #           with/without -DECP_NISTZ256_ASM
  24. # POWER7    +260-530%
  25. # POWER8    +220-340%
  26. $flavour = shift;
  27. while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
  28. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  29. ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  30. ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  31. die "can't locate ppc-xlate.pl";
  32. open OUT,"| \"$^X\" $xlate $flavour $output";
  33. *STDOUT=*OUT;
  34. my $sp="r1";
  35. {
  36. my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3,
  37. $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
  38. map("r$_",(3..12,22..31));
  39. my ($acc6,$acc7)=($bp,$bi); # used in __ecp_nistz256_sqr_mont
  40. $code.=<<___;
  41. .machine "any"
  42. .text
  43. ___
  44. ########################################################################
  45. # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
  46. #
  47. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  48. open TABLE,"<ecp_nistz256_table.c" or
  49. open TABLE,"<${dir}../ecp_nistz256_table.c" or
  50. die "failed to open ecp_nistz256_table.c:",$!;
  51. use integer;
  52. foreach(<TABLE>) {
  53. s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
  54. }
  55. close TABLE;
  56. # See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
  57. # 64*16*37-1 is because $#arr returns the last valid index of @arr,
  58. # not the number of elements.
  59. die "insane number of elements" if ($#arr != 64*16*37-1);
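# (64*16*37 = 37888 32-bit words: presumably 37 windows of 64 points each,
#  every affine point taking 64 bytes, i.e. 16 32-bit words.)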
  60. $code.=<<___;
  61. .type ecp_nistz256_precomputed,\@object
  62. .globl ecp_nistz256_precomputed
  63. .align 12
  64. ecp_nistz256_precomputed:
  65. ___
  66. ########################################################################
  67. # this conversion smashes P256_POINT_AFFINE into individual bytes at
  68. # a 64-byte interval, similar to
  69. # 1111222233334444
  70. # 1234123412341234
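# In other words, output row $i of a sub-table collects byte $i of each of
# its 64 points, so consecutive bytes of any one point land 64 bytes apart,
# which is the layout ecp_nistz_gather_w7 expects.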
  71. for(1..37) {
  72. @tbl = splice(@arr,0,64*16);
  73. for($i=0;$i<64;$i++) {
  74. undef @line;
  75. for($j=0;$j<64;$j++) {
  76. push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
  77. }
  78. $code.=".byte\t";
  79. $code.=join(',',map { sprintf "0x%02x",$_} @line);
  80. $code.="\n";
  81. }
  82. }
  83. $code.=<<___;
  84. .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
  85. .asciz "ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
  86. # void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
  87. # const BN_ULONG x2[4]);
  88. .globl ecp_nistz256_mul_mont
  89. .align 5
  90. ecp_nistz256_mul_mont:
  91. stdu $sp,-128($sp)
  92. mflr r0
  93. std r22,48($sp)
  94. std r23,56($sp)
  95. std r24,64($sp)
  96. std r25,72($sp)
  97. std r26,80($sp)
  98. std r27,88($sp)
  99. std r28,96($sp)
  100. std r29,104($sp)
  101. std r30,112($sp)
  102. std r31,120($sp)
  103. ld $a0,0($ap)
  104. ld $bi,0($bp)
  105. ld $a1,8($ap)
  106. ld $a2,16($ap)
  107. ld $a3,24($ap)
  108. li $poly1,-1
  109. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  110. li $poly3,1
  111. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  112. bl __ecp_nistz256_mul_mont
  113. mtlr r0
  114. ld r22,48($sp)
  115. ld r23,56($sp)
  116. ld r24,64($sp)
  117. ld r25,72($sp)
  118. ld r26,80($sp)
  119. ld r27,88($sp)
  120. ld r28,96($sp)
  121. ld r29,104($sp)
  122. ld r30,112($sp)
  123. ld r31,120($sp)
  124. addi $sp,$sp,128
  125. blr
  126. .long 0
  127. .byte 0,12,4,0,0x80,10,3,0
  128. .long 0
  129. .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
  130. # void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
  131. .globl ecp_nistz256_sqr_mont
  132. .align 4
  133. ecp_nistz256_sqr_mont:
  134. stdu $sp,-128($sp)
  135. mflr r0
  136. std r22,48($sp)
  137. std r23,56($sp)
  138. std r24,64($sp)
  139. std r25,72($sp)
  140. std r26,80($sp)
  141. std r27,88($sp)
  142. std r28,96($sp)
  143. std r29,104($sp)
  144. std r30,112($sp)
  145. std r31,120($sp)
  146. ld $a0,0($ap)
  147. ld $a1,8($ap)
  148. ld $a2,16($ap)
  149. ld $a3,24($ap)
  150. li $poly1,-1
  151. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  152. li $poly3,1
  153. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  154. bl __ecp_nistz256_sqr_mont
  155. mtlr r0
  156. ld r22,48($sp)
  157. ld r23,56($sp)
  158. ld r24,64($sp)
  159. ld r25,72($sp)
  160. ld r26,80($sp)
  161. ld r27,88($sp)
  162. ld r28,96($sp)
  163. ld r29,104($sp)
  164. ld r30,112($sp)
  165. ld r31,120($sp)
  166. addi $sp,$sp,128
  167. blr
  168. .long 0
  169. .byte 0,12,4,0,0x80,10,2,0
  170. .long 0
  171. .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
  172. # void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
  173. # const BN_ULONG x2[4]);
  174. .globl ecp_nistz256_add
  175. .align 4
  176. ecp_nistz256_add:
  177. stdu $sp,-128($sp)
  178. mflr r0
  179. std r28,96($sp)
  180. std r29,104($sp)
  181. std r30,112($sp)
  182. std r31,120($sp)
  183. ld $acc0,0($ap)
  184. ld $t0, 0($bp)
  185. ld $acc1,8($ap)
  186. ld $t1, 8($bp)
  187. ld $acc2,16($ap)
  188. ld $t2, 16($bp)
  189. ld $acc3,24($ap)
  190. ld $t3, 24($bp)
  191. li $poly1,-1
  192. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  193. li $poly3,1
  194. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  195. bl __ecp_nistz256_add
  196. mtlr r0
  197. ld r28,96($sp)
  198. ld r29,104($sp)
  199. ld r30,112($sp)
  200. ld r31,120($sp)
  201. addi $sp,$sp,128
  202. blr
  203. .long 0
  204. .byte 0,12,4,0,0x80,4,3,0
  205. .long 0
  206. .size ecp_nistz256_add,.-ecp_nistz256_add
  207. # void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
  208. .globl ecp_nistz256_div_by_2
  209. .align 4
  210. ecp_nistz256_div_by_2:
  211. stdu $sp,-128($sp)
  212. mflr r0
  213. std r28,96($sp)
  214. std r29,104($sp)
  215. std r30,112($sp)
  216. std r31,120($sp)
  217. ld $acc0,0($ap)
  218. ld $acc1,8($ap)
  219. ld $acc2,16($ap)
  220. ld $acc3,24($ap)
  221. li $poly1,-1
  222. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  223. li $poly3,1
  224. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  225. bl __ecp_nistz256_div_by_2
  226. mtlr r0
  227. ld r28,96($sp)
  228. ld r29,104($sp)
  229. ld r30,112($sp)
  230. ld r31,120($sp)
  231. addi $sp,$sp,128
  232. blr
  233. .long 0
  234. .byte 0,12,4,0,0x80,4,2,0
  235. .long 0
  236. .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
  237. # void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
  238. .globl ecp_nistz256_mul_by_2
  239. .align 4
  240. ecp_nistz256_mul_by_2:
  241. stdu $sp,-128($sp)
  242. mflr r0
  243. std r28,96($sp)
  244. std r29,104($sp)
  245. std r30,112($sp)
  246. std r31,120($sp)
  247. ld $acc0,0($ap)
  248. ld $acc1,8($ap)
  249. ld $acc2,16($ap)
  250. ld $acc3,24($ap)
  251. mr $t0,$acc0
  252. mr $t1,$acc1
  253. mr $t2,$acc2
  254. mr $t3,$acc3
  255. li $poly1,-1
  256. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  257. li $poly3,1
  258. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  259. bl __ecp_nistz256_add # ret = a+a // 2*a
  260. mtlr r0
  261. ld r28,96($sp)
  262. ld r29,104($sp)
  263. ld r30,112($sp)
  264. ld r31,120($sp)
  265. addi $sp,$sp,128
  266. blr
  267. .long 0
  268. .byte 0,12,4,0,0x80,4,3,0
  269. .long 0
  270. .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
  271. # void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
  272. .globl ecp_nistz256_mul_by_3
  273. .align 4
  274. ecp_nistz256_mul_by_3:
  275. stdu $sp,-128($sp)
  276. mflr r0
  277. std r28,96($sp)
  278. std r29,104($sp)
  279. std r30,112($sp)
  280. std r31,120($sp)
  281. ld $acc0,0($ap)
  282. ld $acc1,8($ap)
  283. ld $acc2,16($ap)
  284. ld $acc3,24($ap)
  285. mr $t0,$acc0
  286. std $acc0,64($sp)
  287. mr $t1,$acc1
  288. std $acc1,72($sp)
  289. mr $t2,$acc2
  290. std $acc2,80($sp)
  291. mr $t3,$acc3
  292. std $acc3,88($sp)
  293. li $poly1,-1
  294. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  295. li $poly3,1
  296. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  297. bl __ecp_nistz256_add # ret = a+a // 2*a
  298. ld $t0,64($sp)
  299. ld $t1,72($sp)
  300. ld $t2,80($sp)
  301. ld $t3,88($sp)
  302. bl __ecp_nistz256_add # ret += a // 2*a+a=3*a
  303. mtlr r0
  304. ld r28,96($sp)
  305. ld r29,104($sp)
  306. ld r30,112($sp)
  307. ld r31,120($sp)
  308. addi $sp,$sp,128
  309. blr
  310. .long 0
  311. .byte 0,12,4,0,0x80,4,2,0
  312. .long 0
  313. .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
  314. # void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
  315. # const BN_ULONG x2[4]);
  316. .globl ecp_nistz256_sub
  317. .align 4
  318. ecp_nistz256_sub:
  319. stdu $sp,-128($sp)
  320. mflr r0
  321. std r28,96($sp)
  322. std r29,104($sp)
  323. std r30,112($sp)
  324. std r31,120($sp)
  325. ld $acc0,0($ap)
  326. ld $acc1,8($ap)
  327. ld $acc2,16($ap)
  328. ld $acc3,24($ap)
  329. li $poly1,-1
  330. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  331. li $poly3,1
  332. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  333. bl __ecp_nistz256_sub_from
  334. mtlr r0
  335. ld r28,96($sp)
  336. ld r29,104($sp)
  337. ld r30,112($sp)
  338. ld r31,120($sp)
  339. addi $sp,$sp,128
  340. blr
  341. .long 0
  342. .byte 0,12,4,0,0x80,4,3,0
  343. .long 0
  344. .size ecp_nistz256_sub,.-ecp_nistz256_sub
  345. # void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
  346. .globl ecp_nistz256_neg
  347. .align 4
  348. ecp_nistz256_neg:
  349. stdu $sp,-128($sp)
  350. mflr r0
  351. std r28,96($sp)
  352. std r29,104($sp)
  353. std r30,112($sp)
  354. std r31,120($sp)
  355. mr $bp,$ap
  356. li $acc0,0
  357. li $acc1,0
  358. li $acc2,0
  359. li $acc3,0
  360. li $poly1,-1
  361. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  362. li $poly3,1
  363. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  364. bl __ecp_nistz256_sub_from
  365. mtlr r0
  366. ld r28,96($sp)
  367. ld r29,104($sp)
  368. ld r30,112($sp)
  369. ld r31,120($sp)
  370. addi $sp,$sp,128
  371. blr
  372. .long 0
  373. .byte 0,12,4,0,0x80,4,2,0
  374. .long 0
  375. .size ecp_nistz256_neg,.-ecp_nistz256_neg
  376. # note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
  377. # to $a0-$a3 and b[0] to $bi
  378. .type __ecp_nistz256_mul_mont,\@function
  379. .align 4
  380. __ecp_nistz256_mul_mont:
  381. mulld $acc0,$a0,$bi # a[0]*b[0]
  382. mulhdu $t0,$a0,$bi
  383. mulld $acc1,$a1,$bi # a[1]*b[0]
  384. mulhdu $t1,$a1,$bi
  385. mulld $acc2,$a2,$bi # a[2]*b[0]
  386. mulhdu $t2,$a2,$bi
  387. mulld $acc3,$a3,$bi # a[3]*b[0]
  388. mulhdu $t3,$a3,$bi
  389. ld $bi,8($bp) # b[1]
  390. addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
  391. sldi $t0,$acc0,32
  392. adde $acc2,$acc2,$t1
  393. srdi $t1,$acc0,32
  394. adde $acc3,$acc3,$t2
  395. addze $acc4,$t3
  396. li $acc5,0
  397. ___
  398. for($i=1;$i<4;$i++) {
  399. ################################################################
  400. # A reduction iteration is normally performed by accumulating the
  401. # result of multiplying the modulus by a "magic" digit [and
  402. # omitting the least significant word, which is guaranteed to
  403. # be 0], but thanks to the special form of the modulus, and to the
  404. # "magic" digit being equal to the least significant word, it can
  405. # be performed with additions and subtractions alone. Indeed:
  406. #
  407. # ffff0001.00000000.0000ffff.ffffffff
  408. # * abcdefgh
  409. # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
  410. #
  411. # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
  412. # rewrite above as:
  413. #
  414. # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
  415. # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
  416. # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
  417. #
  418. # or marking redundant operations:
  419. #
  420. # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
  421. # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
  422. # - 0000abcd.efgh0000.--------.--------.--------
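# Mapping that onto the code below (a sketch): with $t0 = acc[0]<<32 and
# $t1 = acc[0]>>32, the 128-bit pair {$t3,$t2} = {acc[0],acc[0]} - {$t1,$t0}
# is exactly acc[0]*0xffffffff00000001; the addition chain then drops
# acc[0], folds in acc[0]<<96 via $t0/$t1, and adds that product into the
# two top limbs.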
  423. $code.=<<___;
  424. subfc $t2,$t0,$acc0 # "*0xffff0001"
  425. subfe $t3,$t1,$acc0
  426. addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
  427. adde $acc1,$acc2,$t1
  428. adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
  429. adde $acc3,$acc4,$t3
  430. addze $acc4,$acc5
  431. mulld $t0,$a0,$bi # lo(a[0]*b[i])
  432. mulld $t1,$a1,$bi # lo(a[1]*b[i])
  433. mulld $t2,$a2,$bi # lo(a[2]*b[i])
  434. mulld $t3,$a3,$bi # lo(a[3]*b[i])
  435. addc $acc0,$acc0,$t0 # accumulate low parts of multiplication
  436. mulhdu $t0,$a0,$bi # hi(a[0]*b[i])
  437. adde $acc1,$acc1,$t1
  438. mulhdu $t1,$a1,$bi # hi(a[1]*b[i])
  439. adde $acc2,$acc2,$t2
  440. mulhdu $t2,$a2,$bi # hi(a[2]*b[i])
  441. adde $acc3,$acc3,$t3
  442. mulhdu $t3,$a3,$bi # hi(a[3]*b[i])
  443. addze $acc4,$acc4
  444. ___
  445. $code.=<<___ if ($i<3);
  446. ld $bi,8*($i+1)($bp) # b[$i+1]
  447. ___
  448. $code.=<<___;
  449. addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
  450. sldi $t0,$acc0,32
  451. adde $acc2,$acc2,$t1
  452. srdi $t1,$acc0,32
  453. adde $acc3,$acc3,$t2
  454. adde $acc4,$acc4,$t3
  455. li $acc5,0
  456. addze $acc5,$acc5
  457. ___
  458. }
  459. $code.=<<___;
  460. # last reduction
  461. subfc $t2,$t0,$acc0 # "*0xffff0001"
  462. subfe $t3,$t1,$acc0
  463. addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
  464. adde $acc1,$acc2,$t1
  465. adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
  466. adde $acc3,$acc4,$t3
  467. addze $acc4,$acc5
  468. li $t2,0
  469. addic $acc0,$acc0,1 # ret -= modulus
  470. subfe $acc1,$poly1,$acc1
  471. subfe $acc2,$t2,$acc2
  472. subfe $acc3,$poly3,$acc3
  473. subfe $acc4,$t2,$acc4
  474. addc $acc0,$acc0,$acc4 # ret += modulus if borrow
  475. and $t1,$poly1,$acc4
  476. and $t3,$poly3,$acc4
  477. adde $acc1,$acc1,$t1
  478. addze $acc2,$acc2
  479. adde $acc3,$acc3,$t3
  480. std $acc0,0($rp)
  481. std $acc1,8($rp)
  482. std $acc2,16($rp)
  483. std $acc3,24($rp)
  484. blr
  485. .long 0
  486. .byte 0,12,0x14,0,0,0,1,0
  487. .long 0
  488. .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
  489. # note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
  490. # to $a0-$a3
  491. .type __ecp_nistz256_sqr_mont,\@function
  492. .align 4
  493. __ecp_nistz256_sqr_mont:
  494. ################################################################
  495. #  |  |  |  |  |  |a1*a0|  |
  496. #  |  |  |  |  |a2*a0|  |  |
  497. #  |  |a3*a2|a3*a0|  |  |  |
  498. #  |  |  |  |a2*a1|  |  |  |
  499. #  |  |  |a3*a1|  |  |  |  |
  500. # *|  |  |  |  |  |  |  | 2|
  501. # +|a3*a3|a2*a2|a1*a1|a0*a0|
  502. #  |--+--+--+--+--+--+--+--|
  503. #  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
  504. #
  505. # "can't overflow" below mark carrying into high part of
  506. # multiplication result, which can't overflow, because it
  507. # can never be all ones.
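# Put differently, the schoolbook squaring below computes
#   a^2 = 2 * sum_{i>j} a[i]*a[j]  +  sum_i a[i]^2
# gathering every cross product once, doubling the whole partial sum, and
# only then folding in the squares on the diagonal.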
  508. mulld $acc1,$a1,$a0 # a[1]*a[0]
  509. mulhdu $t1,$a1,$a0
  510. mulld $acc2,$a2,$a0 # a[2]*a[0]
  511. mulhdu $t2,$a2,$a0
  512. mulld $acc3,$a3,$a0 # a[3]*a[0]
  513. mulhdu $acc4,$a3,$a0
  514. addc $acc2,$acc2,$t1 # accumulate high parts of multiplication
  515. mulld $t0,$a2,$a1 # a[2]*a[1]
  516. mulhdu $t1,$a2,$a1
  517. adde $acc3,$acc3,$t2
  518. mulld $t2,$a3,$a1 # a[3]*a[1]
  519. mulhdu $t3,$a3,$a1
  520. addze $acc4,$acc4 # can't overflow
  521. mulld $acc5,$a3,$a2 # a[3]*a[2]
  522. mulhdu $acc6,$a3,$a2
  523. addc $t1,$t1,$t2 # accumulate high parts of multiplication
  524. addze $t2,$t3 # can't overflow
  525. addc $acc3,$acc3,$t0 # accumulate low parts of multiplication
  526. adde $acc4,$acc4,$t1
  527. adde $acc5,$acc5,$t2
  528. addze $acc6,$acc6 # can't overflow
  529. addc $acc1,$acc1,$acc1 # acc[1-6]*=2
  530. adde $acc2,$acc2,$acc2
  531. adde $acc3,$acc3,$acc3
  532. adde $acc4,$acc4,$acc4
  533. adde $acc5,$acc5,$acc5
  534. adde $acc6,$acc6,$acc6
  535. li $acc7,0
  536. addze $acc7,$acc7
  537. mulld $acc0,$a0,$a0 # a[0]*a[0]
  538. mulhdu $a0,$a0,$a0
  539. mulld $t1,$a1,$a1 # a[1]*a[1]
  540. mulhdu $a1,$a1,$a1
  541. mulld $t2,$a2,$a2 # a[2]*a[2]
  542. mulhdu $a2,$a2,$a2
  543. mulld $t3,$a3,$a3 # a[3]*a[3]
  544. mulhdu $a3,$a3,$a3
  545. addc $acc1,$acc1,$a0 # +a[i]*a[i]
  546. sldi $t0,$acc0,32
  547. adde $acc2,$acc2,$t1
  548. srdi $t1,$acc0,32
  549. adde $acc3,$acc3,$a1
  550. adde $acc4,$acc4,$t2
  551. adde $acc5,$acc5,$a2
  552. adde $acc6,$acc6,$t3
  553. adde $acc7,$acc7,$a3
  554. ___
  555. for($i=0;$i<3;$i++) { # reductions, see commentary in
  556. # multiplication for details
  557. $code.=<<___;
  558. subfc $t2,$t0,$acc0 # "*0xffff0001"
  559. subfe $t3,$t1,$acc0
  560. addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
  561. sldi $t0,$acc0,32
  562. adde $acc1,$acc2,$t1
  563. srdi $t1,$acc0,32
  564. adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
  565. addze $acc3,$t3 # can't overflow
  566. ___
  567. }
  568. $code.=<<___;
  569. subfc $t2,$t0,$acc0 # "*0xffff0001"
  570. subfe $t3,$t1,$acc0
  571. addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
  572. adde $acc1,$acc2,$t1
  573. adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
  574. addze $acc3,$t3 # can't overflow
  575. addc $acc0,$acc0,$acc4 # accumulate upper half
  576. adde $acc1,$acc1,$acc5
  577. adde $acc2,$acc2,$acc6
  578. adde $acc3,$acc3,$acc7
  579. li $t2,0
  580. addze $acc4,$t2
  581. addic $acc0,$acc0,1 # ret -= modulus
  582. subfe $acc1,$poly1,$acc1
  583. subfe $acc2,$t2,$acc2
  584. subfe $acc3,$poly3,$acc3
  585. subfe $acc4,$t2,$acc4
  586. addc $acc0,$acc0,$acc4 # ret += modulus if borrow
  587. and $t1,$poly1,$acc4
  588. and $t3,$poly3,$acc4
  589. adde $acc1,$acc1,$t1
  590. addze $acc2,$acc2
  591. adde $acc3,$acc3,$t3
  592. std $acc0,0($rp)
  593. std $acc1,8($rp)
  594. std $acc2,16($rp)
  595. std $acc3,24($rp)
  596. blr
  597. .long 0
  598. .byte 0,12,0x14,0,0,0,1,0
  599. .long 0
  600. .size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
  601. # Note that __ecp_nistz256_add expects both input vectors pre-loaded
  602. # to $acc0-$acc3 and $t0-$t3. This is done because it's used in
  603. # multiple contexts, e.g. in multiplication by 2 and 3...
  604. .type __ecp_nistz256_add,\@function
  605. .align 4
  606. __ecp_nistz256_add:
  607. addc $acc0,$acc0,$t0 # ret = a+b
  608. adde $acc1,$acc1,$t1
  609. adde $acc2,$acc2,$t2
  610. li $t2,0
  611. adde $acc3,$acc3,$t3
  612. addze $t0,$t2
  613. # if a+b >= modulus, subtract modulus
  614. #
  615. # But since comparison implies subtraction, we subtract
  616. # modulus and then add it back if subtraction borrowed.
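# Only two AND masks are needed for the add-back: the modulus limbs are,
# from least to most significant, all-ones, $poly1, zero and $poly3, so
# the all-ones limb is simply $t0 itself and the zero limb needs nothing.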
  617. subic $acc0,$acc0,-1
  618. subfe $acc1,$poly1,$acc1
  619. subfe $acc2,$t2,$acc2
  620. subfe $acc3,$poly3,$acc3
  621. subfe $t0,$t2,$t0
  622. addc $acc0,$acc0,$t0
  623. and $t1,$poly1,$t0
  624. and $t3,$poly3,$t0
  625. adde $acc1,$acc1,$t1
  626. addze $acc2,$acc2
  627. adde $acc3,$acc3,$t3
  628. std $acc0,0($rp)
  629. std $acc1,8($rp)
  630. std $acc2,16($rp)
  631. std $acc3,24($rp)
  632. blr
  633. .long 0
  634. .byte 0,12,0x14,0,0,0,3,0
  635. .long 0
  636. .size __ecp_nistz256_add,.-__ecp_nistz256_add
  637. .type __ecp_nistz256_sub_from,\@function
  638. .align 4
  639. __ecp_nistz256_sub_from:
  640. ld $t0,0($bp)
  641. ld $t1,8($bp)
  642. ld $t2,16($bp)
  643. ld $t3,24($bp)
  644. subfc $acc0,$t0,$acc0 # ret = a-b
  645. subfe $acc1,$t1,$acc1
  646. subfe $acc2,$t2,$acc2
  647. subfe $acc3,$t3,$acc3
  648. subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0
  649. # if a-b borrowed, add modulus
  650. addc $acc0,$acc0,$t0 # ret += modulus & t0
  651. and $t1,$poly1,$t0
  652. and $t3,$poly3,$t0
  653. adde $acc1,$acc1,$t1
  654. addze $acc2,$acc2
  655. adde $acc3,$acc3,$t3
  656. std $acc0,0($rp)
  657. std $acc1,8($rp)
  658. std $acc2,16($rp)
  659. std $acc3,24($rp)
  660. blr
  661. .long 0
  662. .byte 0,12,0x14,0,0,0,3,0
  663. .long 0
  664. .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
  665. .type __ecp_nistz256_sub_morf,\@function
  666. .align 4
  667. __ecp_nistz256_sub_morf:
  668. ld $t0,0($bp)
  669. ld $t1,8($bp)
  670. ld $t2,16($bp)
  671. ld $t3,24($bp)
  672. subfc $acc0,$acc0,$t0 # ret = b-a
  673. subfe $acc1,$acc1,$t1
  674. subfe $acc2,$acc2,$t2
  675. subfe $acc3,$acc3,$t3
  676. subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0
  677. # if b-a borrowed, add modulus
  678. addc $acc0,$acc0,$t0 # ret += modulus & t0
  679. and $t1,$poly1,$t0
  680. and $t3,$poly3,$t0
  681. adde $acc1,$acc1,$t1
  682. addze $acc2,$acc2
  683. adde $acc3,$acc3,$t3
  684. std $acc0,0($rp)
  685. std $acc1,8($rp)
  686. std $acc2,16($rp)
  687. std $acc3,24($rp)
  688. blr
  689. .long 0
  690. .byte 0,12,0x14,0,0,0,3,0
  691. .long 0
  692. .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
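# __ecp_nistz256_div_by_2 halves its input modulo the (odd) modulus: it
# unconditionally adds the modulus, subtracts it again if the input was
# even, and then shifts the up-to-257-bit sum right by one bit, pulling
# the carry bit in at the top.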
  693. .type __ecp_nistz256_div_by_2,\@function
  694. .align 4
  695. __ecp_nistz256_div_by_2:
  696. andi. $t0,$acc0,1
  697. addic $acc0,$acc0,-1 # a += modulus
  698. neg $t0,$t0
  699. adde $acc1,$acc1,$poly1
  700. not $t0,$t0
  701. addze $acc2,$acc2
  702. li $t2,0
  703. adde $acc3,$acc3,$poly3
  704. and $t1,$poly1,$t0
  705. addze $ap,$t2 # ap = carry
  706. and $t3,$poly3,$t0
  707. subfc $acc0,$t0,$acc0 # a -= modulus if a was even
  708. subfe $acc1,$t1,$acc1
  709. subfe $acc2,$t2,$acc2
  710. subfe $acc3,$t3,$acc3
  711. subfe $ap, $t2,$ap
  712. srdi $acc0,$acc0,1
  713. sldi $t0,$acc1,63
  714. srdi $acc1,$acc1,1
  715. sldi $t1,$acc2,63
  716. srdi $acc2,$acc2,1
  717. sldi $t2,$acc3,63
  718. srdi $acc3,$acc3,1
  719. sldi $t3,$ap,63
  720. or $acc0,$acc0,$t0
  721. or $acc1,$acc1,$t1
  722. or $acc2,$acc2,$t2
  723. or $acc3,$acc3,$t3
  724. std $acc0,0($rp)
  725. std $acc1,8($rp)
  726. std $acc2,16($rp)
  727. std $acc3,24($rp)
  728. blr
  729. .long 0
  730. .byte 0,12,0x14,0,0,0,1,0
  731. .long 0
  732. .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
  733. ___
  734. ########################################################################
  735. # the following subroutines are "literal" implementations of those
  736. # found in ecp_nistz256.c
  737. #
  738. ########################################################################
  739. # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
  740. #
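# In terms of the p256_* calls annotated below this is the usual Jacobian
# doubling: with S = 4*X*Y^2 and M = 3*(X - Z^2)*(X + Z^2),
#   X' = M^2 - 2*S,  Y' = M*(S - X') - 8*Y^4,  Z' = 2*Y*Z.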
  741. if (1) {
  742. my $FRAME=64+32*4+12*8;
  743. my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
  744. # above map() describes stack layout with 4 temporary
  745. # 256-bit vectors on top.
  746. my ($rp_real,$ap_real) = map("r$_",(20,21));
  747. $code.=<<___;
  748. .globl ecp_nistz256_point_double
  749. .align 5
  750. ecp_nistz256_point_double:
  751. stdu $sp,-$FRAME($sp)
  752. mflr r0
  753. std r20,$FRAME-8*12($sp)
  754. std r21,$FRAME-8*11($sp)
  755. std r22,$FRAME-8*10($sp)
  756. std r23,$FRAME-8*9($sp)
  757. std r24,$FRAME-8*8($sp)
  758. std r25,$FRAME-8*7($sp)
  759. std r26,$FRAME-8*6($sp)
  760. std r27,$FRAME-8*5($sp)
  761. std r28,$FRAME-8*4($sp)
  762. std r29,$FRAME-8*3($sp)
  763. std r30,$FRAME-8*2($sp)
  764. std r31,$FRAME-8*1($sp)
  765. li $poly1,-1
  766. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  767. li $poly3,1
  768. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  769. .Ldouble_shortcut:
  770. ld $acc0,32($ap)
  771. ld $acc1,40($ap)
  772. ld $acc2,48($ap)
  773. ld $acc3,56($ap)
  774. mr $t0,$acc0
  775. mr $t1,$acc1
  776. mr $t2,$acc2
  777. mr $t3,$acc3
  778. ld $a0,64($ap) # forward load for p256_sqr_mont
  779. ld $a1,72($ap)
  780. ld $a2,80($ap)
  781. ld $a3,88($ap)
  782. mr $rp_real,$rp
  783. mr $ap_real,$ap
  784. addi $rp,$sp,$S
  785. bl __ecp_nistz256_add # p256_mul_by_2(S, in_y);
  786. addi $rp,$sp,$Zsqr
  787. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Zsqr, in_z);
  788. ld $t0,0($ap_real)
  789. ld $t1,8($ap_real)
  790. ld $t2,16($ap_real)
  791. ld $t3,24($ap_real)
  792. mr $a0,$acc0 # put Zsqr aside for p256_sub
  793. mr $a1,$acc1
  794. mr $a2,$acc2
  795. mr $a3,$acc3
  796. addi $rp,$sp,$M
  797. bl __ecp_nistz256_add # p256_add(M, Zsqr, in_x);
  798. addi $bp,$ap_real,0
  799. mr $acc0,$a0 # restore Zsqr
  800. mr $acc1,$a1
  801. mr $acc2,$a2
  802. mr $acc3,$a3
  803. ld $a0,$S+0($sp) # forward load for p256_sqr_mont
  804. ld $a1,$S+8($sp)
  805. ld $a2,$S+16($sp)
  806. ld $a3,$S+24($sp)
  807. addi $rp,$sp,$Zsqr
  808. bl __ecp_nistz256_sub_morf # p256_sub(Zsqr, in_x, Zsqr);
  809. addi $rp,$sp,$S
  810. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(S, S);
  811. ld $bi,32($ap_real)
  812. ld $a0,64($ap_real)
  813. ld $a1,72($ap_real)
  814. ld $a2,80($ap_real)
  815. ld $a3,88($ap_real)
  816. addi $bp,$ap_real,32
  817. addi $rp,$sp,$tmp0
  818. bl __ecp_nistz256_mul_mont # p256_mul_mont(tmp0, in_z, in_y);
  819. mr $t0,$acc0
  820. mr $t1,$acc1
  821. mr $t2,$acc2
  822. mr $t3,$acc3
  823. ld $a0,$S+0($sp) # forward load for p256_sqr_mont
  824. ld $a1,$S+8($sp)
  825. ld $a2,$S+16($sp)
  826. ld $a3,$S+24($sp)
  827. addi $rp,$rp_real,64
  828. bl __ecp_nistz256_add # p256_mul_by_2(res_z, tmp0);
  829. addi $rp,$sp,$tmp0
  830. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(tmp0, S);
  831. ld $bi,$Zsqr($sp) # forward load for p256_mul_mont
  832. ld $a0,$M+0($sp)
  833. ld $a1,$M+8($sp)
  834. ld $a2,$M+16($sp)
  835. ld $a3,$M+24($sp)
  836. addi $rp,$rp_real,32
  837. bl __ecp_nistz256_div_by_2 # p256_div_by_2(res_y, tmp0);
  838. addi $bp,$sp,$Zsqr
  839. addi $rp,$sp,$M
  840. bl __ecp_nistz256_mul_mont # p256_mul_mont(M, M, Zsqr);
  841. mr $t0,$acc0 # duplicate M
  842. mr $t1,$acc1
  843. mr $t2,$acc2
  844. mr $t3,$acc3
  845. mr $a0,$acc0 # put M aside
  846. mr $a1,$acc1
  847. mr $a2,$acc2
  848. mr $a3,$acc3
  849. addi $rp,$sp,$M
  850. bl __ecp_nistz256_add
  851. mr $t0,$a0 # restore M
  852. mr $t1,$a1
  853. mr $t2,$a2
  854. mr $t3,$a3
  855. ld $bi,0($ap_real) # forward load for p256_mul_mont
  856. ld $a0,$S+0($sp)
  857. ld $a1,$S+8($sp)
  858. ld $a2,$S+16($sp)
  859. ld $a3,$S+24($sp)
  860. bl __ecp_nistz256_add # p256_mul_by_3(M, M);
  861. addi $bp,$ap_real,0
  862. addi $rp,$sp,$S
  863. bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, in_x);
  864. mr $t0,$acc0
  865. mr $t1,$acc1
  866. mr $t2,$acc2
  867. mr $t3,$acc3
  868. ld $a0,$M+0($sp) # forward load for p256_sqr_mont
  869. ld $a1,$M+8($sp)
  870. ld $a2,$M+16($sp)
  871. ld $a3,$M+24($sp)
  872. addi $rp,$sp,$tmp0
  873. bl __ecp_nistz256_add # p256_mul_by_2(tmp0, S);
  874. addi $rp,$rp_real,0
  875. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(res_x, M);
  876. addi $bp,$sp,$tmp0
  877. bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, tmp0);
  878. addi $bp,$sp,$S
  879. addi $rp,$sp,$S
  880. bl __ecp_nistz256_sub_morf # p256_sub(S, S, res_x);
  881. ld $bi,$M($sp)
  882. mr $a0,$acc0 # copy S
  883. mr $a1,$acc1
  884. mr $a2,$acc2
  885. mr $a3,$acc3
  886. addi $bp,$sp,$M
  887. bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, M);
  888. addi $bp,$rp_real,32
  889. addi $rp,$rp_real,32
  890. bl __ecp_nistz256_sub_from # p256_sub(res_y, S, res_y);
  891. mtlr r0
  892. ld r20,$FRAME-8*12($sp)
  893. ld r21,$FRAME-8*11($sp)
  894. ld r22,$FRAME-8*10($sp)
  895. ld r23,$FRAME-8*9($sp)
  896. ld r24,$FRAME-8*8($sp)
  897. ld r25,$FRAME-8*7($sp)
  898. ld r26,$FRAME-8*6($sp)
  899. ld r27,$FRAME-8*5($sp)
  900. ld r28,$FRAME-8*4($sp)
  901. ld r29,$FRAME-8*3($sp)
  902. ld r30,$FRAME-8*2($sp)
  903. ld r31,$FRAME-8*1($sp)
  904. addi $sp,$sp,$FRAME
  905. blr
  906. .long 0
  907. .byte 0,12,4,0,0x80,12,2,0
  908. .long 0
  909. .size ecp_nistz256_point_double,.-ecp_nistz256_point_double
  910. ___
  911. }
  912. ########################################################################
  913. # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
  914. # const P256_POINT *in2);
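# The flow below follows the textbook Jacobian addition spelled out by the
# p256_* comments: U1 = X1*Z2^2, U2 = X2*Z1^2, S1 = Y1*Z2^3, S2 = Y2*Z1^3,
# H = U2 - U1, R = S2 - S1, and then
#   X3 = R^2 - H^3 - 2*U1*H^2,
#   Y3 = R*(U1*H^2 - X3) - S1*H^3,
#   Z3 = H*Z1*Z2,
# with special cases when the inputs are equal or at infinity.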
  915. if (1) {
  916. my $FRAME = 64 + 32*12 + 16*8;
  917. my ($res_x,$res_y,$res_z,
  918. $H,$Hsqr,$R,$Rsqr,$Hcub,
  919. $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
  920. my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
  921. # above map() describes stack layout with 12 temporary
  922. # 256-bit vectors on top.
  923. my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
  924. $code.=<<___;
  925. .globl ecp_nistz256_point_add
  926. .align 5
  927. ecp_nistz256_point_add:
  928. stdu $sp,-$FRAME($sp)
  929. mflr r0
  930. std r16,$FRAME-8*16($sp)
  931. std r17,$FRAME-8*15($sp)
  932. std r18,$FRAME-8*14($sp)
  933. std r19,$FRAME-8*13($sp)
  934. std r20,$FRAME-8*12($sp)
  935. std r21,$FRAME-8*11($sp)
  936. std r22,$FRAME-8*10($sp)
  937. std r23,$FRAME-8*9($sp)
  938. std r24,$FRAME-8*8($sp)
  939. std r25,$FRAME-8*7($sp)
  940. std r26,$FRAME-8*6($sp)
  941. std r27,$FRAME-8*5($sp)
  942. std r28,$FRAME-8*4($sp)
  943. std r29,$FRAME-8*3($sp)
  944. std r30,$FRAME-8*2($sp)
  945. std r31,$FRAME-8*1($sp)
  946. li $poly1,-1
  947. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  948. li $poly3,1
  949. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  950. ld $a0,64($bp) # in2_z
  951. ld $a1,72($bp)
  952. ld $a2,80($bp)
  953. ld $a3,88($bp)
  954. mr $rp_real,$rp
  955. mr $ap_real,$ap
  956. mr $bp_real,$bp
  957. or $t0,$a0,$a1
  958. or $t2,$a2,$a3
  959. or $in2infty,$t0,$t2
  960. neg $t0,$in2infty
  961. or $in2infty,$in2infty,$t0
  962. sradi $in2infty,$in2infty,63 # !in2infty
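# (x | -x) has its sign bit set exactly when x is non-zero, so the
# arithmetic shift leaves an all-ones mask when in2_z != 0 (a finite
# point) and zero for the point at infinity; the mask feeds the
# constant-time selects at the end of the function.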
  963. addi $rp,$sp,$Z2sqr
  964. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z2sqr, in2_z);
  965. ld $a0,64($ap_real) # in1_z
  966. ld $a1,72($ap_real)
  967. ld $a2,80($ap_real)
  968. ld $a3,88($ap_real)
  969. or $t0,$a0,$a1
  970. or $t2,$a2,$a3
  971. or $in1infty,$t0,$t2
  972. neg $t0,$in1infty
  973. or $in1infty,$in1infty,$t0
  974. sradi $in1infty,$in1infty,63 # !in1infty
  975. addi $rp,$sp,$Z1sqr
  976. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z);
  977. ld $bi,64($bp_real)
  978. ld $a0,$Z2sqr+0($sp)
  979. ld $a1,$Z2sqr+8($sp)
  980. ld $a2,$Z2sqr+16($sp)
  981. ld $a3,$Z2sqr+24($sp)
  982. addi $bp,$bp_real,64
  983. addi $rp,$sp,$S1
  984. bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, Z2sqr, in2_z);
  985. ld $bi,64($ap_real)
  986. ld $a0,$Z1sqr+0($sp)
  987. ld $a1,$Z1sqr+8($sp)
  988. ld $a2,$Z1sqr+16($sp)
  989. ld $a3,$Z1sqr+24($sp)
  990. addi $bp,$ap_real,64
  991. addi $rp,$sp,$S2
  992. bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z);
  993. ld $bi,32($ap_real)
  994. ld $a0,$S1+0($sp)
  995. ld $a1,$S1+8($sp)
  996. ld $a2,$S1+16($sp)
  997. ld $a3,$S1+24($sp)
  998. addi $bp,$ap_real,32
  999. addi $rp,$sp,$S1
  1000. bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, S1, in1_y);
  1001. ld $bi,32($bp_real)
  1002. ld $a0,$S2+0($sp)
  1003. ld $a1,$S2+8($sp)
  1004. ld $a2,$S2+16($sp)
  1005. ld $a3,$S2+24($sp)
  1006. addi $bp,$bp_real,32
  1007. addi $rp,$sp,$S2
  1008. bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y);
  1009. addi $bp,$sp,$S1
  1010. ld $bi,$Z2sqr($sp) # forward load for p256_mul_mont
  1011. ld $a0,0($ap_real)
  1012. ld $a1,8($ap_real)
  1013. ld $a2,16($ap_real)
  1014. ld $a3,24($ap_real)
  1015. addi $rp,$sp,$R
  1016. bl __ecp_nistz256_sub_from # p256_sub(R, S2, S1);
  1017. or $acc0,$acc0,$acc1 # see if result is zero
  1018. or $acc2,$acc2,$acc3
  1019. or $temp,$acc0,$acc2
  1020. addi $bp,$sp,$Z2sqr
  1021. addi $rp,$sp,$U1
  1022. bl __ecp_nistz256_mul_mont # p256_mul_mont(U1, in1_x, Z2sqr);
  1023. ld $bi,$Z1sqr($sp)
  1024. ld $a0,0($bp_real)
  1025. ld $a1,8($bp_real)
  1026. ld $a2,16($bp_real)
  1027. ld $a3,24($bp_real)
  1028. addi $bp,$sp,$Z1sqr
  1029. addi $rp,$sp,$U2
  1030. bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in2_x, Z1sqr);
  1031. addi $bp,$sp,$U1
  1032. ld $a0,$R+0($sp) # forward load for p256_sqr_mont
  1033. ld $a1,$R+8($sp)
  1034. ld $a2,$R+16($sp)
  1035. ld $a3,$R+24($sp)
  1036. addi $rp,$sp,$H
  1037. bl __ecp_nistz256_sub_from # p256_sub(H, U2, U1);
  1038. or $acc0,$acc0,$acc1 # see if result is zero
  1039. or $acc2,$acc2,$acc3
  1040. or. $acc0,$acc0,$acc2
  1041. bne .Ladd_proceed # is_equal(U1,U2)?
  1042. and. $t0,$in1infty,$in2infty
  1043. beq .Ladd_proceed # (in1infty || in2infty)?
  1044. cmpldi $temp,0
  1045. beq .Ladd_double # is_equal(S1,S2)?
  1046. xor $a0,$a0,$a0
  1047. std $a0,0($rp_real)
  1048. std $a0,8($rp_real)
  1049. std $a0,16($rp_real)
  1050. std $a0,24($rp_real)
  1051. std $a0,32($rp_real)
  1052. std $a0,40($rp_real)
  1053. std $a0,48($rp_real)
  1054. std $a0,56($rp_real)
  1055. std $a0,64($rp_real)
  1056. std $a0,72($rp_real)
  1057. std $a0,80($rp_real)
  1058. std $a0,88($rp_real)
  1059. b .Ladd_done
  1060. .align 4
  1061. .Ladd_double:
  1062. ld $bp,0($sp) # back-link
  1063. mr $ap,$ap_real
  1064. mr $rp,$rp_real
  1065. ld r16,$FRAME-8*16($sp)
  1066. ld r17,$FRAME-8*15($sp)
  1067. ld r18,$FRAME-8*14($sp)
  1068. ld r19,$FRAME-8*13($sp)
  1069. stdu $bp,$FRAME-288($sp) # difference in stack frame sizes
  1070. b .Ldouble_shortcut
  1071. .align 4
  1072. .Ladd_proceed:
  1073. addi $rp,$sp,$Rsqr
  1074. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R);
  1075. ld $bi,64($ap_real)
  1076. ld $a0,$H+0($sp)
  1077. ld $a1,$H+8($sp)
  1078. ld $a2,$H+16($sp)
  1079. ld $a3,$H+24($sp)
  1080. addi $bp,$ap_real,64
  1081. addi $rp,$sp,$res_z
  1082. bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z);
  1083. ld $a0,$H+0($sp)
  1084. ld $a1,$H+8($sp)
  1085. ld $a2,$H+16($sp)
  1086. ld $a3,$H+24($sp)
  1087. addi $rp,$sp,$Hsqr
  1088. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H);
  1089. ld $bi,64($bp_real)
  1090. ld $a0,$res_z+0($sp)
  1091. ld $a1,$res_z+8($sp)
  1092. ld $a2,$res_z+16($sp)
  1093. ld $a3,$res_z+24($sp)
  1094. addi $bp,$bp_real,64
  1095. addi $rp,$sp,$res_z
  1096. bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, res_z, in2_z);
  1097. ld $bi,$H($sp)
  1098. ld $a0,$Hsqr+0($sp)
  1099. ld $a1,$Hsqr+8($sp)
  1100. ld $a2,$Hsqr+16($sp)
  1101. ld $a3,$Hsqr+24($sp)
  1102. addi $bp,$sp,$H
  1103. addi $rp,$sp,$Hcub
  1104. bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H);
  1105. ld $bi,$Hsqr($sp)
  1106. ld $a0,$U1+0($sp)
  1107. ld $a1,$U1+8($sp)
  1108. ld $a2,$U1+16($sp)
  1109. ld $a3,$U1+24($sp)
  1110. addi $bp,$sp,$Hsqr
  1111. addi $rp,$sp,$U2
  1112. bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, U1, Hsqr);
  1113. mr $t0,$acc0
  1114. mr $t1,$acc1
  1115. mr $t2,$acc2
  1116. mr $t3,$acc3
  1117. addi $rp,$sp,$Hsqr
  1118. bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2);
  1119. addi $bp,$sp,$Rsqr
  1120. addi $rp,$sp,$res_x
  1121. bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr);
  1122. addi $bp,$sp,$Hcub
  1123. bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub);
  1124. addi $bp,$sp,$U2
  1125. ld $bi,$Hcub($sp) # forward load for p256_mul_mont
  1126. ld $a0,$S1+0($sp)
  1127. ld $a1,$S1+8($sp)
  1128. ld $a2,$S1+16($sp)
  1129. ld $a3,$S1+24($sp)
  1130. addi $rp,$sp,$res_y
  1131. bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x);
  1132. addi $bp,$sp,$Hcub
  1133. addi $rp,$sp,$S2
  1134. bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S1, Hcub);
  1135. ld $bi,$R($sp)
  1136. ld $a0,$res_y+0($sp)
  1137. ld $a1,$res_y+8($sp)
  1138. ld $a2,$res_y+16($sp)
  1139. ld $a3,$res_y+24($sp)
  1140. addi $bp,$sp,$R
  1141. addi $rp,$sp,$res_y
  1142. bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R);
  1143. addi $bp,$sp,$S2
  1144. bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2);
  1145. ld $t0,0($bp_real) # in2
  1146. ld $t1,8($bp_real)
  1147. ld $t2,16($bp_real)
  1148. ld $t3,24($bp_real)
  1149. ld $a0,$res_x+0($sp) # res
  1150. ld $a1,$res_x+8($sp)
  1151. ld $a2,$res_x+16($sp)
  1152. ld $a3,$res_x+24($sp)
  1153. ___
  1154. for($i=0;$i<64;$i+=32) { # conditional moves
  1155. $code.=<<___;
  1156. ld $acc0,$i+0($ap_real) # in1
  1157. ld $acc1,$i+8($ap_real)
  1158. ld $acc2,$i+16($ap_real)
  1159. ld $acc3,$i+24($ap_real)
  1160. andc $t0,$t0,$in1infty
  1161. andc $t1,$t1,$in1infty
  1162. andc $t2,$t2,$in1infty
  1163. andc $t3,$t3,$in1infty
  1164. and $a0,$a0,$in1infty
  1165. and $a1,$a1,$in1infty
  1166. and $a2,$a2,$in1infty
  1167. and $a3,$a3,$in1infty
  1168. or $t0,$t0,$a0
  1169. or $t1,$t1,$a1
  1170. or $t2,$t2,$a2
  1171. or $t3,$t3,$a3
  1172. andc $acc0,$acc0,$in2infty
  1173. andc $acc1,$acc1,$in2infty
  1174. andc $acc2,$acc2,$in2infty
  1175. andc $acc3,$acc3,$in2infty
  1176. and $t0,$t0,$in2infty
  1177. and $t1,$t1,$in2infty
  1178. and $t2,$t2,$in2infty
  1179. and $t3,$t3,$in2infty
  1180. or $acc0,$acc0,$t0
  1181. or $acc1,$acc1,$t1
  1182. or $acc2,$acc2,$t2
  1183. or $acc3,$acc3,$t3
  1184. ld $t0,$i+32($bp_real) # in2
  1185. ld $t1,$i+40($bp_real)
  1186. ld $t2,$i+48($bp_real)
  1187. ld $t3,$i+56($bp_real)
  1188. ld $a0,$res_x+$i+32($sp)
  1189. ld $a1,$res_x+$i+40($sp)
  1190. ld $a2,$res_x+$i+48($sp)
  1191. ld $a3,$res_x+$i+56($sp)
  1192. std $acc0,$i+0($rp_real)
  1193. std $acc1,$i+8($rp_real)
  1194. std $acc2,$i+16($rp_real)
  1195. std $acc3,$i+24($rp_real)
  1196. ___
  1197. }
  1198. $code.=<<___;
  1199. ld $acc0,$i+0($ap_real) # in1
  1200. ld $acc1,$i+8($ap_real)
  1201. ld $acc2,$i+16($ap_real)
  1202. ld $acc3,$i+24($ap_real)
  1203. andc $t0,$t0,$in1infty
  1204. andc $t1,$t1,$in1infty
  1205. andc $t2,$t2,$in1infty
  1206. andc $t3,$t3,$in1infty
  1207. and $a0,$a0,$in1infty
  1208. and $a1,$a1,$in1infty
  1209. and $a2,$a2,$in1infty
  1210. and $a3,$a3,$in1infty
  1211. or $t0,$t0,$a0
  1212. or $t1,$t1,$a1
  1213. or $t2,$t2,$a2
  1214. or $t3,$t3,$a3
  1215. andc $acc0,$acc0,$in2infty
  1216. andc $acc1,$acc1,$in2infty
  1217. andc $acc2,$acc2,$in2infty
  1218. andc $acc3,$acc3,$in2infty
  1219. and $t0,$t0,$in2infty
  1220. and $t1,$t1,$in2infty
  1221. and $t2,$t2,$in2infty
  1222. and $t3,$t3,$in2infty
  1223. or $acc0,$acc0,$t0
  1224. or $acc1,$acc1,$t1
  1225. or $acc2,$acc2,$t2
  1226. or $acc3,$acc3,$t3
  1227. std $acc0,$i+0($rp_real)
  1228. std $acc1,$i+8($rp_real)
  1229. std $acc2,$i+16($rp_real)
  1230. std $acc3,$i+24($rp_real)
  1231. .Ladd_done:
  1232. mtlr r0
  1233. ld r16,$FRAME-8*16($sp)
  1234. ld r17,$FRAME-8*15($sp)
  1235. ld r18,$FRAME-8*14($sp)
  1236. ld r19,$FRAME-8*13($sp)
  1237. ld r20,$FRAME-8*12($sp)
  1238. ld r21,$FRAME-8*11($sp)
  1239. ld r22,$FRAME-8*10($sp)
  1240. ld r23,$FRAME-8*9($sp)
  1241. ld r24,$FRAME-8*8($sp)
  1242. ld r25,$FRAME-8*7($sp)
  1243. ld r26,$FRAME-8*6($sp)
  1244. ld r27,$FRAME-8*5($sp)
  1245. ld r28,$FRAME-8*4($sp)
  1246. ld r29,$FRAME-8*3($sp)
  1247. ld r30,$FRAME-8*2($sp)
  1248. ld r31,$FRAME-8*1($sp)
  1249. addi $sp,$sp,$FRAME
  1250. blr
  1251. .long 0
  1252. .byte 0,12,4,0,0x80,16,3,0
  1253. .long 0
  1254. .size ecp_nistz256_point_add,.-ecp_nistz256_point_add
  1255. ___
  1256. }
  1257. ########################################################################
  1258. # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
  1259. # const P256_POINT_AFFINE *in2);
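# Same formulas as ecp_nistz256_point_add, but with in2_z implicitly equal
# to one, so U1 = X1, S1 = Y1 and the Z2^2/Z2^3 multiplications drop out.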
  1260. if (1) {
  1261. my $FRAME = 64 + 32*10 + 16*8;
  1262. my ($res_x,$res_y,$res_z,
  1263. $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
  1264. my $Z1sqr = $S2;
  1265. # above map() describes stack layout with 10 temporary
  1266. # 256-bit vectors on top.
  1267. my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
  1268. $code.=<<___;
  1269. .globl ecp_nistz256_point_add_affine
  1270. .align 5
  1271. ecp_nistz256_point_add_affine:
  1272. stdu $sp,-$FRAME($sp)
  1273. mflr r0
  1274. std r16,$FRAME-8*16($sp)
  1275. std r17,$FRAME-8*15($sp)
  1276. std r18,$FRAME-8*14($sp)
  1277. std r19,$FRAME-8*13($sp)
  1278. std r20,$FRAME-8*12($sp)
  1279. std r21,$FRAME-8*11($sp)
  1280. std r22,$FRAME-8*10($sp)
  1281. std r23,$FRAME-8*9($sp)
  1282. std r24,$FRAME-8*8($sp)
  1283. std r25,$FRAME-8*7($sp)
  1284. std r26,$FRAME-8*6($sp)
  1285. std r27,$FRAME-8*5($sp)
  1286. std r28,$FRAME-8*4($sp)
  1287. std r29,$FRAME-8*3($sp)
  1288. std r30,$FRAME-8*2($sp)
  1289. std r31,$FRAME-8*1($sp)
  1290. li $poly1,-1
  1291. srdi $poly1,$poly1,32 # 0x00000000ffffffff
  1292. li $poly3,1
  1293. orc $poly3,$poly3,$poly1 # 0xffffffff00000001
  1294. mr $rp_real,$rp
  1295. mr $ap_real,$ap
  1296. mr $bp_real,$bp
  1297. ld $a0,64($ap) # in1_z
  1298. ld $a1,72($ap)
  1299. ld $a2,80($ap)
  1300. ld $a3,88($ap)
  1301. or $t0,$a0,$a1
  1302. or $t2,$a2,$a3
  1303. or $in1infty,$t0,$t2
  1304. neg $t0,$in1infty
  1305. or $in1infty,$in1infty,$t0
  1306. sradi $in1infty,$in1infty,63 # !in1infty
  1307. ld $acc0,0($bp) # in2_x
  1308. ld $acc1,8($bp)
  1309. ld $acc2,16($bp)
  1310. ld $acc3,24($bp)
  1311. ld $t0,32($bp) # in2_y
  1312. ld $t1,40($bp)
  1313. ld $t2,48($bp)
  1314. ld $t3,56($bp)
  1315. or $acc0,$acc0,$acc1
  1316. or $acc2,$acc2,$acc3
  1317. or $acc0,$acc0,$acc2
  1318. or $t0,$t0,$t1
  1319. or $t2,$t2,$t3
  1320. or $t0,$t0,$t2
  1321. or $in2infty,$acc0,$t0
  1322. neg $t0,$in2infty
  1323. or $in2infty,$in2infty,$t0
  1324. sradi $in2infty,$in2infty,63 # !in2infty
  1325. addi $rp,$sp,$Z1sqr
  1326. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z);
  1327. mr $a0,$acc0
  1328. mr $a1,$acc1
  1329. mr $a2,$acc2
  1330. mr $a3,$acc3
  1331. ld $bi,0($bp_real)
  1332. addi $bp,$bp_real,0
  1333. addi $rp,$sp,$U2
  1334. bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, Z1sqr, in2_x);
  1335. addi $bp,$ap_real,0
  1336. ld $bi,64($ap_real) # forward load for p256_mul_mont
  1337. ld $a0,$Z1sqr+0($sp)
  1338. ld $a1,$Z1sqr+8($sp)
  1339. ld $a2,$Z1sqr+16($sp)
  1340. ld $a3,$Z1sqr+24($sp)
  1341. addi $rp,$sp,$H
  1342. bl __ecp_nistz256_sub_from # p256_sub(H, U2, in1_x);
  1343. addi $bp,$ap_real,64
  1344. addi $rp,$sp,$S2
  1345. bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z);
  1346. ld $bi,64($ap_real)
  1347. ld $a0,$H+0($sp)
  1348. ld $a1,$H+8($sp)
  1349. ld $a2,$H+16($sp)
  1350. ld $a3,$H+24($sp)
  1351. addi $bp,$ap_real,64
  1352. addi $rp,$sp,$res_z
  1353. bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z);
  1354. ld $bi,32($bp_real)
  1355. ld $a0,$S2+0($sp)
  1356. ld $a1,$S2+8($sp)
  1357. ld $a2,$S2+16($sp)
  1358. ld $a3,$S2+24($sp)
  1359. addi $bp,$bp_real,32
  1360. addi $rp,$sp,$S2
  1361. bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y);
  1362. addi $bp,$ap_real,32
  1363. ld $a0,$H+0($sp) # forward load for p256_sqr_mont
  1364. ld $a1,$H+8($sp)
  1365. ld $a2,$H+16($sp)
  1366. ld $a3,$H+24($sp)
  1367. addi $rp,$sp,$R
  1368. bl __ecp_nistz256_sub_from # p256_sub(R, S2, in1_y);
  1369. addi $rp,$sp,$Hsqr
  1370. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H);
  1371. ld $a0,$R+0($sp)
  1372. ld $a1,$R+8($sp)
  1373. ld $a2,$R+16($sp)
  1374. ld $a3,$R+24($sp)
  1375. addi $rp,$sp,$Rsqr
  1376. bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R);
  1377. ld $bi,$H($sp)
  1378. ld $a0,$Hsqr+0($sp)
  1379. ld $a1,$Hsqr+8($sp)
  1380. ld $a2,$Hsqr+16($sp)
  1381. ld $a3,$Hsqr+24($sp)
  1382. addi $bp,$sp,$H
  1383. addi $rp,$sp,$Hcub
  1384. bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H);
  1385. ld $bi,0($ap_real)
  1386. ld $a0,$Hsqr+0($sp)
  1387. ld $a1,$Hsqr+8($sp)
  1388. ld $a2,$Hsqr+16($sp)
  1389. ld $a3,$Hsqr+24($sp)
  1390. addi $bp,$ap_real,0
  1391. addi $rp,$sp,$U2
  1392. bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in1_x, Hsqr);
  1393. mr $t0,$acc0
  1394. mr $t1,$acc1
  1395. mr $t2,$acc2
  1396. mr $t3,$acc3
  1397. addi $rp,$sp,$Hsqr
  1398. bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2);
  1399. addi $bp,$sp,$Rsqr
  1400. addi $rp,$sp,$res_x
  1401. bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr);
  1402. addi $bp,$sp,$Hcub
  1403. bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub);
  1404. addi $bp,$sp,$U2
  1405. ld $bi,32($ap_real) # forward load for p256_mul_mont
  1406. ld $a0,$Hcub+0($sp)
  1407. ld $a1,$Hcub+8($sp)
  1408. ld $a2,$Hcub+16($sp)
  1409. ld $a3,$Hcub+24($sp)
  1410. addi $rp,$sp,$res_y
  1411. bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x);
  1412. addi $bp,$ap_real,32
  1413. addi $rp,$sp,$S2
  1414. bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, in1_y, Hcub);
  1415. ld $bi,$R($sp)
  1416. ld $a0,$res_y+0($sp)
  1417. ld $a1,$res_y+8($sp)
  1418. ld $a2,$res_y+16($sp)
  1419. ld $a3,$res_y+24($sp)
  1420. addi $bp,$sp,$R
  1421. addi $rp,$sp,$res_y
  1422. bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R);
  1423. addi $bp,$sp,$S2
  1424. bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2);
  1425. ld $t0,0($bp_real) # in2
  1426. ld $t1,8($bp_real)
  1427. ld $t2,16($bp_real)
  1428. ld $t3,24($bp_real)
  1429. ld $a0,$res_x+0($sp) # res
  1430. ld $a1,$res_x+8($sp)
  1431. ld $a2,$res_x+16($sp)
  1432. ld $a3,$res_x+24($sp)
  1433. ___
  1434. for($i=0;$i<64;$i+=32) { # conditional moves
  1435. $code.=<<___;
  1436. ld $acc0,$i+0($ap_real) # in1
  1437. ld $acc1,$i+8($ap_real)
  1438. ld $acc2,$i+16($ap_real)
  1439. ld $acc3,$i+24($ap_real)
  1440. andc $t0,$t0,$in1infty
  1441. andc $t1,$t1,$in1infty
  1442. andc $t2,$t2,$in1infty
  1443. andc $t3,$t3,$in1infty
  1444. and $a0,$a0,$in1infty
  1445. and $a1,$a1,$in1infty
  1446. and $a2,$a2,$in1infty
  1447. and $a3,$a3,$in1infty
  1448. or $t0,$t0,$a0
  1449. or $t1,$t1,$a1
  1450. or $t2,$t2,$a2
  1451. or $t3,$t3,$a3
  1452. andc $acc0,$acc0,$in2infty
  1453. andc $acc1,$acc1,$in2infty
  1454. andc $acc2,$acc2,$in2infty
  1455. andc $acc3,$acc3,$in2infty
  1456. and $t0,$t0,$in2infty
  1457. and $t1,$t1,$in2infty
  1458. and $t2,$t2,$in2infty
  1459. and $t3,$t3,$in2infty
  1460. or $acc0,$acc0,$t0
  1461. or $acc1,$acc1,$t1
  1462. or $acc2,$acc2,$t2
  1463. or $acc3,$acc3,$t3
  1464. ___
  1465. $code.=<<___ if ($i==0);
  1466. ld $t0,32($bp_real) # in2
  1467. ld $t1,40($bp_real)
  1468. ld $t2,48($bp_real)
  1469. ld $t3,56($bp_real)
  1470. ___
  1471. $code.=<<___ if ($i==32);
  1472. li $t0,1 # Lone_mont (1 in Montgomery form)
  1473. not $t1,$poly1
  1474. li $t2,-1
  1475. not $t3,$poly3
  1476. ___
  1477. $code.=<<___;
  1478. ld $a0,$res_x+$i+32($sp)
  1479. ld $a1,$res_x+$i+40($sp)
  1480. ld $a2,$res_x+$i+48($sp)
  1481. ld $a3,$res_x+$i+56($sp)
  1482. std $acc0,$i+0($rp_real)
  1483. std $acc1,$i+8($rp_real)
  1484. std $acc2,$i+16($rp_real)
  1485. std $acc3,$i+24($rp_real)
  1486. ___
  1487. }
  1488. $code.=<<___;
  1489. ld $acc0,$i+0($ap_real) # in1
  1490. ld $acc1,$i+8($ap_real)
  1491. ld $acc2,$i+16($ap_real)
  1492. ld $acc3,$i+24($ap_real)
  1493. andc $t0,$t0,$in1infty
  1494. andc $t1,$t1,$in1infty
  1495. andc $t2,$t2,$in1infty
  1496. andc $t3,$t3,$in1infty
  1497. and $a0,$a0,$in1infty
  1498. and $a1,$a1,$in1infty
  1499. and $a2,$a2,$in1infty
  1500. and $a3,$a3,$in1infty
  1501. or $t0,$t0,$a0
  1502. or $t1,$t1,$a1
  1503. or $t2,$t2,$a2
  1504. or $t3,$t3,$a3
  1505. andc $acc0,$acc0,$in2infty
  1506. andc $acc1,$acc1,$in2infty
  1507. andc $acc2,$acc2,$in2infty
  1508. andc $acc3,$acc3,$in2infty
  1509. and $t0,$t0,$in2infty
  1510. and $t1,$t1,$in2infty
  1511. and $t2,$t2,$in2infty
  1512. and $t3,$t3,$in2infty
  1513. or $acc0,$acc0,$t0
  1514. or $acc1,$acc1,$t1
  1515. or $acc2,$acc2,$t2
  1516. or $acc3,$acc3,$t3
  1517. std $acc0,$i+0($rp_real)
  1518. std $acc1,$i+8($rp_real)
  1519. std $acc2,$i+16($rp_real)
  1520. std $acc3,$i+24($rp_real)
  1521. mtlr r0
  1522. ld r16,$FRAME-8*16($sp)
  1523. ld r17,$FRAME-8*15($sp)
  1524. ld r18,$FRAME-8*14($sp)
  1525. ld r19,$FRAME-8*13($sp)
  1526. ld r20,$FRAME-8*12($sp)
  1527. ld r21,$FRAME-8*11($sp)
  1528. ld r22,$FRAME-8*10($sp)
  1529. ld r23,$FRAME-8*9($sp)
  1530. ld r24,$FRAME-8*8($sp)
  1531. ld r25,$FRAME-8*7($sp)
  1532. ld r26,$FRAME-8*6($sp)
  1533. ld r27,$FRAME-8*5($sp)
  1534. ld r28,$FRAME-8*4($sp)
  1535. ld r29,$FRAME-8*3($sp)
  1536. ld r30,$FRAME-8*2($sp)
  1537. ld r31,$FRAME-8*1($sp)
  1538. addi $sp,$sp,$FRAME
  1539. blr
  1540. .long 0
  1541. .byte 0,12,4,0,0x80,16,3,0
  1542. .long 0
  1543. .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
  1544. ___
  1545. }
  1546. if (1) {
  1547. my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
  1548. my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");
  1549. $code.=<<___;
  1550. ########################################################################
  1551. # void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
  1552. # uint64_t b[4]);
  1553. .globl ecp_nistz256_ord_mul_mont
  1554. .align 5
  1555. ecp_nistz256_ord_mul_mont:
  1556. stdu $sp,-160($sp)
  1557. std r18,48($sp)
  1558. std r19,56($sp)
  1559. std r20,64($sp)
  1560. std r21,72($sp)
  1561. std r22,80($sp)
  1562. std r23,88($sp)
  1563. std r24,96($sp)
  1564. std r25,104($sp)
  1565. std r26,112($sp)
  1566. std r27,120($sp)
  1567. std r28,128($sp)
  1568. std r29,136($sp)
  1569. std r30,144($sp)
  1570. std r31,152($sp)
  1571. ld $a0,0($ap)
  1572. ld $bi,0($bp)
  1573. ld $a1,8($ap)
  1574. ld $a2,16($ap)
  1575. ld $a3,24($ap)
  1576. lis $ordk,0xccd1
  1577. lis $ord0,0xf3b9
  1578. lis $ord1,0xbce6
  1579. ori $ordk,$ordk,0xc8aa
  1580. ori $ord0,$ord0,0xcac2
  1581. ori $ord1,$ord1,0xfaad
  1582. sldi $ordk,$ordk,32
  1583. sldi $ord0,$ord0,32
  1584. sldi $ord1,$ord1,32
  1585. oris $ordk,$ordk,0xee00
  1586. oris $ord0,$ord0,0xfc63
  1587. oris $ord1,$ord1,0xa717
  1588. ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f
  1589. ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551
  1590. ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84
  1591. li $ord2,-1 # 0xffffffffffffffff
  1592. sldi $ord3,$ord2,32 # 0xffffffff00000000
  1593. li $zr,0
  1594. mulld $acc0,$a0,$bi # a[0]*b[0]
  1595. mulhdu $t0,$a0,$bi
  1596. mulld $acc1,$a1,$bi # a[1]*b[0]
  1597. mulhdu $t1,$a1,$bi
  1598. mulld $acc2,$a2,$bi # a[2]*b[0]
  1599. mulhdu $t2,$a2,$bi
  1600. mulld $acc3,$a3,$bi # a[3]*b[0]
  1601. mulhdu $acc4,$a3,$bi
  1602. mulld $t4,$acc0,$ordk
  1603. addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
  1604. adde $acc2,$acc2,$t1
  1605. adde $acc3,$acc3,$t2
  1606. addze $acc4,$acc4
  1607. li $acc5,0
  1608. ___
  1609. for ($i=1;$i<4;$i++) {
  1610. ################################################################
  1611. # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
  1612. # * abcdefgh
  1613. # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
  1614. #
  1615. # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
  1616. # rewrite above as:
  1617. #
  1618. # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
  1619. # - 0000abcd.efgh0000.abcdefgh.00000000.00000000
  1620. # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
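	#
	# In other words, each iteration adds b[i]*a to the accumulator and
	# then folds the lowest accumulator word back into the group order.
	# A hedged reference model of one step, where ord[] denotes the order
	# as four 64-bit words and $ordk is the constant built above (chosen
	# so that acc[0] + t4*ord[0] == 0 mod 2^64):
	#
	#	t4   = (acc[0] * ordk) mod 2^64;
	#	acc += t4 * ord;	# low word of acc becomes zero ...
	#	acc  = acc >> 64;	# ... and is shifted out
	#
	# The sldi/subfc/srdi/subfe group below applies the 2^n*x-x identity
	# to the two top words of the order (all ones and 0xffffffff00000000)
	# instead of performing real multiplications for them.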
$code.=<<___;
	ld	$bi,8*$i($bp)		# b[i]

	sldi	$t0,$t4,32
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4
	adde	$t2,$t2,$t1
	mulld	$t0,$a0,$bi
	addze	$t3,$t3
	mulld	$t1,$a1,$bi

	addc	$acc0,$acc1,$t2
	mulld	$t2,$a2,$bi
	adde	$acc1,$acc2,$t3
	mulld	$t3,$a3,$bi
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5

	addc	$acc0,$acc0,$t0		# accumulate low parts
	mulhdu	$t0,$a0,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$a1,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$a2,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$a3,$bi
	addze	$acc4,$acc4
	mulld	$t4,$acc0,$ordk

	addc	$acc1,$acc1,$t0		# accumulate high parts
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3
	addze	$acc5,$zr
___
}
$code.=<<___;
	sldi	$t0,$t4,32		# last reduction
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4
	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5
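
	# Final conditional subtraction: subtract the group order once and,
	# if that borrowed, add it back using $acc4 (now 0 or all ones) as a
	# mask.  The third word of the order is all ones, so the masked value
	# for that word is simply $acc4 itself.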
	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$acc0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$acc4
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

################################################################################
# void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
#                                int rep);
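#
# 'rep' is the number of squarings to perform: it is loaded into the count
# register below and the whole Montgomery squaring body is iterated with
# bdnz, so a call with rep=n effectively computes a^(2^n) in the Montgomery
# domain of the group order.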
.globl	ecp_nistz256_ord_sqr_mont
.align	5
ecp_nistz256_ord_sqr_mont:
	stdu	$sp,-160($sp)
	std	r18,48($sp)
	std	r19,56($sp)
	std	r20,64($sp)
	std	r21,72($sp)
	std	r22,80($sp)
	std	r23,88($sp)
	std	r24,96($sp)
	std	r25,104($sp)
	std	r26,112($sp)
	std	r27,120($sp)
	std	r28,128($sp)
	std	r29,136($sp)
	std	r30,144($sp)
	std	r31,152($sp)

	mtctr	$bp

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	lis	$ordk,0xccd1
	lis	$ord0,0xf3b9
	lis	$ord1,0xbce6
	ori	$ordk,$ordk,0xc8aa
	ori	$ord0,$ord0,0xcac2
	ori	$ord1,$ord1,0xfaad
	sldi	$ordk,$ordk,32
	sldi	$ord0,$ord0,32
	sldi	$ord1,$ord1,32
	oris	$ordk,$ordk,0xee00
	oris	$ord0,$ord0,0xfc63
	oris	$ord1,$ord1,0xa717
	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
	li	$ord2,-1		# 0xffffffffffffffff
	sldi	$ord3,$ord2,32		# 0xffffffff00000000
	li	$zr,0
	b	.Loop_ord_sqr

.align	5
.Loop_ord_sqr:
	################################################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	# The "can't overflow" notes below mark carries into the high part
	# of a multiplication result; these cannot overflow, because the
	# high part of a product can never be all ones.
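	#
	# Outline of the squaring below: compute the six cross products
	# a[i]*a[j] (i>j), double the middle words acc1..acc6, then add the
	# four diagonal squares a[i]*a[i].  The 512-bit result is afterwards
	# folded back with four Montgomery reduction steps and the upper half
	# is added in, using the same technique as ecp_nistz256_ord_mul_mont
	# above.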
	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	mulld	$t0,$a2,$a1		# a[2]*a[1]
	mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	mulld	$t2,$a3,$a1		# a[3]*a[1]
	mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	mulhdu	$a0,$a0,$a0
	adde	$acc4,$acc4,$t1
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	adde	$acc5,$acc5,$t2
	mulhdu	$a1,$a1,$a1
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	adde	$acc2,$acc2,$acc2
	mulhdu	$a2,$a2,$a2
	adde	$acc3,$acc3,$acc3
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	adde	$acc4,$acc4,$acc4
	mulhdu	$a3,$a3,$a3
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	addze	$acc7,$zr

	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	mulld	$t4,$acc0,$ordk
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3
___
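# Four Montgomery reduction steps fold the low half of the 512-bit square
# back into four words.  $t4 already holds the first reduction multiplier
# (acc0*ordk, computed above); every iteration except the last prepares the
# next multiplier in $t3, and the ($t3,$t4) swap at the bottom of the loop
# rotates it into place for the following round.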
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4
	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$zr,$t4		# can't overflow
___
$code.=<<___ if ($i<3);
	mulld	$t3,$acc0,$ordk
___
$code.=<<___;
	sldi	$t0,$t4,32
	subfc	$acc1,$t4,$acc1
	srdi	$t1,$t4,32
	subfe	$acc2,$t0,$acc2
	subfe	$acc3,$t1,$acc3		# can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
	addc	$acc0,$acc0,$acc4	# accumulate upper half
	adde	$acc1,$acc1,$acc5
	adde	$acc2,$acc2,$acc6
	adde	$acc3,$acc3,$acc7
	addze	$acc4,$zr

	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$a0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$a1,$acc1,$t1
	adde	$a2,$acc2,$acc4
	adde	$a3,$acc3,$t3

	bdnz	.Loop_ord_sqr

	std	$a0,0($rp)
	std	$a1,8($rp)
	std	$a2,16($rp)
	std	$a3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
} }

########################################################################
# scatter-gather subroutines
{
my ($out,$inp,$index,$mask)=map("r$_",(3..7));
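# The pre-computed tables are stored transposed: in the w5 table each of the
# 16 points contributes one 4-byte column, in the w7 table each of the 64
# affine points contributes one byte column, both with a 64-byte stride
# between rows.  Assuming the table is 64-byte aligned and cache lines are at
# least 64 bytes, a gather therefore touches the same cache lines for every
# index, and the index only selects which column is reassembled.  The
# neg/sradi pair in the gather routines turns index 0 into an all-zero mask,
# so a gather with index 0 returns an all-zero point (which the caller can
# treat as the point at infinity).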
$code.=<<___;
########################################################################
# void ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp,
#                              int index);
.globl	ecp_nistz256_scatter_w5
.align	4
ecp_nistz256_scatter_w5:
	slwi	$index,$index,2
	add	$out,$out,$index

	ld	r8, 0($inp)		# X
	ld	r9, 8($inp)
	ld	r10,16($inp)
	ld	r11,24($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 32($inp)		# Y
	ld	r9, 40($inp)
	ld	r10,48($inp)
	ld	r11,56($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 64($inp)		# Z
	ld	r9, 72($inp)
	ld	r10,80($inp)
	ld	r11,88($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

########################################################################
# void ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
#                             int index);
.globl	ecp_nistz256_gather_w5
.align	4
ecp_nistz256_gather_w5:
	neg	r0,$index
	sradi	r0,r0,63

	add	$index,$index,r0
	slwi	$index,$index,2
	add	$inp,$inp,$index

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,0($out)		# X
	std	r6,8($out)
	std	r7,16($out)
	std	r8,24($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,32($out)		# Y
	std	r6,40($out)
	std	r7,48($out)
	std	r8,56($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,64($out)		# Z
	std	r6,72($out)
	std	r7,80($out)
	std	r8,88($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

########################################################################
# void ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
#                              int index);
.globl	ecp_nistz256_scatter_w7
.align	4
ecp_nistz256_scatter_w7:
	li	r0,8
	mtctr	r0
	add	$out,$out,$index
	subi	$inp,$inp,8

.Loop_scatter_w7:
	ldu	r0,8($inp)
	stb	r0,64*0($out)
	srdi	r0,r0,8
	stb	r0,64*1($out)
	srdi	r0,r0,8
	stb	r0,64*2($out)
	srdi	r0,r0,8
	stb	r0,64*3($out)
	srdi	r0,r0,8
	stb	r0,64*4($out)
	srdi	r0,r0,8
	stb	r0,64*5($out)
	srdi	r0,r0,8
	stb	r0,64*6($out)
	srdi	r0,r0,8
	stb	r0,64*7($out)
	addi	$out,$out,64*8
	bdnz	.Loop_scatter_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

########################################################################
# void ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
#                             int index);
.globl	ecp_nistz256_gather_w7
.align	4
ecp_nistz256_gather_w7:
	li	r0,8
	mtctr	r0
	neg	r0,$index
	sradi	r0,r0,63

	add	$index,$index,r0
	add	$inp,$inp,$index
	subi	$out,$out,8

.Loop_gather_w7:
	lbz	r5, 64*0($inp)
	lbz	r6, 64*1($inp)
	lbz	r7, 64*2($inp)
	lbz	r8, 64*3($inp)
	lbz	r9, 64*4($inp)
	lbz	r10,64*5($inp)
	lbz	r11,64*6($inp)
	lbz	r12,64*7($inp)
	addi	$inp,$inp,64*8

	sldi	r6, r6, 8
	sldi	r7, r7, 16
	sldi	r8, r8, 24
	sldi	r9, r9, 32
	sldi	r10,r10,40
	sldi	r11,r11,48
	sldi	r12,r12,56

	or	r5,r5,r6
	or	r7,r7,r8
	or	r9,r9,r10
	or	r11,r11,r12
	or	r5,r5,r7
	or	r9,r9,r11
	or	r5,r5,r9
	and	r5,r5,r0
	stdu	r5,8($out)
	bdnz	.Loop_gather_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush