ppc.pl 44 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2004-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # Implemented as a Perl wrapper as we want to support several different
  9. # architectures with single file. We pick up the target based on the
  10. # file name we are asked to generate.
  11. #
  12. # It should be noted though that this perl code is nothing like
  13. # <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
  14. # as pre-processor to cover for platform differences in name decoration,
  15. # linker tables, 32-/64-bit instruction sets...
  16. #
  17. # As you might know, there are several PowerPC ABIs in use. Most notably,
  18. # Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
  19. # are similar enough to implement leaf(!) functions, which would be ABI
  20. # neutral. And that's what you find here: ABI-neutral leaf functions.
  21. # In case you wonder what that is...
  22. #
  23. # AIX performance
  24. #
  25. # MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
  26. #
  27. # The following is the performance of 32-bit compiler
  28. # generated code:
  29. #
  30. # OpenSSL 0.9.6c 21 dec 2001
  31. # built on: Tue Jun 11 11:06:51 EDT 2002
  32. # options:bn(64,32) ...
  33. #compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
  34. # sign verify sign/s verify/s
  35. #rsa 512 bits 0.0098s 0.0009s 102.0 1170.6
  36. #rsa 1024 bits 0.0507s 0.0026s 19.7 387.5
  37. #rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
  38. #rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
  39. #dsa 512 bits 0.0087s 0.0106s 114.3 94.5
  40. #dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
  41. #
  42. # Same benchmark with this assembler code:
  43. #
  44. #rsa 512 bits 0.0056s 0.0005s 178.6 2049.2
  45. #rsa 1024 bits 0.0283s 0.0015s 35.3 674.1
  46. #rsa 2048 bits 0.1744s 0.0050s 5.7 201.2
  47. #rsa 4096 bits 1.1644s 0.0179s 0.9 55.7
  48. #dsa 512 bits 0.0052s 0.0062s 191.6 162.0
  49. #dsa 1024 bits 0.0149s 0.0180s 67.0 55.5
  50. #
  51. # Number of operations increases by almost 75%
  52. #
  53. # Here are performance numbers for 64-bit compiler
  54. # generated code:
  55. #
  56. # OpenSSL 0.9.6g [engine] 9 Aug 2002
  57. # built on: Fri Apr 18 16:59:20 EDT 2003
  58. # options:bn(64,64) ...
  59. # compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
  60. # sign verify sign/s verify/s
  61. #rsa 512 bits 0.0028s 0.0003s 357.1 3844.4
  62. #rsa 1024 bits 0.0148s 0.0008s 67.5 1239.7
  63. #rsa 2048 bits 0.0963s 0.0028s 10.4 353.0
  64. #rsa 4096 bits 0.6538s 0.0102s 1.5 98.1
  65. #dsa 512 bits 0.0026s 0.0032s 382.5 313.7
  66. #dsa 1024 bits 0.0081s 0.0099s 122.8 100.6
  67. #
  68. # Same benchmark with this assembler code:
  69. #
  70. #rsa 512 bits 0.0020s 0.0002s 510.4 6273.7
  71. #rsa 1024 bits 0.0088s 0.0005s 114.1 2128.3
  72. #rsa 2048 bits 0.0540s 0.0016s 18.5 622.5
  73. #rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
  74. #dsa 512 bits 0.0016s 0.0020s 610.7 507.1
  75. #dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
  76. #
  77. # Again, performance increases by about 75%
  78. #
  79. # Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
  80. # OpenSSL 0.9.7c 30 Sep 2003
  81. #
  82. # Original code.
  83. #
  84. #rsa 512 bits 0.0011s 0.0001s 906.1 11012.5
  85. #rsa 1024 bits 0.0060s 0.0003s 166.6 3363.1
  86. #rsa 2048 bits 0.0370s 0.0010s 27.1 982.4
  87. #rsa 4096 bits 0.2426s 0.0036s 4.1 280.4
  88. #dsa 512 bits 0.0010s 0.0012s 1038.1 841.5
  89. #dsa 1024 bits 0.0030s 0.0037s 329.6 269.7
  90. #dsa 2048 bits 0.0101s 0.0127s 98.9 78.6
  91. #
  92. # Same benchmark with this assembler code:
  93. #
  94. #rsa 512 bits 0.0007s 0.0001s 1416.2 16645.9
  95. #rsa 1024 bits 0.0036s 0.0002s 274.4 5380.6
  96. #rsa 2048 bits 0.0222s 0.0006s 45.1 1589.5
  97. #rsa 4096 bits 0.1469s 0.0022s 6.8 449.6
  98. #dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2
  99. #dsa 1024 bits 0.0018s 0.0023s 545.0 442.2
  100. #dsa 2048 bits 0.0061s 0.0075s 163.5 132.8
  101. #
  102. # Performance increase of ~60%
  103. # Based on submission from Suresh N. Chari of IBM
  104. $flavour = shift; # 1st command-line argument; must contain "32" or "64"
  105. if ($flavour =~ /32/) { # tested first, so "32" wins if both substrings appear
  106. $BITS= 32;
  107. $BNSZ= $BITS/8; # bytes per BN_ULONG word; scales the N*$BNSZ offsets in $data
  108. $ISA= "\"ppc\"";
  109. $LD= "lwz"; # load
  110. $LDU= "lwzu"; # load and update
  111. $ST= "stw"; # store
  112. $STU= "stwu"; # store and update
  113. $UMULL= "mullw"; # unsigned multiply low
  114. $UMULH= "mulhwu"; # unsigned multiply high
  115. $UDIV= "divwu"; # unsigned divide
  116. $UCMPI= "cmplwi"; # unsigned compare with immediate
  117. $UCMP= "cmplw"; # unsigned compare
  118. $CNTLZ= "cntlzw"; # count leading zeros
  119. $SHL= "slw"; # shift left
  120. $SHR= "srw"; # unsigned shift right
  121. $SHRI= "srwi"; # unsigned shift right by immediate
  122. $SHLI= "slwi"; # shift left by immediate
  123. $CLRU= "clrlwi"; # clear upper bits
  124. $INSR= "insrwi"; # insert right
  125. $ROTL= "rotlwi"; # rotate left by immediate
  126. $TR= "tw"; # conditional trap
  127. } elsif ($flavour =~ /64/) {
  128. $BITS= 64;
  129. $BNSZ= $BITS/8; # bytes per BN_ULONG word, as in the 32-bit branch
  130. $ISA= "\"ppc64\"";
  131. # same as above, but 64-bit mnemonics...
  132. $LD= "ld"; # load
  133. $LDU= "ldu"; # load and update
  134. $ST= "std"; # store
  135. $STU= "stdu"; # store and update
  136. $UMULL= "mulld"; # unsigned multiply low
  137. $UMULH= "mulhdu"; # unsigned multiply high
  138. $UDIV= "divdu"; # unsigned divide
  139. $UCMPI= "cmpldi"; # unsigned compare with immediate
  140. $UCMP= "cmpld"; # unsigned compare
  141. $CNTLZ= "cntlzd"; # count leading zeros
  142. $SHL= "sld"; # shift left
  143. $SHR= "srd"; # unsigned shift right
  144. $SHRI= "srdi"; # unsigned shift right by immediate
  145. $SHLI= "sldi"; # shift left by immediate
  146. $CLRU= "clrldi"; # clear upper bits
  147. $INSR= "insrdi"; # insert right
  148. $ROTL= "rotldi"; # rotate left by immediate
  149. $TR= "td"; # conditional trap
  150. } else { die "nonsense $flavour"; } # flavour names neither 32 nor 64
  151. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # $dir = leading directory part of $0, when it has one
  152. ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or # translator beside this script, or...
  153. ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or # ...in ../../perlasm
  154. die "can't locate ppc-xlate.pl";
  155. open STDOUT,"| $^X $xlate $flavour ".(shift || die "missing output file argument") or die "can't call $xlate: $!"; # everything printed from here on is piped through the translator; "or" applies to open() itself, whereas the former "||" applied only to shift
  156. $data=<<EOF;
  157. #--------------------------------------------------------------------
  158. #
  159. #
  160. #
  161. #
  162. # File: ppc32.s
  163. #
  164. # Created by: Suresh Chari
  165. # IBM Thomas J. Watson Research Library
  166. # Hawthorne, NY
  167. #
  168. #
  169. # Description: Optimized assembly routines for OpenSSL crypto
  170. # on the 32-bit PowerPC platform.
  171. #
  172. #
  173. # Version History
  174. #
  175. # 2. Fixed bn_add,bn_sub and bn_div_words, added comments,
  176. # cleaned up code. Also made a single version which can
  177. # be used for both the AIX and Linux compilers. See NOTE
  178. # below.
  179. # 12/05/03 Suresh Chari
  180. # (with lots of help from) Andy Polyakov
  181. ##
  182. # 1. Initial version 10/20/02 Suresh Chari
  183. #
  184. #
  185. # The following file works for the xlc,cc
  186. # and gcc compilers.
  187. #
  188. # NOTE: To get the file to link correctly with the gcc compiler
  189. # you have to change the names of the routines and remove
  190. # the first .(dot) character. This should automatically
  191. # be done in the build process.
  192. #
  193. # Hand optimized assembly code for the following routines
  194. #
  195. # bn_sqr_comba4
  196. # bn_sqr_comba8
  197. # bn_mul_comba4
  198. # bn_mul_comba8
  199. # bn_sub_words
  200. # bn_add_words
  201. # bn_div_words
  202. # bn_sqr_words
  203. # bn_mul_words
  204. # bn_mul_add_words
  205. #
  206. # NOTE: It is possible to optimize this code more for
  207. # specific PowerPC or Power architectures. On the Northstar
  208. # architecture the optimizations in this file do
  209. # NOT provide much improvement.
  210. #
  211. # If you have comments or suggestions to improve code send
  212. # me a note at schari\@us.ibm.com
  213. #
  214. #--------------------------------------------------------------------------
  215. #
  216. # Defines to be used in the assembly code.
  217. #
  218. #.set r0,0 # we use it as storage for value of 0
  219. #.set SP,1 # preserved
  220. #.set RTOC,2 # preserved
  221. #.set r3,3 # 1st argument/return value
  222. #.set r4,4 # 2nd argument/volatile register
  223. #.set r5,5 # 3rd argument/volatile register
  224. #.set r6,6 # ...
  225. #.set r7,7
  226. #.set r8,8
  227. #.set r9,9
  228. #.set r10,10
  229. #.set r11,11
  230. #.set r12,12
  231. #.set r13,13 # not used, nor any other "below" it...
  232. # Declare function names to be global
  233. # NOTE: For gcc these names MUST be changed to remove
  234. # the first . i.e. for example change ".bn_sqr_comba4"
  235. # to "bn_sqr_comba4". This should be automatically done
  236. # in the build.
  237. .globl .bn_sqr_comba4
  238. .globl .bn_sqr_comba8
  239. .globl .bn_mul_comba4
  240. .globl .bn_mul_comba8
  241. .globl .bn_sub_words
  242. .globl .bn_add_words
  243. .globl .bn_div_words
  244. .globl .bn_sqr_words
  245. .globl .bn_mul_words
  246. .globl .bn_mul_add_words
  247. # .text section
  248. .machine "any"
  249. .text
  250. #
  251. # NOTE: The following label name should be changed to
  252. # "bn_sqr_comba4" i.e. remove the first dot
  253. # for the gcc compiler. This should be automatically
  254. # done in the build
  255. #
  256. .align 4
  257. .bn_sqr_comba4:
  258. #
  259. # Optimized version of bn_sqr_comba4: stores the 8-word square of the
  260. # 4-word input a[0..3] into r[0..7].
  261. # void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
  262. # r3 contains r (8 words are stored through it)
  263. # r4 contains a (4 words are loaded through it)
  264. #
  265. # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
  266. #
  267. # r5,r6 are the two BN_ULONGs being multiplied.
  268. # r7,r8 are the low (UMULL) and high (UMULH) words of each product.
  269. # r9,r10, r11 are the equivalents of c1,c2, c3.
  270. # Here's the assembly
  271. # Each step comment (sqr_add_c / sqr_add_c2) precedes the instructions
  272. # that implement it; c1,c2,c3 rotate roles after every stored word.
  273. xor r0,r0,r0 # set r0 = 0. Used in the addze
  274. # instructions below
  275. #sqr_add_c(a,0,c1,c2,c3)
  276. $LD r5,`0*$BNSZ`(r4)
  277. $UMULL r9,r5,r5
  278. $UMULH r10,r5,r5 #in first iteration. No need
  279. #to add since c1=c2=c3=0.
  280. # Note c3(r11) is NOT set to 0
  281. # but will be.
  282. $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
  283. # sqr_add_c2(a,1,0,c2,c3,c1);
  284. $LD r6,`1*$BNSZ`(r4)
  285. $UMULL r7,r5,r6
  286. $UMULH r8,r5,r6
  287. addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
  288. adde r8,r8,r8
  289. addze r9,r0 # catch carry if any.
  290. # r9= r0(=0) and carry
  291. addc r10,r7,r10 # now add to temp result.
  292. addze r11,r8 # r8 added to r11 which is 0
  293. addze r9,r9 # fold the carry of that add into c1
  294. $ST r10,`1*$BNSZ`(r3) #r[1]=c2;
  295. #sqr_add_c(a,1,c3,c1,c2)
  296. $UMULL r7,r6,r6
  297. $UMULH r8,r6,r6
  298. addc r11,r7,r11
  299. adde r9,r8,r9
  300. addze r10,r0 # c2 = carry out (r0 is 0)
  301. #sqr_add_c2(a,2,0,c3,c1,c2)
  302. $LD r6,`2*$BNSZ`(r4)
  303. $UMULL r7,r5,r6
  304. $UMULH r8,r5,r6
  305. addc r7,r7,r7 # double the product (r8,r7)...
  306. adde r8,r8,r8
  307. addze r10,r10 # ...keeping the carry out of the doubling
  308. addc r11,r7,r11
  309. adde r9,r8,r9
  310. addze r10,r10
  311. $ST r11,`2*$BNSZ`(r3) #r[2]=c3
  312. #sqr_add_c2(a,3,0,c1,c2,c3);
  313. $LD r6,`3*$BNSZ`(r4)
  314. $UMULL r7,r5,r6
  315. $UMULH r8,r5,r6
  316. addc r7,r7,r7
  317. adde r8,r8,r8
  318. addze r11,r0 # c3 restarts as 0 + carry
  319. addc r9,r7,r9
  320. adde r10,r8,r10
  321. addze r11,r11
  322. #sqr_add_c2(a,2,1,c1,c2,c3);
  323. $LD r5,`1*$BNSZ`(r4)
  324. $LD r6,`2*$BNSZ`(r4)
  325. $UMULL r7,r5,r6
  326. $UMULH r8,r5,r6
  327. addc r7,r7,r7
  328. adde r8,r8,r8
  329. addze r11,r11
  330. addc r9,r7,r9
  331. adde r10,r8,r10
  332. addze r11,r11
  333. $ST r9,`3*$BNSZ`(r3) #r[3]=c1
  334. #sqr_add_c(a,2,c2,c3,c1);
  335. $UMULL r7,r6,r6
  336. $UMULH r8,r6,r6
  337. addc r10,r7,r10
  338. adde r11,r8,r11
  339. addze r9,r0 # c1 restarts as 0 + carry
  340. #sqr_add_c2(a,3,1,c2,c3,c1);
  341. $LD r6,`3*$BNSZ`(r4)
  342. $UMULL r7,r5,r6
  343. $UMULH r8,r5,r6
  344. addc r7,r7,r7
  345. adde r8,r8,r8
  346. addze r9,r9
  347. addc r10,r7,r10
  348. adde r11,r8,r11
  349. addze r9,r9
  350. $ST r10,`4*$BNSZ`(r3) #r[4]=c2
  351. #sqr_add_c2(a,3,2,c3,c1,c2);
  352. $LD r5,`2*$BNSZ`(r4)
  353. $UMULL r7,r5,r6
  354. $UMULH r8,r5,r6
  355. addc r7,r7,r7
  356. adde r8,r8,r8
  357. addze r10,r0 # c2 restarts as 0 + carry
  358. addc r11,r7,r11
  359. adde r9,r8,r9
  360. addze r10,r10
  361. $ST r11,`5*$BNSZ`(r3) #r[5] = c3
  362. #sqr_add_c(a,3,c1,c2,c3);
  363. $UMULL r7,r6,r6
  364. $UMULH r8,r6,r6
  365. addc r9,r7,r9
  366. adde r10,r8,r10 # top word; no further carry is collected
  367. $ST r9,`6*$BNSZ`(r3) #r[6]=c1
  368. $ST r10,`7*$BNSZ`(r3) #r[7]=c2
  369. blr
  370. .long 0
  371. .byte 0,12,0x14,0,0,0,2,0 # NOTE(review): looks like a traceback table; confirm field meanings
  372. .long 0
  373. .size .bn_sqr_comba4,.-.bn_sqr_comba4
  374. #
  375. # NOTE: The following label name should be changed to
  376. # "bn_sqr_comba8" i.e. remove the first dot
  377. # for the gcc compiler. This should be automatically
  378. # done in the build
  379. #
  380. .align 4
  381. .bn_sqr_comba8:
  382. #
  383. # This is an optimized version of the bn_sqr_comba8 routine.
  384. # Tightly uses the adde instruction
  385. # Stores the 16-word square of the 8-word input a[0..7] into r[0..15].
  386. #
  387. # void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
  388. # r3 contains r (16 words are stored through it)
  389. # r4 contains a (8 words are loaded through it)
  390. #
  391. # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
  392. #
  393. # r5,r6 are the two BN_ULONGs being multiplied.
  394. # r7,r8 are the low (UMULL) and high (UMULH) words of each product.
  395. # r9,r10, r11 are the equivalents of c1,c2, c3.
  396. # Each sqr_add_c2 step adds the product (r8,r7) into the accumulator twice.
  397. # Possible optimization of loading all 8 longs of a into registers
  398. # doesn't provide any speedup
  399. #
  400. xor r0,r0,r0 #set r0 = 0.Used in addze
  401. #instructions below.
  402. #sqr_add_c(a,0,c1,c2,c3);
  403. $LD r5,`0*$BNSZ`(r4)
  404. $UMULL r9,r5,r5 #1st iteration: no carries.
  405. $UMULH r10,r5,r5
  406. $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
  407. #sqr_add_c2(a,1,0,c2,c3,c1);
  408. $LD r6,`1*$BNSZ`(r4)
  409. $UMULL r7,r5,r6
  410. $UMULH r8,r5,r6
  411. addc r10,r7,r10 #add the two register number
  412. adde r11,r8,r0 # (r8,r7) to the three register
  413. addze r9,r0 # number (r9,r11,r10).NOTE:r0=0
  414. addc r10,r7,r10 #add the two register number
  415. adde r11,r8,r11 # (r8,r7) to the three register
  416. addze r9,r9 # number (r9,r11,r10).
  417. $ST r10,`1*$BNSZ`(r3) # r[1]=c2
  418. #sqr_add_c(a,1,c3,c1,c2);
  419. $UMULL r7,r6,r6
  420. $UMULH r8,r6,r6
  421. addc r11,r7,r11
  422. adde r9,r8,r9
  423. addze r10,r0 # c2 restarts as 0 + carry (r0 is 0)
  424. #sqr_add_c2(a,2,0,c3,c1,c2);
  425. $LD r6,`2*$BNSZ`(r4)
  426. $UMULL r7,r5,r6
  427. $UMULH r8,r5,r6
  428. addc r11,r7,r11 # first addition of the product
  429. adde r9,r8,r9
  430. addze r10,r10
  431. addc r11,r7,r11 # second addition of the same product
  432. adde r9,r8,r9
  433. addze r10,r10
  434. $ST r11,`2*$BNSZ`(r3) #r[2]=c3
  435. #sqr_add_c2(a,3,0,c1,c2,c3);
  436. $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
  437. $UMULL r7,r5,r6
  438. $UMULH r8,r5,r6
  439. addc r9,r7,r9
  440. adde r10,r8,r10
  441. addze r11,r0
  442. addc r9,r7,r9
  443. adde r10,r8,r10
  444. addze r11,r11
  445. #sqr_add_c2(a,2,1,c1,c2,c3);
  446. $LD r5,`1*$BNSZ`(r4)
  447. $LD r6,`2*$BNSZ`(r4)
  448. $UMULL r7,r5,r6
  449. $UMULH r8,r5,r6
  450. addc r9,r7,r9
  451. adde r10,r8,r10
  452. addze r11,r11
  453. addc r9,r7,r9
  454. adde r10,r8,r10
  455. addze r11,r11
  456. $ST r9,`3*$BNSZ`(r3) #r[3]=c1;
  457. #sqr_add_c(a,2,c2,c3,c1);
  458. $UMULL r7,r6,r6
  459. $UMULH r8,r6,r6
  460. addc r10,r7,r10
  461. adde r11,r8,r11
  462. addze r9,r0
  463. #sqr_add_c2(a,3,1,c2,c3,c1);
  464. $LD r6,`3*$BNSZ`(r4)
  465. $UMULL r7,r5,r6
  466. $UMULH r8,r5,r6
  467. addc r10,r7,r10
  468. adde r11,r8,r11
  469. addze r9,r9
  470. addc r10,r7,r10
  471. adde r11,r8,r11
  472. addze r9,r9
  473. #sqr_add_c2(a,4,0,c2,c3,c1);
  474. $LD r5,`0*$BNSZ`(r4)
  475. $LD r6,`4*$BNSZ`(r4)
  476. $UMULL r7,r5,r6
  477. $UMULH r8,r5,r6
  478. addc r10,r7,r10
  479. adde r11,r8,r11
  480. addze r9,r9
  481. addc r10,r7,r10
  482. adde r11,r8,r11
  483. addze r9,r9
  484. $ST r10,`4*$BNSZ`(r3) #r[4]=c2;
  485. #sqr_add_c2(a,5,0,c3,c1,c2);
  486. $LD r6,`5*$BNSZ`(r4)
  487. $UMULL r7,r5,r6
  488. $UMULH r8,r5,r6
  489. addc r11,r7,r11
  490. adde r9,r8,r9
  491. addze r10,r0
  492. addc r11,r7,r11
  493. adde r9,r8,r9
  494. addze r10,r10
  495. #sqr_add_c2(a,4,1,c3,c1,c2);
  496. $LD r5,`1*$BNSZ`(r4)
  497. $LD r6,`4*$BNSZ`(r4)
  498. $UMULL r7,r5,r6
  499. $UMULH r8,r5,r6
  500. addc r11,r7,r11
  501. adde r9,r8,r9
  502. addze r10,r10
  503. addc r11,r7,r11
  504. adde r9,r8,r9
  505. addze r10,r10
  506. #sqr_add_c2(a,3,2,c3,c1,c2);
  507. $LD r5,`2*$BNSZ`(r4)
  508. $LD r6,`3*$BNSZ`(r4)
  509. $UMULL r7,r5,r6
  510. $UMULH r8,r5,r6
  511. addc r11,r7,r11
  512. adde r9,r8,r9
  513. addze r10,r10
  514. addc r11,r7,r11
  515. adde r9,r8,r9
  516. addze r10,r10
  517. $ST r11,`5*$BNSZ`(r3) #r[5]=c3;
  518. #sqr_add_c(a,3,c1,c2,c3);
  519. $UMULL r7,r6,r6
  520. $UMULH r8,r6,r6
  521. addc r9,r7,r9
  522. adde r10,r8,r10
  523. addze r11,r0
  524. #sqr_add_c2(a,4,2,c1,c2,c3);
  525. $LD r6,`4*$BNSZ`(r4)
  526. $UMULL r7,r5,r6
  527. $UMULH r8,r5,r6
  528. addc r9,r7,r9
  529. adde r10,r8,r10
  530. addze r11,r11
  531. addc r9,r7,r9
  532. adde r10,r8,r10
  533. addze r11,r11
  534. #sqr_add_c2(a,5,1,c1,c2,c3);
  535. $LD r5,`1*$BNSZ`(r4)
  536. $LD r6,`5*$BNSZ`(r4)
  537. $UMULL r7,r5,r6
  538. $UMULH r8,r5,r6
  539. addc r9,r7,r9
  540. adde r10,r8,r10
  541. addze r11,r11
  542. addc r9,r7,r9
  543. adde r10,r8,r10
  544. addze r11,r11
  545. #sqr_add_c2(a,6,0,c1,c2,c3);
  546. $LD r5,`0*$BNSZ`(r4)
  547. $LD r6,`6*$BNSZ`(r4)
  548. $UMULL r7,r5,r6
  549. $UMULH r8,r5,r6
  550. addc r9,r7,r9
  551. adde r10,r8,r10
  552. addze r11,r11
  553. addc r9,r7,r9
  554. adde r10,r8,r10
  555. addze r11,r11
  556. $ST r9,`6*$BNSZ`(r3) #r[6]=c1;
  557. #sqr_add_c2(a,7,0,c2,c3,c1);
  558. $LD r6,`7*$BNSZ`(r4)
  559. $UMULL r7,r5,r6
  560. $UMULH r8,r5,r6
  561. addc r10,r7,r10
  562. adde r11,r8,r11
  563. addze r9,r0
  564. addc r10,r7,r10
  565. adde r11,r8,r11
  566. addze r9,r9
  567. #sqr_add_c2(a,6,1,c2,c3,c1);
  568. $LD r5,`1*$BNSZ`(r4)
  569. $LD r6,`6*$BNSZ`(r4)
  570. $UMULL r7,r5,r6
  571. $UMULH r8,r5,r6
  572. addc r10,r7,r10
  573. adde r11,r8,r11
  574. addze r9,r9
  575. addc r10,r7,r10
  576. adde r11,r8,r11
  577. addze r9,r9
  578. #sqr_add_c2(a,5,2,c2,c3,c1);
  579. $LD r5,`2*$BNSZ`(r4)
  580. $LD r6,`5*$BNSZ`(r4)
  581. $UMULL r7,r5,r6
  582. $UMULH r8,r5,r6
  583. addc r10,r7,r10
  584. adde r11,r8,r11
  585. addze r9,r9
  586. addc r10,r7,r10
  587. adde r11,r8,r11
  588. addze r9,r9
  589. #sqr_add_c2(a,4,3,c2,c3,c1);
  590. $LD r5,`3*$BNSZ`(r4)
  591. $LD r6,`4*$BNSZ`(r4)
  592. $UMULL r7,r5,r6
  593. $UMULH r8,r5,r6
  594. addc r10,r7,r10
  595. adde r11,r8,r11
  596. addze r9,r9
  597. addc r10,r7,r10
  598. adde r11,r8,r11
  599. addze r9,r9
  600. $ST r10,`7*$BNSZ`(r3) #r[7]=c2;
  601. #sqr_add_c(a,4,c3,c1,c2);
  602. $UMULL r7,r6,r6
  603. $UMULH r8,r6,r6
  604. addc r11,r7,r11
  605. adde r9,r8,r9
  606. addze r10,r0
  607. #sqr_add_c2(a,5,3,c3,c1,c2);
  608. $LD r6,`5*$BNSZ`(r4)
  609. $UMULL r7,r5,r6
  610. $UMULH r8,r5,r6
  611. addc r11,r7,r11
  612. adde r9,r8,r9
  613. addze r10,r10
  614. addc r11,r7,r11
  615. adde r9,r8,r9
  616. addze r10,r10
  617. #sqr_add_c2(a,6,2,c3,c1,c2);
  618. $LD r5,`2*$BNSZ`(r4)
  619. $LD r6,`6*$BNSZ`(r4)
  620. $UMULL r7,r5,r6
  621. $UMULH r8,r5,r6
  622. addc r11,r7,r11
  623. adde r9,r8,r9
  624. addze r10,r10
  625. addc r11,r7,r11
  626. adde r9,r8,r9
  627. addze r10,r10
  628. #sqr_add_c2(a,7,1,c3,c1,c2);
  629. $LD r5,`1*$BNSZ`(r4)
  630. $LD r6,`7*$BNSZ`(r4)
  631. $UMULL r7,r5,r6
  632. $UMULH r8,r5,r6
  633. addc r11,r7,r11
  634. adde r9,r8,r9
  635. addze r10,r10
  636. addc r11,r7,r11
  637. adde r9,r8,r9
  638. addze r10,r10
  639. $ST r11,`8*$BNSZ`(r3) #r[8]=c3;
  640. #sqr_add_c2(a,7,2,c1,c2,c3);
  641. $LD r5,`2*$BNSZ`(r4)
  642. $UMULL r7,r5,r6
  643. $UMULH r8,r5,r6
  644. addc r9,r7,r9
  645. adde r10,r8,r10
  646. addze r11,r0
  647. addc r9,r7,r9
  648. adde r10,r8,r10
  649. addze r11,r11
  650. #sqr_add_c2(a,6,3,c1,c2,c3);
  651. $LD r5,`3*$BNSZ`(r4)
  652. $LD r6,`6*$BNSZ`(r4)
  653. $UMULL r7,r5,r6
  654. $UMULH r8,r5,r6
  655. addc r9,r7,r9
  656. adde r10,r8,r10
  657. addze r11,r11
  658. addc r9,r7,r9
  659. adde r10,r8,r10
  660. addze r11,r11
  661. #sqr_add_c2(a,5,4,c1,c2,c3);
  662. $LD r5,`4*$BNSZ`(r4)
  663. $LD r6,`5*$BNSZ`(r4)
  664. $UMULL r7,r5,r6
  665. $UMULH r8,r5,r6
  666. addc r9,r7,r9
  667. adde r10,r8,r10
  668. addze r11,r11
  669. addc r9,r7,r9
  670. adde r10,r8,r10
  671. addze r11,r11
  672. $ST r9,`9*$BNSZ`(r3) #r[9]=c1;
  673. #sqr_add_c(a,5,c2,c3,c1);
  674. $UMULL r7,r6,r6
  675. $UMULH r8,r6,r6
  676. addc r10,r7,r10
  677. adde r11,r8,r11
  678. addze r9,r0
  679. #sqr_add_c2(a,6,4,c2,c3,c1);
  680. $LD r6,`6*$BNSZ`(r4)
  681. $UMULL r7,r5,r6
  682. $UMULH r8,r5,r6
  683. addc r10,r7,r10
  684. adde r11,r8,r11
  685. addze r9,r9
  686. addc r10,r7,r10
  687. adde r11,r8,r11
  688. addze r9,r9
  689. #sqr_add_c2(a,7,3,c2,c3,c1);
  690. $LD r5,`3*$BNSZ`(r4)
  691. $LD r6,`7*$BNSZ`(r4)
  692. $UMULL r7,r5,r6
  693. $UMULH r8,r5,r6
  694. addc r10,r7,r10
  695. adde r11,r8,r11
  696. addze r9,r9
  697. addc r10,r7,r10
  698. adde r11,r8,r11
  699. addze r9,r9
  700. $ST r10,`10*$BNSZ`(r3) #r[10]=c2;
  701. #sqr_add_c2(a,7,4,c3,c1,c2);
  702. $LD r5,`4*$BNSZ`(r4)
  703. $UMULL r7,r5,r6
  704. $UMULH r8,r5,r6
  705. addc r11,r7,r11
  706. adde r9,r8,r9
  707. addze r10,r0
  708. addc r11,r7,r11
  709. adde r9,r8,r9
  710. addze r10,r10
  711. #sqr_add_c2(a,6,5,c3,c1,c2);
  712. $LD r5,`5*$BNSZ`(r4)
  713. $LD r6,`6*$BNSZ`(r4)
  714. $UMULL r7,r5,r6
  715. $UMULH r8,r5,r6
  716. addc r11,r7,r11
  717. adde r9,r8,r9
  718. addze r10,r10
  719. addc r11,r7,r11
  720. adde r9,r8,r9
  721. addze r10,r10
  722. $ST r11,`11*$BNSZ`(r3) #r[11]=c3;
  723. #sqr_add_c(a,6,c1,c2,c3);
  724. $UMULL r7,r6,r6
  725. $UMULH r8,r6,r6
  726. addc r9,r7,r9
  727. adde r10,r8,r10
  728. addze r11,r0
  729. #sqr_add_c2(a,7,5,c1,c2,c3)
  730. $LD r6,`7*$BNSZ`(r4)
  731. $UMULL r7,r5,r6
  732. $UMULH r8,r5,r6
  733. addc r9,r7,r9
  734. adde r10,r8,r10
  735. addze r11,r11
  736. addc r9,r7,r9
  737. adde r10,r8,r10
  738. addze r11,r11
  739. $ST r9,`12*$BNSZ`(r3) #r[12]=c1;
  740. #sqr_add_c2(a,7,6,c2,c3,c1)
  741. $LD r5,`6*$BNSZ`(r4)
  742. $UMULL r7,r5,r6
  743. $UMULH r8,r5,r6
  744. addc r10,r7,r10
  745. adde r11,r8,r11
  746. addze r9,r0
  747. addc r10,r7,r10
  748. adde r11,r8,r11
  749. addze r9,r9
  750. $ST r10,`13*$BNSZ`(r3) #r[13]=c2;
  751. #sqr_add_c(a,7,c3,c1,c2);
  752. $UMULL r7,r6,r6
  753. $UMULH r8,r6,r6
  754. addc r11,r7,r11
  755. adde r9,r8,r9 # top word; no further carry is collected
  756. $ST r11,`14*$BNSZ`(r3) #r[14]=c3;
  757. $ST r9, `15*$BNSZ`(r3) #r[15]=c1;
  758. blr
  759. .long 0
  760. .byte 0,12,0x14,0,0,0,2,0 # NOTE(review): looks like a traceback table; confirm field meanings
  761. .long 0
  762. .size .bn_sqr_comba8,.-.bn_sqr_comba8
  763. #
  764. # NOTE: The following label name should be changed to
  765. # "bn_mul_comba4" i.e. remove the first dot
  766. # for the gcc compiler. This should be automatically
  767. # done in the build
  768. #
  769. .align 4
  770. .bn_mul_comba4:
  771. #
  772. # This is an optimized version of the bn_mul_comba4 routine.
  773. # Stores the 8-word product of a[0..3] and b[0..3] into r[0..7].
  774. # void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  775. # r3 contains r (8 words are stored through it)
  776. # r4 contains a (4 words are loaded through it)
  777. # r5 contains b (4 words are loaded through it)
  778. # r6, r7 are the 2 BN_ULONGs being multiplied.
  779. # r8, r9 are the low (UMULL) and high (UMULH) words of each product.
  780. # r10, r11, r12 are the equivalents of c1, c2, and c3.
  781. # c1,c2,c3 rotate roles after every stored word (see the step comments).
  782. xor r0,r0,r0 #r0=0. Used in addze below.
  783. #mul_add_c(a[0],b[0],c1,c2,c3);
  784. $LD r6,`0*$BNSZ`(r4)
  785. $LD r7,`0*$BNSZ`(r5)
  786. $UMULL r10,r6,r7
  787. $UMULH r11,r6,r7
  788. $ST r10,`0*$BNSZ`(r3) #r[0]=c1
  789. #mul_add_c(a[0],b[1],c2,c3,c1);
  790. $LD r7,`1*$BNSZ`(r5)
  791. $UMULL r8,r6,r7
  792. $UMULH r9,r6,r7
  793. addc r11,r8,r11
  794. adde r12,r9,r0 # first write of c3: high word + carry (r0 is 0)
  795. addze r10,r0 # c1 restarts as 0 + carry
  796. #mul_add_c(a[1],b[0],c2,c3,c1);
  797. $LD r6, `1*$BNSZ`(r4)
  798. $LD r7, `0*$BNSZ`(r5)
  799. $UMULL r8,r6,r7
  800. $UMULH r9,r6,r7
  801. addc r11,r8,r11
  802. adde r12,r9,r12
  803. addze r10,r10
  804. $ST r11,`1*$BNSZ`(r3) #r[1]=c2
  805. #mul_add_c(a[2],b[0],c3,c1,c2);
  806. $LD r6,`2*$BNSZ`(r4) # r7 still holds b[0]
  807. $UMULL r8,r6,r7
  808. $UMULH r9,r6,r7
  809. addc r12,r8,r12
  810. adde r10,r9,r10
  811. addze r11,r0
  812. #mul_add_c(a[1],b[1],c3,c1,c2);
  813. $LD r6,`1*$BNSZ`(r4)
  814. $LD r7,`1*$BNSZ`(r5)
  815. $UMULL r8,r6,r7
  816. $UMULH r9,r6,r7
  817. addc r12,r8,r12
  818. adde r10,r9,r10
  819. addze r11,r11
  820. #mul_add_c(a[0],b[2],c3,c1,c2);
  821. $LD r6,`0*$BNSZ`(r4)
  822. $LD r7,`2*$BNSZ`(r5)
  823. $UMULL r8,r6,r7
  824. $UMULH r9,r6,r7
  825. addc r12,r8,r12
  826. adde r10,r9,r10
  827. addze r11,r11
  828. $ST r12,`2*$BNSZ`(r3) #r[2]=c3
  829. #mul_add_c(a[0],b[3],c1,c2,c3);
  830. $LD r7,`3*$BNSZ`(r5) # r6 still holds a[0]
  831. $UMULL r8,r6,r7
  832. $UMULH r9,r6,r7
  833. addc r10,r8,r10
  834. adde r11,r9,r11
  835. addze r12,r0
  836. #mul_add_c(a[1],b[2],c1,c2,c3);
  837. $LD r6,`1*$BNSZ`(r4)
  838. $LD r7,`2*$BNSZ`(r5)
  839. $UMULL r8,r6,r7
  840. $UMULH r9,r6,r7
  841. addc r10,r8,r10
  842. adde r11,r9,r11
  843. addze r12,r12
  844. #mul_add_c(a[2],b[1],c1,c2,c3);
  845. $LD r6,`2*$BNSZ`(r4)
  846. $LD r7,`1*$BNSZ`(r5)
  847. $UMULL r8,r6,r7
  848. $UMULH r9,r6,r7
  849. addc r10,r8,r10
  850. adde r11,r9,r11
  851. addze r12,r12
  852. #mul_add_c(a[3],b[0],c1,c2,c3);
  853. $LD r6,`3*$BNSZ`(r4)
  854. $LD r7,`0*$BNSZ`(r5)
  855. $UMULL r8,r6,r7
  856. $UMULH r9,r6,r7
  857. addc r10,r8,r10
  858. adde r11,r9,r11
  859. addze r12,r12
  860. $ST r10,`3*$BNSZ`(r3) #r[3]=c1
  861. #mul_add_c(a[3],b[1],c2,c3,c1);
  862. $LD r7,`1*$BNSZ`(r5) # r6 still holds a[3]
  863. $UMULL r8,r6,r7
  864. $UMULH r9,r6,r7
  865. addc r11,r8,r11
  866. adde r12,r9,r12
  867. addze r10,r0
  868. #mul_add_c(a[2],b[2],c2,c3,c1);
  869. $LD r6,`2*$BNSZ`(r4)
  870. $LD r7,`2*$BNSZ`(r5)
  871. $UMULL r8,r6,r7
  872. $UMULH r9,r6,r7
  873. addc r11,r8,r11
  874. adde r12,r9,r12
  875. addze r10,r10
  876. #mul_add_c(a[1],b[3],c2,c3,c1);
  877. $LD r6,`1*$BNSZ`(r4)
  878. $LD r7,`3*$BNSZ`(r5)
  879. $UMULL r8,r6,r7
  880. $UMULH r9,r6,r7
  881. addc r11,r8,r11
  882. adde r12,r9,r12
  883. addze r10,r10
  884. $ST r11,`4*$BNSZ`(r3) #r[4]=c2
  885. #mul_add_c(a[2],b[3],c3,c1,c2);
  886. $LD r6,`2*$BNSZ`(r4) # r7 still holds b[3]
  887. $UMULL r8,r6,r7
  888. $UMULH r9,r6,r7
  889. addc r12,r8,r12
  890. adde r10,r9,r10
  891. addze r11,r0
  892. #mul_add_c(a[3],b[2],c3,c1,c2);
  893. $LD r6,`3*$BNSZ`(r4)
  894. $LD r7,`2*$BNSZ`(r5)
  895. $UMULL r8,r6,r7
  896. $UMULH r9,r6,r7
  897. addc r12,r8,r12
  898. adde r10,r9,r10
  899. addze r11,r11
  900. $ST r12,`5*$BNSZ`(r3) #r[5]=c3
  901. #mul_add_c(a[3],b[3],c1,c2,c3);
  902. $LD r7,`3*$BNSZ`(r5) # r6 still holds a[3]
  903. $UMULL r8,r6,r7
  904. $UMULH r9,r6,r7
  905. addc r10,r8,r10
  906. adde r11,r9,r11 # top word; no further carry is collected
  907. $ST r10,`6*$BNSZ`(r3) #r[6]=c1
  908. $ST r11,`7*$BNSZ`(r3) #r[7]=c2
  909. blr
  910. .long 0
  911. .byte 0,12,0x14,0,0,0,3,0 # NOTE(review): looks like a traceback table; confirm field meanings
  912. .long 0
  913. .size .bn_mul_comba4,.-.bn_mul_comba4
  914. #
  915. # NOTE: The following label name should be changed to
  916. # "bn_mul_comba8" i.e. remove the first dot
  917. # for the gcc compiler. This should be automatically
  918. # done in the build
  919. #
  920. .align 4
  921. .bn_mul_comba8:
  922. #
  923. # Optimized version of the bn_mul_comba8 routine.
  924. #
  925. # void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  926. # r3 contains r
  927. # r4 contains a
  928. # r5 contains b
  929. # r6, r7 are the 2 BN_ULONGs being multiplied.
  930. # r8, r9 are the results of the 32x32 giving 64 multiply.
  931. # r10, r11, r12 are the equivalents of c1, c2, and c3.
  932. #
  933. xor r0,r0,r0 #r0=0. Used in addze below.
  934. #mul_add_c(a[0],b[0],c1,c2,c3);
  935. $LD r6,`0*$BNSZ`(r4) #a[0]
  936. $LD r7,`0*$BNSZ`(r5) #b[0]
  937. $UMULL r10,r6,r7
  938. $UMULH r11,r6,r7
  939. $ST r10,`0*$BNSZ`(r3) #r[0]=c1;
  940. #mul_add_c(a[0],b[1],c2,c3,c1);
  941. $LD r7,`1*$BNSZ`(r5)
  942. $UMULL r8,r6,r7
  943. $UMULH r9,r6,r7
  944. addc r11,r11,r8
  945. addze r12,r9 # since we didn't set r12 to zero before.
  946. addze r10,r0
  947. #mul_add_c(a[1],b[0],c2,c3,c1);
  948. $LD r6,`1*$BNSZ`(r4)
  949. $LD r7,`0*$BNSZ`(r5)
  950. $UMULL r8,r6,r7
  951. $UMULH r9,r6,r7
  952. addc r11,r11,r8
  953. adde r12,r12,r9
  954. addze r10,r10
  955. $ST r11,`1*$BNSZ`(r3) #r[1]=c2;
  956. #mul_add_c(a[2],b[0],c3,c1,c2);
  957. $LD r6,`2*$BNSZ`(r4)
  958. $UMULL r8,r6,r7
  959. $UMULH r9,r6,r7
  960. addc r12,r12,r8
  961. adde r10,r10,r9
  962. addze r11,r0
  963. #mul_add_c(a[1],b[1],c3,c1,c2);
  964. $LD r6,`1*$BNSZ`(r4)
  965. $LD r7,`1*$BNSZ`(r5)
  966. $UMULL r8,r6,r7
  967. $UMULH r9,r6,r7
  968. addc r12,r12,r8
  969. adde r10,r10,r9
  970. addze r11,r11
  971. #mul_add_c(a[0],b[2],c3,c1,c2);
  972. $LD r6,`0*$BNSZ`(r4)
  973. $LD r7,`2*$BNSZ`(r5)
  974. $UMULL r8,r6,r7
  975. $UMULH r9,r6,r7
  976. addc r12,r12,r8
  977. adde r10,r10,r9
  978. addze r11,r11
  979. $ST r12,`2*$BNSZ`(r3) #r[2]=c3;
  980. #mul_add_c(a[0],b[3],c1,c2,c3);
  981. $LD r7,`3*$BNSZ`(r5)
  982. $UMULL r8,r6,r7
  983. $UMULH r9,r6,r7
  984. addc r10,r10,r8
  985. adde r11,r11,r9
  986. addze r12,r0
  987. #mul_add_c(a[1],b[2],c1,c2,c3);
  988. $LD r6,`1*$BNSZ`(r4)
  989. $LD r7,`2*$BNSZ`(r5)
  990. $UMULL r8,r6,r7
  991. $UMULH r9,r6,r7
  992. addc r10,r10,r8
  993. adde r11,r11,r9
  994. addze r12,r12
  995. #mul_add_c(a[2],b[1],c1,c2,c3);
  996. $LD r6,`2*$BNSZ`(r4)
  997. $LD r7,`1*$BNSZ`(r5)
  998. $UMULL r8,r6,r7
  999. $UMULH r9,r6,r7
  1000. addc r10,r10,r8
  1001. adde r11,r11,r9
  1002. addze r12,r12
  1003. #mul_add_c(a[3],b[0],c1,c2,c3);
  1004. $LD r6,`3*$BNSZ`(r4)
  1005. $LD r7,`0*$BNSZ`(r5)
  1006. $UMULL r8,r6,r7
  1007. $UMULH r9,r6,r7
  1008. addc r10,r10,r8
  1009. adde r11,r11,r9
  1010. addze r12,r12
  1011. $ST r10,`3*$BNSZ`(r3) #r[3]=c1;
  1012. #mul_add_c(a[4],b[0],c2,c3,c1);
  1013. $LD r6,`4*$BNSZ`(r4)
  1014. $UMULL r8,r6,r7
  1015. $UMULH r9,r6,r7
  1016. addc r11,r11,r8
  1017. adde r12,r12,r9
  1018. addze r10,r0
  1019. #mul_add_c(a[3],b[1],c2,c3,c1);
  1020. $LD r6,`3*$BNSZ`(r4)
  1021. $LD r7,`1*$BNSZ`(r5)
  1022. $UMULL r8,r6,r7
  1023. $UMULH r9,r6,r7
  1024. addc r11,r11,r8
  1025. adde r12,r12,r9
  1026. addze r10,r10
  1027. #mul_add_c(a[2],b[2],c2,c3,c1);
  1028. $LD r6,`2*$BNSZ`(r4)
  1029. $LD r7,`2*$BNSZ`(r5)
  1030. $UMULL r8,r6,r7
  1031. $UMULH r9,r6,r7
  1032. addc r11,r11,r8
  1033. adde r12,r12,r9
  1034. addze r10,r10
  1035. #mul_add_c(a[1],b[3],c2,c3,c1);
  1036. $LD r6,`1*$BNSZ`(r4)
  1037. $LD r7,`3*$BNSZ`(r5)
  1038. $UMULL r8,r6,r7
  1039. $UMULH r9,r6,r7
  1040. addc r11,r11,r8
  1041. adde r12,r12,r9
  1042. addze r10,r10
  1043. #mul_add_c(a[0],b[4],c2,c3,c1);
  1044. $LD r6,`0*$BNSZ`(r4)
  1045. $LD r7,`4*$BNSZ`(r5)
  1046. $UMULL r8,r6,r7
  1047. $UMULH r9,r6,r7
  1048. addc r11,r11,r8
  1049. adde r12,r12,r9
  1050. addze r10,r10
  1051. $ST r11,`4*$BNSZ`(r3) #r[4]=c2;
  1052. #mul_add_c(a[0],b[5],c3,c1,c2);
  1053. $LD r7,`5*$BNSZ`(r5)
  1054. $UMULL r8,r6,r7
  1055. $UMULH r9,r6,r7
  1056. addc r12,r12,r8
  1057. adde r10,r10,r9
  1058. addze r11,r0
  1059. #mul_add_c(a[1],b[4],c3,c1,c2);
  1060. $LD r6,`1*$BNSZ`(r4)
  1061. $LD r7,`4*$BNSZ`(r5)
  1062. $UMULL r8,r6,r7
  1063. $UMULH r9,r6,r7
  1064. addc r12,r12,r8
  1065. adde r10,r10,r9
  1066. addze r11,r11
  1067. #mul_add_c(a[2],b[3],c3,c1,c2);
  1068. $LD r6,`2*$BNSZ`(r4)
  1069. $LD r7,`3*$BNSZ`(r5)
  1070. $UMULL r8,r6,r7
  1071. $UMULH r9,r6,r7
  1072. addc r12,r12,r8
  1073. adde r10,r10,r9
  1074. addze r11,r11
  1075. #mul_add_c(a[3],b[2],c3,c1,c2);
  1076. $LD r6,`3*$BNSZ`(r4)
  1077. $LD r7,`2*$BNSZ`(r5)
  1078. $UMULL r8,r6,r7
  1079. $UMULH r9,r6,r7
  1080. addc r12,r12,r8
  1081. adde r10,r10,r9
  1082. addze r11,r11
  1083. #mul_add_c(a[4],b[1],c3,c1,c2);
  1084. $LD r6,`4*$BNSZ`(r4)
  1085. $LD r7,`1*$BNSZ`(r5)
  1086. $UMULL r8,r6,r7
  1087. $UMULH r9,r6,r7
  1088. addc r12,r12,r8
  1089. adde r10,r10,r9
  1090. addze r11,r11
  1091. #mul_add_c(a[5],b[0],c3,c1,c2);
  1092. $LD r6,`5*$BNSZ`(r4)
  1093. $LD r7,`0*$BNSZ`(r5)
  1094. $UMULL r8,r6,r7
  1095. $UMULH r9,r6,r7
  1096. addc r12,r12,r8
  1097. adde r10,r10,r9
  1098. addze r11,r11
  1099. $ST r12,`5*$BNSZ`(r3) #r[5]=c3;
  1100. #mul_add_c(a[6],b[0],c1,c2,c3);
  1101. $LD r6,`6*$BNSZ`(r4)
  1102. $UMULL r8,r6,r7
  1103. $UMULH r9,r6,r7
  1104. addc r10,r10,r8
  1105. adde r11,r11,r9
  1106. addze r12,r0
  1107. #mul_add_c(a[5],b[1],c1,c2,c3);
  1108. $LD r6,`5*$BNSZ`(r4)
  1109. $LD r7,`1*$BNSZ`(r5)
  1110. $UMULL r8,r6,r7
  1111. $UMULH r9,r6,r7
  1112. addc r10,r10,r8
  1113. adde r11,r11,r9
  1114. addze r12,r12
  1115. #mul_add_c(a[4],b[2],c1,c2,c3);
  1116. $LD r6,`4*$BNSZ`(r4)
  1117. $LD r7,`2*$BNSZ`(r5)
  1118. $UMULL r8,r6,r7
  1119. $UMULH r9,r6,r7
  1120. addc r10,r10,r8
  1121. adde r11,r11,r9
  1122. addze r12,r12
  1123. #mul_add_c(a[3],b[3],c1,c2,c3);
  1124. $LD r6,`3*$BNSZ`(r4)
  1125. $LD r7,`3*$BNSZ`(r5)
  1126. $UMULL r8,r6,r7
  1127. $UMULH r9,r6,r7
  1128. addc r10,r10,r8
  1129. adde r11,r11,r9
  1130. addze r12,r12
  1131. #mul_add_c(a[2],b[4],c1,c2,c3);
  1132. $LD r6,`2*$BNSZ`(r4)
  1133. $LD r7,`4*$BNSZ`(r5)
  1134. $UMULL r8,r6,r7
  1135. $UMULH r9,r6,r7
  1136. addc r10,r10,r8
  1137. adde r11,r11,r9
  1138. addze r12,r12
  1139. #mul_add_c(a[1],b[5],c1,c2,c3);
  1140. $LD r6,`1*$BNSZ`(r4)
  1141. $LD r7,`5*$BNSZ`(r5)
  1142. $UMULL r8,r6,r7
  1143. $UMULH r9,r6,r7
  1144. addc r10,r10,r8
  1145. adde r11,r11,r9
  1146. addze r12,r12
  1147. #mul_add_c(a[0],b[6],c1,c2,c3);
  1148. $LD r6,`0*$BNSZ`(r4)
  1149. $LD r7,`6*$BNSZ`(r5)
  1150. $UMULL r8,r6,r7
  1151. $UMULH r9,r6,r7
  1152. addc r10,r10,r8
  1153. adde r11,r11,r9
  1154. addze r12,r12
  1155. $ST r10,`6*$BNSZ`(r3) #r[6]=c1;
  1156. #mul_add_c(a[0],b[7],c2,c3,c1);
  1157. $LD r7,`7*$BNSZ`(r5)
  1158. $UMULL r8,r6,r7
  1159. $UMULH r9,r6,r7
  1160. addc r11,r11,r8
  1161. adde r12,r12,r9
  1162. addze r10,r0
  1163. #mul_add_c(a[1],b[6],c2,c3,c1);
  1164. $LD r6,`1*$BNSZ`(r4)
  1165. $LD r7,`6*$BNSZ`(r5)
  1166. $UMULL r8,r6,r7
  1167. $UMULH r9,r6,r7
  1168. addc r11,r11,r8
  1169. adde r12,r12,r9
  1170. addze r10,r10
  1171. #mul_add_c(a[2],b[5],c2,c3,c1);
  1172. $LD r6,`2*$BNSZ`(r4)
  1173. $LD r7,`5*$BNSZ`(r5)
  1174. $UMULL r8,r6,r7
  1175. $UMULH r9,r6,r7
  1176. addc r11,r11,r8
  1177. adde r12,r12,r9
  1178. addze r10,r10
  1179. #mul_add_c(a[3],b[4],c2,c3,c1);
  1180. $LD r6,`3*$BNSZ`(r4)
  1181. $LD r7,`4*$BNSZ`(r5)
  1182. $UMULL r8,r6,r7
  1183. $UMULH r9,r6,r7
  1184. addc r11,r11,r8
  1185. adde r12,r12,r9
  1186. addze r10,r10
  1187. #mul_add_c(a[4],b[3],c2,c3,c1);
  1188. $LD r6,`4*$BNSZ`(r4)
  1189. $LD r7,`3*$BNSZ`(r5)
  1190. $UMULL r8,r6,r7
  1191. $UMULH r9,r6,r7
  1192. addc r11,r11,r8
  1193. adde r12,r12,r9
  1194. addze r10,r10
  1195. #mul_add_c(a[5],b[2],c2,c3,c1);
  1196. $LD r6,`5*$BNSZ`(r4)
  1197. $LD r7,`2*$BNSZ`(r5)
  1198. $UMULL r8,r6,r7
  1199. $UMULH r9,r6,r7
  1200. addc r11,r11,r8
  1201. adde r12,r12,r9
  1202. addze r10,r10
  1203. #mul_add_c(a[6],b[1],c2,c3,c1);
  1204. $LD r6,`6*$BNSZ`(r4)
  1205. $LD r7,`1*$BNSZ`(r5)
  1206. $UMULL r8,r6,r7
  1207. $UMULH r9,r6,r7
  1208. addc r11,r11,r8
  1209. adde r12,r12,r9
  1210. addze r10,r10
  1211. #mul_add_c(a[7],b[0],c2,c3,c1);
  1212. $LD r6,`7*$BNSZ`(r4)
  1213. $LD r7,`0*$BNSZ`(r5)
  1214. $UMULL r8,r6,r7
  1215. $UMULH r9,r6,r7
  1216. addc r11,r11,r8
  1217. adde r12,r12,r9
  1218. addze r10,r10
  1219. $ST r11,`7*$BNSZ`(r3) #r[7]=c2;
  1220. #mul_add_c(a[7],b[1],c3,c1,c2);
  1221. $LD r7,`1*$BNSZ`(r5)
  1222. $UMULL r8,r6,r7
  1223. $UMULH r9,r6,r7
  1224. addc r12,r12,r8
  1225. adde r10,r10,r9
  1226. addze r11,r0
  1227. #mul_add_c(a[6],b[2],c3,c1,c2);
  1228. $LD r6,`6*$BNSZ`(r4)
  1229. $LD r7,`2*$BNSZ`(r5)
  1230. $UMULL r8,r6,r7
  1231. $UMULH r9,r6,r7
  1232. addc r12,r12,r8
  1233. adde r10,r10,r9
  1234. addze r11,r11
  1235. #mul_add_c(a[5],b[3],c3,c1,c2);
  1236. $LD r6,`5*$BNSZ`(r4)
  1237. $LD r7,`3*$BNSZ`(r5)
  1238. $UMULL r8,r6,r7
  1239. $UMULH r9,r6,r7
  1240. addc r12,r12,r8
  1241. adde r10,r10,r9
  1242. addze r11,r11
  1243. #mul_add_c(a[4],b[4],c3,c1,c2);
  1244. $LD r6,`4*$BNSZ`(r4)
  1245. $LD r7,`4*$BNSZ`(r5)
  1246. $UMULL r8,r6,r7
  1247. $UMULH r9,r6,r7
  1248. addc r12,r12,r8
  1249. adde r10,r10,r9
  1250. addze r11,r11
  1251. #mul_add_c(a[3],b[5],c3,c1,c2);
  1252. $LD r6,`3*$BNSZ`(r4)
  1253. $LD r7,`5*$BNSZ`(r5)
  1254. $UMULL r8,r6,r7
  1255. $UMULH r9,r6,r7
  1256. addc r12,r12,r8
  1257. adde r10,r10,r9
  1258. addze r11,r11
  1259. #mul_add_c(a[2],b[6],c3,c1,c2);
  1260. $LD r6,`2*$BNSZ`(r4)
  1261. $LD r7,`6*$BNSZ`(r5)
  1262. $UMULL r8,r6,r7
  1263. $UMULH r9,r6,r7
  1264. addc r12,r12,r8
  1265. adde r10,r10,r9
  1266. addze r11,r11
  1267. #mul_add_c(a[1],b[7],c3,c1,c2);
  1268. $LD r6,`1*$BNSZ`(r4)
  1269. $LD r7,`7*$BNSZ`(r5)
  1270. $UMULL r8,r6,r7
  1271. $UMULH r9,r6,r7
  1272. addc r12,r12,r8
  1273. adde r10,r10,r9
  1274. addze r11,r11
  1275. $ST r12,`8*$BNSZ`(r3) #r[8]=c3;
  1276. #mul_add_c(a[2],b[7],c1,c2,c3);
  1277. $LD r6,`2*$BNSZ`(r4)
  1278. $UMULL r8,r6,r7
  1279. $UMULH r9,r6,r7
  1280. addc r10,r10,r8
  1281. adde r11,r11,r9
  1282. addze r12,r0
  1283. #mul_add_c(a[3],b[6],c1,c2,c3);
  1284. $LD r6,`3*$BNSZ`(r4)
  1285. $LD r7,`6*$BNSZ`(r5)
  1286. $UMULL r8,r6,r7
  1287. $UMULH r9,r6,r7
  1288. addc r10,r10,r8
  1289. adde r11,r11,r9
  1290. addze r12,r12
  1291. #mul_add_c(a[4],b[5],c1,c2,c3);
  1292. $LD r6,`4*$BNSZ`(r4)
  1293. $LD r7,`5*$BNSZ`(r5)
  1294. $UMULL r8,r6,r7
  1295. $UMULH r9,r6,r7
  1296. addc r10,r10,r8
  1297. adde r11,r11,r9
  1298. addze r12,r12
  1299. #mul_add_c(a[5],b[4],c1,c2,c3);
  1300. $LD r6,`5*$BNSZ`(r4)
  1301. $LD r7,`4*$BNSZ`(r5)
  1302. $UMULL r8,r6,r7
  1303. $UMULH r9,r6,r7
  1304. addc r10,r10,r8
  1305. adde r11,r11,r9
  1306. addze r12,r12
  1307. #mul_add_c(a[6],b[3],c1,c2,c3);
  1308. $LD r6,`6*$BNSZ`(r4)
  1309. $LD r7,`3*$BNSZ`(r5)
  1310. $UMULL r8,r6,r7
  1311. $UMULH r9,r6,r7
  1312. addc r10,r10,r8
  1313. adde r11,r11,r9
  1314. addze r12,r12
  1315. #mul_add_c(a[7],b[2],c1,c2,c3);
  1316. $LD r6,`7*$BNSZ`(r4)
  1317. $LD r7,`2*$BNSZ`(r5)
  1318. $UMULL r8,r6,r7
  1319. $UMULH r9,r6,r7
  1320. addc r10,r10,r8
  1321. adde r11,r11,r9
  1322. addze r12,r12
  1323. $ST r10,`9*$BNSZ`(r3) #r[9]=c1;
  1324. #mul_add_c(a[7],b[3],c2,c3,c1);
  1325. $LD r7,`3*$BNSZ`(r5)
  1326. $UMULL r8,r6,r7
  1327. $UMULH r9,r6,r7
  1328. addc r11,r11,r8
  1329. adde r12,r12,r9
  1330. addze r10,r0
  1331. #mul_add_c(a[6],b[4],c2,c3,c1);
  1332. $LD r6,`6*$BNSZ`(r4)
  1333. $LD r7,`4*$BNSZ`(r5)
  1334. $UMULL r8,r6,r7
  1335. $UMULH r9,r6,r7
  1336. addc r11,r11,r8
  1337. adde r12,r12,r9
  1338. addze r10,r10
  1339. #mul_add_c(a[5],b[5],c2,c3,c1);
  1340. $LD r6,`5*$BNSZ`(r4)
  1341. $LD r7,`5*$BNSZ`(r5)
  1342. $UMULL r8,r6,r7
  1343. $UMULH r9,r6,r7
  1344. addc r11,r11,r8
  1345. adde r12,r12,r9
  1346. addze r10,r10
  1347. #mul_add_c(a[4],b[6],c2,c3,c1);
  1348. $LD r6,`4*$BNSZ`(r4)
  1349. $LD r7,`6*$BNSZ`(r5)
  1350. $UMULL r8,r6,r7
  1351. $UMULH r9,r6,r7
  1352. addc r11,r11,r8
  1353. adde r12,r12,r9
  1354. addze r10,r10
  1355. #mul_add_c(a[3],b[7],c2,c3,c1);
  1356. $LD r6,`3*$BNSZ`(r4)
  1357. $LD r7,`7*$BNSZ`(r5)
  1358. $UMULL r8,r6,r7
  1359. $UMULH r9,r6,r7
  1360. addc r11,r11,r8
  1361. adde r12,r12,r9
  1362. addze r10,r10
  1363. $ST r11,`10*$BNSZ`(r3) #r[10]=c2;
  1364. #mul_add_c(a[4],b[7],c3,c1,c2);
  1365. $LD r6,`4*$BNSZ`(r4)
  1366. $UMULL r8,r6,r7
  1367. $UMULH r9,r6,r7
  1368. addc r12,r12,r8
  1369. adde r10,r10,r9
  1370. addze r11,r0
  1371. #mul_add_c(a[5],b[6],c3,c1,c2);
  1372. $LD r6,`5*$BNSZ`(r4)
  1373. $LD r7,`6*$BNSZ`(r5)
  1374. $UMULL r8,r6,r7
  1375. $UMULH r9,r6,r7
  1376. addc r12,r12,r8
  1377. adde r10,r10,r9
  1378. addze r11,r11
  1379. #mul_add_c(a[6],b[5],c3,c1,c2);
  1380. $LD r6,`6*$BNSZ`(r4)
  1381. $LD r7,`5*$BNSZ`(r5)
  1382. $UMULL r8,r6,r7
  1383. $UMULH r9,r6,r7
  1384. addc r12,r12,r8
  1385. adde r10,r10,r9
  1386. addze r11,r11
  1387. #mul_add_c(a[7],b[4],c3,c1,c2);
  1388. $LD r6,`7*$BNSZ`(r4)
  1389. $LD r7,`4*$BNSZ`(r5)
  1390. $UMULL r8,r6,r7
  1391. $UMULH r9,r6,r7
  1392. addc r12,r12,r8
  1393. adde r10,r10,r9
  1394. addze r11,r11
  1395. $ST r12,`11*$BNSZ`(r3) #r[11]=c3;
  1396. #mul_add_c(a[7],b[5],c1,c2,c3);
  1397. $LD r7,`5*$BNSZ`(r5)
  1398. $UMULL r8,r6,r7
  1399. $UMULH r9,r6,r7
  1400. addc r10,r10,r8
  1401. adde r11,r11,r9
  1402. addze r12,r0
  1403. #mul_add_c(a[6],b[6],c1,c2,c3);
  1404. $LD r6,`6*$BNSZ`(r4)
  1405. $LD r7,`6*$BNSZ`(r5)
  1406. $UMULL r8,r6,r7
  1407. $UMULH r9,r6,r7
  1408. addc r10,r10,r8
  1409. adde r11,r11,r9
  1410. addze r12,r12
  1411. #mul_add_c(a[5],b[7],c1,c2,c3);
  1412. $LD r6,`5*$BNSZ`(r4)
  1413. $LD r7,`7*$BNSZ`(r5)
  1414. $UMULL r8,r6,r7
  1415. $UMULH r9,r6,r7
  1416. addc r10,r10,r8
  1417. adde r11,r11,r9
  1418. addze r12,r12
  1419. $ST r10,`12*$BNSZ`(r3) #r[12]=c1;
  1420. #mul_add_c(a[6],b[7],c2,c3,c1);
  1421. $LD r6,`6*$BNSZ`(r4)
  1422. $UMULL r8,r6,r7
  1423. $UMULH r9,r6,r7
  1424. addc r11,r11,r8
  1425. adde r12,r12,r9
  1426. addze r10,r0
  1427. #mul_add_c(a[7],b[6],c2,c3,c1);
  1428. $LD r6,`7*$BNSZ`(r4)
  1429. $LD r7,`6*$BNSZ`(r5)
  1430. $UMULL r8,r6,r7
  1431. $UMULH r9,r6,r7
  1432. addc r11,r11,r8
  1433. adde r12,r12,r9
  1434. addze r10,r10
  1435. $ST r11,`13*$BNSZ`(r3) #r[13]=c2;
  1436. #mul_add_c(a[7],b[7],c3,c1,c2);
  1437. $LD r7,`7*$BNSZ`(r5)
  1438. $UMULL r8,r6,r7
  1439. $UMULH r9,r6,r7
  1440. addc r12,r12,r8
  1441. adde r10,r10,r9
  1442. $ST r12,`14*$BNSZ`(r3) #r[14]=c3;
  1443. $ST r10,`15*$BNSZ`(r3) #r[15]=c1;
  1444. blr
  1445. .long 0
  1446. .byte 0,12,0x14,0,0,0,3,0
  1447. .long 0
  1448. .size .bn_mul_comba8,.-.bn_mul_comba8
  1449. #
  1450. # NOTE: The following label name should be changed to
  1451. # "bn_sub_words" i.e. remove the first dot
  1452. # for the gcc compiler. This should be automatically
  1453. # done in the build
  1454. #
  1455. #
  1456. .align 4
  1457. .bn_sub_words:
  1458. #
  1459. # Handcoded version of bn_sub_words: r[i] = a[i] - b[i] - borrow for i = 0..n-1.
  1460. # Returns the final borrow (0 or 1) in r3; for n = 0 nothing is stored and 0 is returned.
  1461. #BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
  1462. #
  1463. # r3 = r
  1464. # r4 = a
  1465. # r5 = b
  1466. # r6 = n
  1467. #
  1468. # Note: No loop unrolling done since this is not a performance
  1469. # critical loop.
  1470. xor r0,r0,r0 #set r0 = 0
  1471. #
  1472. # check for r6 = 0 AND set carry bit (carry set means "no borrow").
  1473. #
  1474. subfc. r7,r0,r6 # r7 = r6 - 0. If r6 is 0 then result is 0.
  1475. # if r6 > 0 then result !=0
  1476. # In either case carry bit is set.
  1477. beq Lppcasm_sub_adios
  1478. addi r4,r4,-$BNSZ #bias a, r and b back one word: every access in the
  1479. addi r3,r3,-$BNSZ #loop uses a one-word displacement (LDU/STU are
  1480. addi r5,r5,-$BNSZ #presumably the load/store-with-update forms)
  1481. mtctr r6 #ctr = n
  1482. Lppcasm_sub_mainloop:
  1483. $LDU r7,$BNSZ(r4)
  1484. $LDU r8,$BNSZ(r5)
  1485. subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8)
  1486. # if carry = 1 this is r7-r8. Else it
  1487. # is r7-r8 -1 as we need.
  1488. $STU r6,$BNSZ(r3)
  1489. bdnz Lppcasm_sub_mainloop
  1490. Lppcasm_sub_adios:
  1491. subfze r3,r0 # if carry bit is set then r3 = 0 else -1
  1492. andi. r3,r3,1 # keep only last bit: 1 = borrow out, 0 = none.
  1493. blr
  1494. .long 0
  1495. .byte 0,12,0x14,0,0,0,4,0
  1496. .long 0
  1497. .size .bn_sub_words,.-.bn_sub_words
  1498. #
  1499. # NOTE: The following label name should be changed to
  1500. # "bn_add_words" i.e. remove the first dot
  1501. # for the gcc compiler. This should be automatically
  1502. # done in the build
  1503. #
  1504. .align 4
  1505. .bn_add_words:
  1506. #
  1507. # Handcoded version of bn_add_words: r[i] = a[i] + b[i] + carry for i = 0..n-1.
  1508. # Returns the final carry (0 or 1) in r3; for n = 0 nothing is stored and 0 is returned.
  1509. #BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
  1510. #
  1511. # r3 = r
  1512. # r4 = a
  1513. # r5 = b
  1514. # r6 = n
  1515. #
  1516. # Note: No loop unrolling done since this is not a performance
  1517. # critical loop.
  1518. xor r0,r0,r0
  1519. #
  1520. # check for r6 = 0. It is needed: the loop body runs once before
  1521. # bdnz tests the counter, so n = 0 has to branch around the loop.
  1522. addic. r6,r6,0 #test r6 and clear carry bit.
  1523. beq Lppcasm_add_adios
  1524. addi r4,r4,-$BNSZ #bias a, r and b back one word: every access in the
  1525. addi r3,r3,-$BNSZ #loop uses a one-word displacement (LDU/STU are
  1526. addi r5,r5,-$BNSZ #presumably the load/store-with-update forms)
  1527. mtctr r6 #ctr = n
  1528. Lppcasm_add_mainloop:
  1529. $LDU r7,$BNSZ(r4)
  1530. $LDU r8,$BNSZ(r5)
  1531. adde r8,r7,r8 #r8 = a[i] + b[i] + carry in; carry out feeds next pass
  1532. $STU r8,$BNSZ(r3)
  1533. bdnz Lppcasm_add_mainloop
  1534. Lppcasm_add_adios:
  1535. addze r3,r0 #return carry bit (r0 is 0, so r3 = carry).
  1536. blr
  1537. .long 0
  1538. .byte 0,12,0x14,0,0,0,4,0
  1539. .long 0
  1540. .size .bn_add_words,.-.bn_add_words
  1541. #
  1542. # NOTE: The following label name should be changed to
  1543. # "bn_div_words" i.e. remove the first dot
  1544. # for the gcc compiler. This should be automatically
  1545. # done in the build
  1546. #
  1547. .align 4
  1548. .bn_div_words:
  1549. #
  1550. # This is a cleaned up version of code generated by
  1551. # the AIX compiler. The only optimization is to use
  1552. # the PPC instruction to count leading zeros instead
  1553. # of call to num_bits_word. Since this was compiled
  1554. # only at level -O2 we can possibly squeeze it more?
  1555. # Result in r3: the two half-word quotient digits as (q1<<BN_BITS4)|q0, or -1 if d is 0.
  1556. # r3 = h
  1557. # r4 = l
  1558. # r5 = d
  1559. $UCMPI 0,r5,0 # compare r5 and 0
  1560. bne Lppcasm_div1 # proceed if d!=0
  1561. li r3,-1 # d=0 return -1
  1562. blr
  1563. Lppcasm_div1:
  1564. xor r0,r0,r0 #r0=0
  1565. li r8,$BITS
  1566. $CNTLZ. r7,r5 #r7 = num leading 0s in d.
  1567. beq Lppcasm_div2 #proceed if no leading zeros
  1568. subf r8,r7,r8 #r8 = BN_num_bits_word(d)
  1569. $SHR. r9,r3,r8 #are there any bits above r8'th?
  1570. $TR 16,r9,r0 #if there're, signal to dump core...
  1571. Lppcasm_div2:
  1572. $UCMP 0,r3,r5 #h>=d?
  1573. blt Lppcasm_div3 #goto Lppcasm_div3 if not
  1574. subf r3,r5,r3 #h-=d ;
  1575. Lppcasm_div3: #r7 = i (leading zeros in d), r8 = BN_BITS2-i
  1576. cmpi 0,0,r7,0 # is (i == 0)?
  1577. beq Lppcasm_div4
  1578. $SHL r3,r3,r7 # h = (h<< i)
  1579. $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)
  1580. $SHL r5,r5,r7 # d<<=i
  1581. or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i))
  1582. $SHL r4,r4,r7 # l <<=i
  1583. Lppcasm_div4:
  1584. $SHRI r9,r5,`$BITS/2` # r9 = dh
  1585. # dl will be computed when needed
  1586. # as it saves registers.
  1587. li r6,2 #r6=2
  1588. mtctr r6 #counter will be in count: one pass per half-word quotient digit.
  1589. Lppcasm_divouterloop:
  1590. $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)
  1591. $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
  1592. # compute here for innerloop.
  1593. $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh
  1594. bne Lppcasm_div5 # goto Lppcasm_div5 if not
  1595. li r8,-1
  1596. $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
  1597. b Lppcasm_div6
  1598. Lppcasm_div5:
  1599. $UDIV r8,r3,r9 #q = h/dh
  1600. Lppcasm_div6:
  1601. $UMULL r12,r9,r8 #th = q*dh
  1602. $CLRU r10,r5,`$BITS/2` #r10=dl
  1603. $UMULL r6,r8,r10 #tl = q*dl
  1604. Lppcasm_divinnerloop:
  1605. subf r10,r12,r3 #t = h -th
  1606. $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...
  1607. addic. r7,r7,0 #test if r7 == 0. used below (result kept in cr0).
  1608. # now want to compute
  1609. # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
  1610. # the following 2 instructions do that
  1611. $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)
  1612. or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4)
  1613. $UCMP cr1,r6,r7 # compare (tl <= r7), result in cr1 so cr0 survives
  1614. bne Lppcasm_divinnerexit #cr0 from addic. above: exit if (t>>BN_BITS4) != 0
  1615. ble cr1,Lppcasm_divinnerexit #exit if tl <= r7
  1616. addi r8,r8,-1 #q--
  1617. subf r12,r9,r12 #th -=dh
  1618. $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.
  1619. subf r6,r10,r6 #tl -=dl
  1620. b Lppcasm_divinnerloop
  1621. Lppcasm_divinnerexit:
  1622. $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)
  1623. $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;
  1624. $UCMP cr1,r4,r11 # compare l and tl
  1625. add r12,r12,r10 # th+=t
  1626. bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
  1627. addi r12,r12,1 # th++
  1628. Lppcasm_div7:
  1629. subf r11,r11,r4 #r11=l-tl
  1630. $UCMP cr1,r3,r12 #compare h and th
  1631. bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
  1632. addi r8,r8,-1 # q--
  1633. add r3,r5,r3 # h+=d
  1634. Lppcasm_div8:
  1635. subf r12,r12,r3 #r12 = h-th
  1636. $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4
  1637. # want to compute
  1638. # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
  1639. # the following 2 instructions will do this.
  1640. $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.
  1641. $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3
  1642. bdz Lppcasm_div9 #if (--count==0) break ;
  1643. $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4
  1644. b Lppcasm_divouterloop
  1645. Lppcasm_div9:
  1646. or r3,r8,r0 #return ret|q: r0 holds the first digit shifted up, r8 the second
  1647. blr
  1648. .long 0
  1649. .byte 0,12,0x14,0,0,0,3,0
  1650. .long 0
  1651. .size .bn_div_words,.-.bn_div_words
  1652. #
  1653. # NOTE: The following label name should be changed to
  1654. # "bn_sqr_words" i.e. remove the first dot
  1655. # for the gcc compiler. This should be automatically
  1656. # done in the build
  1657. #
  1658. .align 4
  1659. .bn_sqr_words:
  1660. #
  1661. # Optimized version of bn_sqr_words: for each i in 0..n-1 the double-word
  1662. # square of a[i] is stored as r[2i] (UMULL result) and r[2i+1] (UMULH result).
  1663. # void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
  1664. #
  1665. # r3 = r
  1666. # r4 = a
  1667. # r5 = n
  1668. #
  1669. # r6 = a[i].
  1670. # r7,r8 = product.
  1671. #
  1672. # No unrolling done here. Not performance critical.
  1673. addic. r5,r5,0 #test r5; n = 0 returns without touching memory.
  1674. beq Lppcasm_sqr_adios
  1675. addi r4,r4,-$BNSZ #bias a and r back one word: every access in the
  1676. addi r3,r3,-$BNSZ #loop uses a one-word displacement
  1677. mtctr r5 #ctr = n
  1678. Lppcasm_sqr_mainloop:
  1679. #sqr(r[0],r[1],a[0]);
  1680. $LDU r6,$BNSZ(r4)
  1681. $UMULL r7,r6,r6
  1682. $UMULH r8,r6,r6
  1683. $STU r7,$BNSZ(r3) #first word of the pair
  1684. $STU r8,$BNSZ(r3) #second word of the pair
  1685. bdnz Lppcasm_sqr_mainloop
  1686. Lppcasm_sqr_adios:
  1687. blr
  1688. .long 0
  1689. .byte 0,12,0x14,0,0,0,3,0
  1690. .long 0
  1691. .size .bn_sqr_words,.-.bn_sqr_words
  1692. #
  1693. # NOTE: The following label name should be changed to
  1694. # "bn_mul_words" i.e. remove the first dot
  1695. # for the gcc compiler. This should be automatically
  1696. # done in the build
  1697. #
  1698. .align 4
  1699. .bn_mul_words:
  1700. # rp[i] = low word of (ap[i]*w + carry) for i = 0..num-1; the last carry word is returned in r3.
  1701. # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
  1702. #
  1703. # r3 = rp
  1704. # r4 = ap
  1705. # r5 = num
  1706. # r6 = w
  1707. xor r0,r0,r0
  1708. xor r12,r12,r12 # used for carry
  1709. rlwinm. r7,r5,30,2,31 # num >> 2 = number of 4-word passes
  1710. beq Lppcasm_mw_REM
  1711. mtctr r7
  1712. Lppcasm_mw_LOOP:
  1713. #mul(rp[0],ap[0],w,c1);
  1714. $LD r8,`0*$BNSZ`(r4)
  1715. $UMULL r9,r6,r8
  1716. $UMULH r10,r6,r8
  1717. addc r9,r9,r12
  1718. #addze r10,r10 #carry is NOT ignored.
  1719. #will be taken care of
  1720. #in second spin below
  1721. #using adde.
  1722. $ST r9,`0*$BNSZ`(r3)
  1723. #mul(rp[1],ap[1],w,c1);
  1724. $LD r8,`1*$BNSZ`(r4)
  1725. $UMULL r11,r6,r8
  1726. $UMULH r12,r6,r8
  1727. adde r11,r11,r10
  1728. #addze r12,r12
  1729. $ST r11,`1*$BNSZ`(r3)
  1730. #mul(rp[2],ap[2],w,c1);
  1731. $LD r8,`2*$BNSZ`(r4)
  1732. $UMULL r9,r6,r8
  1733. $UMULH r10,r6,r8
  1734. adde r9,r9,r12
  1735. #addze r10,r10
  1736. $ST r9,`2*$BNSZ`(r3)
  1737. #mul(rp[3],ap[3],w,c1);
  1738. $LD r8,`3*$BNSZ`(r4)
  1739. $UMULL r11,r6,r8
  1740. $UMULH r12,r6,r8
  1741. adde r11,r11,r10
  1742. addze r12,r12 #this spin we collect carry into
  1743. #r12
  1744. $ST r11,`3*$BNSZ`(r3)
  1745. addi r3,r3,`4*$BNSZ`
  1746. addi r4,r4,`4*$BNSZ`
  1747. bdnz Lppcasm_mw_LOOP
  1748. Lppcasm_mw_REM:
  1749. andi. r5,r5,0x3 # r5 = num & 3 = leftover words (0..3)
  1750. beq Lppcasm_mw_OVER
  1751. #mul(rp[0],ap[0],w,c1);
  1752. $LD r8,`0*$BNSZ`(r4)
  1753. $UMULL r9,r6,r8
  1754. $UMULH r10,r6,r8
  1755. addc r9,r9,r12
  1756. addze r10,r10
  1757. $ST r9,`0*$BNSZ`(r3)
  1758. addi r12,r10,0 # carry word for the next step
  1759. addi r5,r5,-1
  1760. cmpli 0,0,r5,0
  1761. beq Lppcasm_mw_OVER
  1762. #mul(rp[1],ap[1],w,c1);
  1763. $LD r8,`1*$BNSZ`(r4)
  1764. $UMULL r9,r6,r8
  1765. $UMULH r10,r6,r8
  1766. addc r9,r9,r12
  1767. addze r10,r10
  1768. $ST r9,`1*$BNSZ`(r3)
  1769. addi r12,r10,0 # carry word for the next step
  1770. addi r5,r5,-1
  1771. cmpli 0,0,r5,0
  1772. beq Lppcasm_mw_OVER
  1773. #mul(rp[2],ap[2],w,c1);
  1774. $LD r8,`2*$BNSZ`(r4)
  1775. $UMULL r9,r6,r8
  1776. $UMULH r10,r6,r8
  1777. addc r9,r9,r12
  1778. addze r10,r10
  1779. $ST r9,`2*$BNSZ`(r3)
  1780. addi r12,r10,0 # final carry word
  1781. Lppcasm_mw_OVER:
  1782. addi r3,r12,0 # return the carry word
  1783. blr
  1784. .long 0
  1785. .byte 0,12,0x14,0,0,0,4,0
  1786. .long 0
  1787. .size .bn_mul_words,.-.bn_mul_words
  1788. #
  1789. # NOTE: The following label name should be changed to
  1790. # "bn_mul_add_words" i.e. remove the first dot
  1791. # for the gcc compiler. This should be automatically
  1792. # done in the build
  1793. #
  1794. .align 4
  1795. .bn_mul_add_words:
  1796. # rp[i] = low word of (rp[i] + ap[i]*w + carry) for i = 0..num-1; the last carry word is returned in r3.
  1797. # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
  1798. #
  1799. # r3 = rp
  1800. # r4 = ap
  1801. # r5 = num
  1802. # r6 = w
  1803. #
  1804. # empirical evidence suggests that unrolled version performs best!!
  1805. #
  1806. xor r0,r0,r0 #r0 = 0
  1807. xor r12,r12,r12 #r12 = 0 . used for carry
  1808. rlwinm. r7,r5,30,2,31 # num >> 2 = number of 4-word passes
  1809. beq Lppcasm_maw_leftover # if (num < 4) go Lppcasm_maw_leftover
  1810. mtctr r7
  1811. Lppcasm_maw_mainloop:
  1812. #mul_add(rp[0],ap[0],w,c1);
  1813. $LD r8,`0*$BNSZ`(r4)
  1814. $LD r11,`0*$BNSZ`(r3)
  1815. $UMULL r9,r6,r8
  1816. $UMULH r10,r6,r8
  1817. addc r9,r9,r12 #r12 is carry.
  1818. addze r10,r10
  1819. addc r9,r9,r11
  1820. #addze r10,r10
  1821. #the above instruction addze
  1822. #is NOT needed. Carry will NOT
  1823. #be ignored. It's not affected
  1824. #by multiply and will be collected
  1825. #in the next spin
  1826. $ST r9,`0*$BNSZ`(r3)
  1827. #mul_add(rp[1],ap[1],w,c1);
  1828. $LD r8,`1*$BNSZ`(r4)
  1829. $LD r9,`1*$BNSZ`(r3)
  1830. $UMULL r11,r6,r8
  1831. $UMULH r12,r6,r8
  1832. adde r11,r11,r10 #r10 is carry.
  1833. addze r12,r12
  1834. addc r11,r11,r9
  1835. #addze r12,r12
  1836. $ST r11,`1*$BNSZ`(r3)
  1837. #mul_add(rp[2],ap[2],w,c1);
  1838. $LD r8,`2*$BNSZ`(r4)
  1839. $UMULL r9,r6,r8
  1840. $LD r11,`2*$BNSZ`(r3)
  1841. $UMULH r10,r6,r8
  1842. adde r9,r9,r12
  1843. addze r10,r10
  1844. addc r9,r9,r11
  1845. #addze r10,r10
  1846. $ST r9,`2*$BNSZ`(r3)
  1847. #mul_add(rp[3],ap[3],w,c1);
  1848. $LD r8,`3*$BNSZ`(r4)
  1849. $UMULL r11,r6,r8
  1850. $LD r9,`3*$BNSZ`(r3)
  1851. $UMULH r12,r6,r8
  1852. adde r11,r11,r10
  1853. addze r12,r12
  1854. addc r11,r11,r9
  1855. addze r12,r12 #last word of the pass: fold the pending carry into r12
  1856. $ST r11,`3*$BNSZ`(r3)
  1857. addi r3,r3,`4*$BNSZ`
  1858. addi r4,r4,`4*$BNSZ`
  1859. bdnz Lppcasm_maw_mainloop
  1860. Lppcasm_maw_leftover:
  1861. andi. r5,r5,0x3 #r5 = num & 3 = leftover words (0..3)
  1862. beq Lppcasm_maw_adios
  1863. addi r3,r3,-$BNSZ #bias rp and ap back one word; each LDU below uses a
  1864. addi r4,r4,-$BNSZ #one-word displacement (presumably load-with-update)
  1865. #mul_add(rp[0],ap[0],w,c1);
  1866. mtctr r5 #ctr = leftover count (1..3)
  1867. $LDU r8,$BNSZ(r4)
  1868. $UMULL r9,r6,r8
  1869. $UMULH r10,r6,r8
  1870. $LDU r11,$BNSZ(r3)
  1871. addc r9,r9,r11
  1872. addze r10,r10
  1873. addc r9,r9,r12
  1874. addze r12,r10 #r12 = carry word for the next step
  1875. $ST r9,0(r3)
  1876. bdz Lppcasm_maw_adios
  1877. #mul_add(rp[1],ap[1],w,c1);
  1878. $LDU r8,$BNSZ(r4)
  1879. $UMULL r9,r6,r8
  1880. $UMULH r10,r6,r8
  1881. $LDU r11,$BNSZ(r3)
  1882. addc r9,r9,r11
  1883. addze r10,r10
  1884. addc r9,r9,r12
  1885. addze r12,r10 #r12 = carry word for the next step
  1886. $ST r9,0(r3)
  1887. bdz Lppcasm_maw_adios
  1888. #mul_add(rp[2],ap[2],w,c1);
  1889. $LDU r8,$BNSZ(r4)
  1890. $UMULL r9,r6,r8
  1891. $UMULH r10,r6,r8
  1892. $LDU r11,$BNSZ(r3)
  1893. addc r9,r9,r11
  1894. addze r10,r10
  1895. addc r9,r9,r12
  1896. addze r12,r10 #r12 = final carry word
  1897. $ST r9,0(r3)
  1898. Lppcasm_maw_adios:
  1899. addi r3,r12,0 #return the carry word
  1900. blr
  1901. .long 0
  1902. .byte 0,12,0x14,0,0,0,4,0
  1903. .long 0
  1904. .size .bn_mul_add_words,.-.bn_mul_add_words
  1905. .align 4
  1906. EOF
  1907. $data =~ s/\`([^\`]*)\`/eval $1/gem; # replace each backtick-quoted span in the text with the value of eval on its contents
  1908. print $data; # emit the resulting text on STDOUT
  1909. close STDOUT or die "error closing STDOUT: $!"; # die if STDOUT cannot be closed cleanly