### lbn80386.s
  ### Copyright (c) 1995, Colin Plumb.
  ### For licensing and other legal details, see the file legal.c.
  ###
  ### Assembly primitives for bignum library, 80386 family, 32-bit code.
  ###
  ### Several primitives are included here.  Only lbnMulAdd1 is *really*
  ### critical, but once that's written, lbnMulN1 and lbnMulSub1 are quite
  ### easy to write as well, so they are included here too.
  ### lbnDiv21 and lbnModQ are so easy to write that they're included, too.
  ###
  ### All functions here are for 32-bit flat mode, i.e. near code and
  ### near data, although the near offsets are 32 bits.
  ### Preserved registers are esp, ebp, esi, edi and ebx.  That last
  ### is needed by ELF for PIC, and differs from the IBM PC calling
  ### convention.
  # Different assemblers have different conventions here: the .align
  # operand is either a byte count or a power of two, hence the two
  # candidate values noted for each constant.
  align4=4                # could be 2 or 4
  align8=8                # could be 3 or 8
  align16=16              # could be 4 or 16

          .text

  # We declare each symbol with two names, to deal with ELF/a.out variances.
          .globl  lbnMulN1_32
          .globl  _lbnMulN1_32
          .globl  lbnMulAdd1_32
          .globl  _lbnMulAdd1_32
          .globl  lbnMulSub1_32
          .globl  _lbnMulSub1_32
          .globl  lbnDiv21_32
          .globl  _lbnDiv21_32
          .globl  lbnModQ_32
          .globl  _lbnModQ_32
  ## lbnMulN1_32(out, in, len, k)
  ##
  ## Multiply the len-word number at `in` by the single word k.  For each
  ## word the low half of in[i]*k plus the incoming carry is stored to
  ## out[i]; the final carry word is stored to out[len].  Nothing is
  ## loaded into %eax for the caller.
  ## len must be at least 1: word 0 is multiplied before len is examined.
  ##
  ## Register usage:
  ##   %eax - low half of product
  ##   %ebx - carry to next iteration
  ##   %ecx - multiplier (k)
  ##   %edx - high half of product
  ##   %esi - source pointer
  ##   %edi - dest pointer
  ##   %ebp - loop counter
  ##
  ## Stack frame: offset of each slot from %esp at entry and after each
  ## of this routine's four pushes.
  ##
  ##              entry   push   push   push   push
  ##                      %esi   %ebp   %ebx   %edi
  ##   k            16     20     24     28     32
  ##   len          12     16     20     24     28
  ##   in            8     12     16     20     24
  ##   out           4      8     12     16     20
  ##   return        0      4      8     12     16
  ##   %esi                 0      4      8     12
  ##   %ebp                        0      4      8
  ##   %ebx                               0      4
  ##   %edi                                      0
  ##
  ## The trailing U / V / NP tags are the original author's scheduling
  ## notes (presumably Pentium U pipe, V pipe and non-pairable); the
  ## instruction order is deliberate.
  ##
  ## NOTE(review): the jump table holds absolute addresses and is indexed
  ## through an absolute displacement, so this is not position-independent.
          .align  align16
  lbnMulN1_32:
  _lbnMulN1_32:
          pushl   %esi                    # U
          movl    12(%esp),%esi           #  V  load in
          pushl   %ebp                    # U
          movl    20(%esp),%ebp           #  V  load len
          pushl   %ebx                    # U
          movl    28(%esp),%ecx           #  V  load k
          pushl   %edi                    # U
          movl    20(%esp),%edi           #  V  load out

  ## First multiply step has no carry in.
          movl    (%esi),%eax             #  V
          leal    -4(,%ebp,4),%ebx        # U   loop unrolling: 4*(len-1)
          mull    %ecx                    # NP  first multiply
          movl    %eax,(%edi)             # U   out[0] = low word
          andl    $12,%ebx                #  V  loop unrolling: 4*((len-1) mod 4)

  ## Advance both pointers past the leftover (len-1) mod 4 words, then
  ## enter the 4-way unrolled loop at the point that handles exactly
  ## those words.  From here on the pointers address the last word of
  ## the current group, so earlier words use negative displacements.
          addl    %ebx,%esi               # U   loop unrolling
          addl    %ebx,%edi               #  V  loop unrolling

          jmp     *m32_jumptable(%ebx)    # NP  loop unrolling

          .align  align4
  m32_jumptable:
          .long   m32_case0
          .long   m32_case1
          .long   m32_case2
          .long   m32_case3

          nop
          .align  align8
          nop
          nop
          nop                             # Get loop nicely aligned

  ## No leftover words: either we are done (len was 1) or only whole
  ## groups of four remain.
  m32_case0:
          subl    $4,%ebp                 # U
          jbe     m32_done                #  V

  m32_loop:
          movl    4(%esi),%eax            # U   first word of the next group
          movl    %edx,%ebx               #  V  Remember carry for later
          addl    $16,%esi                # U   step both pointers one group
          addl    $16,%edi                #  V
          mull    %ecx                    # NP
          addl    %ebx,%eax               # U   Add carry in from previous word
          adcl    $0,%edx                 # U
          movl    %eax,-12(%edi)          #  V
  m32_case3:
          movl    -8(%esi),%eax           # U
          movl    %edx,%ebx               #  V  Remember carry for later
          mull    %ecx                    # NP
          addl    %ebx,%eax               # U   Add carry in from previous word
          adcl    $0,%edx                 # U
          movl    %eax,-8(%edi)           #  V
  m32_case2:
          movl    -4(%esi),%eax           # U
          movl    %edx,%ebx               #  V  Remember carry for later
          mull    %ecx                    # NP
          addl    %ebx,%eax               # U   Add carry in from previous word
          adcl    $0,%edx                 # U
          movl    %eax,-4(%edi)           #  V
  m32_case1:
          movl    (%esi),%eax             # U
          movl    %edx,%ebx               #  V  Remember carry for later
          mull    %ecx                    # NP
          addl    %ebx,%eax               # U   Add carry in from previous word
          adcl    $0,%edx                 # U
          movl    %eax,(%edi)             #  V

          subl    $4,%ebp                 # U   four fewer words outstanding
          ja      m32_loop                #  V  loop while the count stays above 0

  m32_done:
          movl    %edx,4(%edi)            # U   final carry goes to out[len]
          popl    %edi                    #  V
          popl    %ebx                    # U
          popl    %ebp                    #  V
          popl    %esi                    # U
          ret                             # NP
  ## lbnMulAdd1_32(out, in, len, k)
  ##
  ## Multiply the len-word number at `in` by the single word k and add
  ## the product into out[0..len-1] in place.  The carry word left over
  ## after the last word is returned in %eax.
  ## len must be at least 1: word 0 is processed before len is examined.
  ##
  ## Arguments sit at 4/8/12/16(%esp) on entry (out, in, len, k); the
  ## displacements used below grow by 4 after each push.  Registers are
  ## pushed in the order %esi, %edi, %ebp, %ebx and popped in reverse.
  ##
  ## %ecx = k, %esi = source pointer, %edi = dest pointer, %ebp = loop
  ## counter, %edx:%eax = product, %ebx = carry in, then the out word.
  ##
  ## The trailing U / V / NP tags are the original author's scheduling
  ## notes (presumably Pentium U pipe, V pipe and non-pairable); the
  ## instruction order is deliberate.
  ##
  ## NOTE(review): the jump table holds absolute addresses and is indexed
  ## through an absolute displacement, so this is not position-independent.
          .align  align16
  lbnMulAdd1_32:
  _lbnMulAdd1_32:
          pushl   %esi                    # U
          movl    12(%esp),%esi           #  V  load in
          pushl   %edi                    # U
          movl    12(%esp),%edi           #  V  load out
          pushl   %ebp                    # U
          movl    24(%esp),%ebp           #  V  load len
          pushl   %ebx                    # U
          movl    32(%esp),%ecx           #  V  load k

  ## First multiply step has no carry in.
          movl    (%esi),%eax             #  V
          movl    (%edi),%ebx             # U   current out[0]
          mull    %ecx                    # NP  first multiply
          addl    %eax,%ebx               # U   out[0] + low word
          leal    -4(,%ebp,4),%eax        #  V  loop unrolling (leal leaves the carry flag alone)
          adcl    $0,%edx                 # U   fold that carry into the high word
          andl    $12,%eax                #  V  loop unrolling: 4*((len-1) mod 4)
          movl    %ebx,(%edi)             # U

  ## Advance both pointers past the leftover (len-1) mod 4 words, then
  ## enter the 4-way unrolled loop at the point that handles exactly
  ## those words.
          addl    %eax,%esi               #  V  loop unrolling
          addl    %eax,%edi               # U   loop unrolling

          jmp     *ma32_jumptable(%eax)   # NP  loop unrolling

          .align  align4
  ma32_jumptable:
          .long   ma32_case0
          .long   ma32_case1
          .long   ma32_case2
          .long   ma32_case3

          .align  align8
          nop
          nop
          nop                             # To align loop properly

  ## No leftover words: either we are done (len was 1) or only whole
  ## groups of four remain.
  ma32_case0:
          subl    $4,%ebp                 # U
          jbe     ma32_done               #  V

  ma32_loop:
          movl    4(%esi),%eax            # U   first word of the next group
          movl    %edx,%ebx               #  V  Remember carry for later
          addl    $16,%esi                # U   step both pointers one group
          addl    $16,%edi                #  V
          mull    %ecx                    # NP
          addl    %ebx,%eax               # U   Add carry in from previous word
          movl    -12(%edi),%ebx          #  V  current out word
          adcl    $0,%edx                 # U
          addl    %eax,%ebx               #  V  accumulate into out word
          adcl    $0,%edx                 # U
          movl    %ebx,-12(%edi)          #  V
  ma32_case3:
          movl    -8(%esi),%eax           # U
          movl    %edx,%ebx               #  V  Remember carry for later
          mull    %ecx                    # NP
          addl    %ebx,%eax               # U   Add carry in from previous word
          movl    -8(%edi),%ebx           #  V
          adcl    $0,%edx                 # U
          addl    %eax,%ebx               #  V
          adcl    $0,%edx                 # U
          movl    %ebx,-8(%edi)           #  V
  ma32_case2:
          movl    -4(%esi),%eax           # U
          movl    %edx,%ebx               #  V  Remember carry for later
          mull    %ecx                    # NP
          addl    %ebx,%eax               # U   Add carry in from previous word
          movl    -4(%edi),%ebx           #  V
          adcl    $0,%edx                 # U
          addl    %eax,%ebx               #  V
          adcl    $0,%edx                 # U
          movl    %ebx,-4(%edi)           #  V
  ma32_case1:
          movl    (%esi),%eax             # U
          movl    %edx,%ebx               #  V  Remember carry for later
          mull    %ecx                    # NP
          addl    %ebx,%eax               # U   Add carry in from previous word
          movl    (%edi),%ebx             #  V
          adcl    $0,%edx                 # U
          addl    %eax,%ebx               #  V
          adcl    $0,%edx                 # U
          movl    %ebx,(%edi)             #  V

          subl    $4,%ebp                 # U   four fewer words outstanding
          ja      ma32_loop               #  V  loop while the count stays above 0

  ma32_done:
          popl    %ebx                    # U
          popl    %ebp                    #  V
          movl    %edx,%eax               # U   return the final carry word
          popl    %edi                    #  V
          popl    %esi                    # U
          ret                             # NP
  ## lbnMulSub1_32(out, in, len, k)
  ##
  ## Multiply the len-word number at `in` by the single word k and
  ## subtract the product from out[0..len-1] in place.  The word still
  ## owed after the last step (high product word plus borrows) is
  ## returned in %eax.
  ## len must be at least 1: word 0 is processed before len is examined.
  ##
  ## Arguments sit at 4/8/12/16(%esp) on entry (out, in, len, k); the
  ## displacements used below grow by 4 after each push.  Registers are
  ## pushed in the order %esi, %edi, %ebp, %ebx and popped in reverse.
  ##
  ## %ecx = k, %esi = source pointer, %edi = dest pointer, %ebp = loop
  ## counter, %edx:%eax = product, %ebx = carry in, then the out word.
  ##
  ## The trailing U / V / NP tags are the original author's scheduling
  ## notes (presumably Pentium U pipe, V pipe and non-pairable); the
  ## instruction order is deliberate.
  ##
  ## NOTE(review): the jump table holds absolute addresses and is indexed
  ## through an absolute displacement, so this is not position-independent.
          .align  align16
  lbnMulSub1_32:
  _lbnMulSub1_32:
          pushl   %esi                    # U
          movl    12(%esp),%esi           #  V  load in
          pushl   %edi                    # U
          movl    12(%esp),%edi           #  V  load out
          pushl   %ebp                    # U
          movl    24(%esp),%ebp           #  V  load len
          pushl   %ebx                    # U
          movl    32(%esp),%ecx           #  V  load k

  ## First multiply step has no carry in.
          movl    (%esi),%eax             #  V
          movl    (%edi),%ebx             # U   current out[0]
          mull    %ecx                    # NP  first multiply
          subl    %eax,%ebx               # U   out[0] - low word
          leal    -4(,%ebp,4),%eax        #  V  loop unrolling (leal leaves the borrow flag alone)
          adcl    $0,%edx                 # U   a borrow is owed by the next word
          andl    $12,%eax                #  V  loop unrolling: 4*((len-1) mod 4)
          movl    %ebx,(%edi)             # U

  ## Advance both pointers past the leftover (len-1) mod 4 words, then
  ## enter the 4-way unrolled loop at the point that handles exactly
  ## those words.
          addl    %eax,%esi               #  V  loop unrolling
          addl    %eax,%edi               # U   loop unrolling

          jmp     *ms32_jumptable(%eax)   # NP  loop unrolling

          .align  align4
  ms32_jumptable:
          .long   ms32_case0
          .long   ms32_case1
          .long   ms32_case2
          .long   ms32_case3

          .align  align8
          nop
          nop
          nop

  ## No leftover words: either we are done (len was 1) or only whole
  ## groups of four remain.
  ms32_case0:
          subl    $4,%ebp                 # U
          jbe     ms32_done               #  V

  ms32_loop:
          movl    4(%esi),%eax            # U   first word of the next group
          movl    %edx,%ebx               #  V  Remember carry for later
          addl    $16,%esi                # U   step both pointers one group
          addl    $16,%edi                #  V
          mull    %ecx                    # NP
          addl    %ebx,%eax               # U   Add carry in from previous word
          movl    -12(%edi),%ebx          #  V  current out word
          adcl    $0,%edx                 # U
          subl    %eax,%ebx               #  V  subtract from out word
          adcl    $0,%edx                 # U   borrow joins the next carry
          movl    %ebx,-12(%edi)          #  V
  ms32_case3:
          movl    -8(%esi),%eax           # U
          movl    %edx,%ebx               #  V  Remember carry for later
          mull    %ecx                    # NP
          addl    %ebx,%eax               # U   Add carry in from previous word
          movl    -8(%edi),%ebx           #  V
          adcl    $0,%edx                 # U
          subl    %eax,%ebx               #  V
          adcl    $0,%edx                 # U
          movl    %ebx,-8(%edi)           #  V
  ms32_case2:
          movl    -4(%esi),%eax           # U
          movl    %edx,%ebx               #  V  Remember carry for later
          mull    %ecx                    # NP
          addl    %ebx,%eax               # U   Add carry in from previous word
          movl    -4(%edi),%ebx           #  V
          adcl    $0,%edx                 # U
          subl    %eax,%ebx               #  V
          adcl    $0,%edx                 # U
          movl    %ebx,-4(%edi)           #  V
  ms32_case1:
          movl    (%esi),%eax             # U
          movl    %edx,%ebx               #  V  Remember carry for later
          mull    %ecx                    # NP
          addl    %ebx,%eax               # U   Add carry in from previous word
          movl    (%edi),%ebx             #  V
          adcl    $0,%edx                 # U
          subl    %eax,%ebx               #  V
          adcl    $0,%edx                 # U
          movl    %ebx,(%edi)             #  V

          subl    $4,%ebp                 # U   four fewer words outstanding
          ja      ms32_loop               #  V  loop while the count stays above 0

  ms32_done:
          popl    %ebx                    # U
          popl    %ebp                    #  V
          movl    %edx,%eax               # U   return the word still owed
          popl    %edi                    #  V
          popl    %esi                    # U
          ret                             # NP
  ## Two-word by one-word divide.  Stores quotient, returns remainder.
  ##
  ## BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
  ##
  ## Stack offsets at entry:  q = 4(%esp), nh = 8(%esp),
  ##                          nl = 12(%esp), d = 16(%esp)
  ##
  ## The dividend is nh:nl (nh is the high word).  No registers are
  ## pushed; only %eax, %ecx and %edx are used.
  ## The caller must pass d != 0 and nh < d: divl raises a divide error
  ## when the quotient does not fit in 32 bits.
          .align  align16
  lbnDiv21_32:
  _lbnDiv21_32:
          movl    8(%esp),%edx            # U   Load nh
          movl    12(%esp),%eax           #  V  Load nl
          movl    4(%esp),%ecx            # U   Load q
          divl    16(%esp)                # NP  %edx:%eax / d
          movl    %eax,(%ecx)             # U   Store quotient
          movl    %edx,%eax               #  V  Return remainder
          ret
  ## Multi-word by one-word remainder.
  ## This speeds up key generation.  It's not worth unrolling and so on;
  ## using 32-bit divides is enough of a speedup.
  ##
  ## The modulus (in %ecx) is often 16 bits.  Given that the dividend is 32
  ## bits, the chances of saving the first divide because the high word of the
  ## dividend is less than the modulus are low enough it's not worth taking
  ## the cycles to test for it.
  ##
  ## unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
  ##
  ## Stack offsets at entry:  n = 4(%esp), len = 8(%esp), d = 12(%esp)
  ##
  ## Words are consumed from n[len-1] down to n[0]; %edx carries the
  ## running remainder from one divide into the next and is the result.
  ## len must be at least 1 (the count is tested only after a word has
  ## been divided) and d must be non-zero.
          .align  align16
  lbnModQ_32:
  _lbnModQ_32:
          movl    4(%esp),%eax            # U   Load n
          pushl   %ebp                    #  V
          movl    12(%esp),%ebp           # U   Load len
          pushl   %esi                    #  V
          leal    -4(%eax,%ebp,4),%esi    # U   %esi = &n[len-1], the top word
          movl    20(%esp),%ecx           #  V  Load d
          xorl    %edx,%edx               # U   Clear MSW for first divide
  modq32_loop:
          movl    (%esi),%eax             # U   next word down becomes the low half
          subl    $4,%esi                 #  V
          divl    %ecx                    # NP  remainder stays in %edx as the new MSW
          decl    %ebp                    # U
          jnz     modq32_loop             #  V
          popl    %esi                    # U
          movl    %edx,%eax               #  V  Return remainder
          popl    %ebp                    # U
          ret                             # NP