lbn80386.asm 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414
  1. ;;; Copyright (c) 1995, Colin Plumb.
  2. ;;; For licensing and other legal details, see the file legal.c.
  3. ;;;
  4. ;;; Assembly primitives for bignum library, 80386 family, 32-bit code.
  5. ;;;
  6. ;;; Several primitives are included here. Only lbnMulAdd1 is *really*
  7. ;;; critical, but once that's written, lbnMulN1 and lbnMulSub1 are quite
  8. ;;; easy to write as well, so they are included here as well.
  9. ;;; lbnDiv21 and lbnModQ are so easy to write that they're included, too.
  10. ;;;
  11. ;;; All functions here are for 32-bit flat mode. I.e. near code and
  12. ;;; near data, although the near offsets are 32 bits.
  13. ;;;
  14. ;;; The usual 80x86 calling conventions have AX, BX, CX and DX
  15. ;;; volatile, and SI, DI, SP and BP preserved across calls.
  16. ;;; This includes the "E"xtended forms of all of those registers
  17. ;;;
  18. ;;; However, just to be confusing, recent 32-bit DOS compilers have
  19. ;;; quietly changed that to require EBX preserved across calls, too.
  20. ;;; Joy.
  ;; ---------------------------------------------------------------------
  ;; Assembler set-up: instruction set, segment/group layout and exports.
  ;; ---------------------------------------------------------------------

  ;; Enable the 80386 instruction set.
          .386

  ;; Left disabled by the original author (an up-front, empty _TEXT segment):
  ;_TEXT  segment para public use32 'CODE' ; 16-byte aligned because 486 cares
  ;_TEXT  ends

  ;; Declare FLAT as a group containing _TEXT in every case except when
  ;; @Version is defined and greater than 510.
  ;; NOTE(review): presumably assemblers that report a newer @Version already
  ;; provide FLAT themselves -- confirm against the assembler documentation.
  ifdef @Version
    if @Version le 510
  FLAT    group   _TEXT
    endif
  else
  FLAT    group   _TEXT
  endif

  ;; Code, data and stack references are all assumed to go through FLAT.
          assume  cs:FLAT, ds:FLAT, ss:FLAT

  ;; Everything that follows is placed in the 32-bit code segment _TEXT.
  _TEXT   segment para public use32 'CODE' ; 16-byte aligned because 486 cares

  ;; Symbols exported from this file.
          public  _lbnMulN1_32
          public  _lbnMulAdd1_32
          public  _lbnMulSub1_32
          public  _lbnDiv21_32
          public  _lbnModQ_32
  38. ;; Register usage:
  39. ;; eax - low half of product
  40. ;; ebx - carry to next iteration
  41. ;; ecx - multiplier (k)
  42. ;; edx - high half of product
  43. ;; esi - source pointer
  44. ;; edi - dest pointer
  45. ;; ebp - loop counter
  46. ;;
  47. ;; Stack frame:
  48. ;; +--------+ esp+20 esp+24 esp+28 esp+32 esp+36
  49. ;; | k |
  50. ;; +--------+ esp+16 esp+20 esp+24 esp+28 esp+32
  51. ;; | len |
  52. ;; +--------+ esp+12 esp+16 esp+20 esp+24 esp+28
  53. ;; | in |
  54. ;; +--------+ esp+8 esp+12 esp+16 esp+20 esp+24
  55. ;; | out |
  56. ;; +--------+ esp+4 esp+8 esp+12 esp+16 esp+20
  57. ;; | return |
  58. ;; +--------+ esp esp+4 esp+8 esp+12 esp+16
  59. ;; | esi |
  60. ;; +--------+ esp esp+4 esp+8 esp+12
  61. ;; | ebp |
  62. ;; +--------+ esp esp+4 esp+8
  63. ;; | ebx |
  64. ;; +--------+ esp esp+4
  65. ;; | edi |
  66. ;; +--------+ esp
  ;; ---------------------------------------------------------------------
  ;; _lbnMulN1_32 -- multiply a multi-word number by a single 32-bit word.
  ;;
  ;; Stack arguments at entry (near call, 32-bit offsets):
  ;;   [esp+4]   out  destination pointer
  ;;   [esp+8]   in   source pointer
  ;;   [esp+12]  len  number of 32-bit source words
  ;;   [esp+16]  k    32-bit multiplier
  ;;
  ;; Effect: the low 32 bits of each in[i]*k plus the running carry are
  ;; stored to out[i] for i = 0..len-1, and the final carry word is stored
  ;; to out[len], so len+1 words are written.
  ;;
  ;; Registers: esi, ebp, ebx and edi are saved and restored; eax, ecx and
  ;; edx are overwritten.  No return value is set up.
  ;;
  ;; Assumes len >= 1: word 0 is multiplied unconditionally before any
  ;; length test.
  ;;
  ;; The trailing U / V / NP tags are the original author's per-instruction
  ;; scheduling notes (presumably Pentium U-pipe, V-pipe and non-pairable);
  ;; the instruction order is kept exactly as written for that reason.
  ;; ---------------------------------------------------------------------
          align   16
  _lbnMulN1_32 proc near
          push    esi                     ; U
          mov     esi,[esp+12]            ; V  esi = in
          push    ebp                     ; U
          mov     ebp,[esp+20]            ; V  ebp = len
          push    ebx                     ; U
          mov     ecx,[esp+28]            ; V  ecx = k
          push    edi                     ; U
          mov     edi,[esp+20]            ; V  edi = out

  ;; Word 0 is handled here because it has no carry coming in.  At the
  ;; same time ebx becomes 4*((len-1) mod 4): it advances both pointers
  ;; and indexes the dispatch table, so the leftover (len-1) mod 4 words
  ;; are handled by entering the 4-way unrolled loop part-way through.
          mov     eax,[esi]               ; U
          lea     ebx,[ebp*4-4]           ; V  ebx = 4*(len-1)
          mul     ecx                     ; NP edx:eax = in[0]*k
          mov     [edi],eax               ; U  out[0] = low half
          and     ebx,12                  ; V  ebx = 4*((len-1) mod 4)
          add     esi,ebx                 ; U
          add     edi,ebx                 ; V
          jmp     DWORD PTR mulN1_table[ebx] ; NP enter the unrolled loop

          align   4
  mulN1_table:
          dd      mulN1_rem0
          dd      mulN1_rem1
          dd      mulN1_rem2
          dd      mulN1_rem3
          nop
          align   8
          nop
          nop
          nop                             ; padding: get the loop nicely aligned

  ;; Invariant at every entry point below: edx holds the carry (high half
  ;; of the previous product) and ebp counts down from len in steps of 4.
  mulN1_rem0:
          sub     ebp,4                   ; U
          jbe     SHORT mulN1_finish      ; V  nothing left after word 0
  mulN1_quad:
          mov     eax,[esi+4]             ; U
          mov     ebx,edx                 ; V  keep the carry across the mul
          add     esi,16                  ; U  step both pointers one group
          add     edi,16                  ; V
          mul     ecx                     ; NP
          add     eax,ebx                 ; U  fold in the previous carry
          adc     edx,0                   ; U
          mov     [edi-12],eax            ; V
  mulN1_rem3:
          mov     eax,[esi-8]             ; U
          mov     ebx,edx                 ; V  keep the carry across the mul
          mul     ecx                     ; NP
          add     eax,ebx                 ; U  fold in the previous carry
          adc     edx,0                   ; U
          mov     [edi-8],eax             ; V
  mulN1_rem2:
          mov     eax,[esi-4]             ; U
          mov     ebx,edx                 ; V  keep the carry across the mul
          mul     ecx                     ; NP
          add     eax,ebx                 ; U  fold in the previous carry
          adc     edx,0                   ; U
          mov     [edi-4],eax             ; V
  mulN1_rem1:
          mov     eax,[esi]               ; U
          mov     ebx,edx                 ; V  keep the carry across the mul
          mul     ecx                     ; NP
          add     eax,ebx                 ; U  fold in the previous carry
          adc     edx,0                   ; U
          mov     [edi],eax               ; V
          sub     ebp,4                   ; U
          ja      SHORT mulN1_quad        ; V  more words remain

  ;; edi points at the last word written; the final carry goes just above.
  mulN1_finish:
          mov     [edi+4],edx             ; U  out[len] = final carry
          pop     edi                     ; V
          pop     ebx                     ; U
          pop     ebp                     ; V
          pop     esi                     ; U
          ret                             ; NP
  _lbnMulN1_32 endp
  ;; ---------------------------------------------------------------------
  ;; _lbnMulAdd1_32 -- multiply-accumulate: out += in * k.
  ;;
  ;; Stack arguments at entry (near call, 32-bit offsets):
  ;;   [esp+4]   out  accumulator pointer (read and written)
  ;;   [esp+8]   in   source pointer
  ;;   [esp+12]  len  number of 32-bit words to process
  ;;   [esp+16]  k    32-bit multiplier
  ;;
  ;; Effect: for i = 0..len-1, out[i] receives the low 32 bits of
  ;; out[i] + in[i]*k + carry.  Exactly len words of out are written.
  ;;
  ;; Returns: eax = the carry word left over after the last step.
  ;;
  ;; Registers: esi, edi, ebp and ebx are saved and restored; ecx and edx
  ;; are overwritten.
  ;;
  ;; Assumes len >= 1: word 0 is processed unconditionally before any
  ;; length test.
  ;;
  ;; The trailing U / V / NP tags are the original author's per-instruction
  ;; scheduling notes (presumably Pentium U-pipe, V-pipe and non-pairable);
  ;; the instruction order is kept exactly as written for that reason.
  ;; ---------------------------------------------------------------------
          align   16
  _lbnMulAdd1_32 proc near
          push    esi                     ; U
          mov     esi,[esp+12]            ; V  esi = in
          push    edi                     ; U
          mov     edi,[esp+12]            ; V  edi = out
          push    ebp                     ; U
          mov     ebp,[esp+24]            ; V  ebp = len
          push    ebx                     ; U
          mov     ecx,[esp+32]            ; V  ecx = k

  ;; Word 0 has no carry coming in.  The lea sits between the add and its
  ;; adc because lea leaves the carry flag untouched.  eax ends up as
  ;; 4*((len-1) mod 4), used both to advance the pointers and to pick the
  ;; entry point into the 4-way unrolled loop.
          mov     eax,[esi]               ; U
          mov     ebx,[edi]               ; V  ebx = out[0]
          mul     ecx                     ; NP edx:eax = in[0]*k
          add     ebx,eax                 ; U  accumulate the low half
          lea     eax,[ebp*4-4]           ; V  eax = 4*(len-1)
          adc     edx,0                   ; U  carry from the accumulate
          and     eax,12                  ; V  eax = 4*((len-1) mod 4)
          mov     [edi],ebx               ; U
          add     esi,eax                 ; V
          add     edi,eax                 ; U
          jmp     DWORD PTR mulAdd1_table[eax] ; NP enter the unrolled loop

          align   4
  mulAdd1_table:
          dd      mulAdd1_rem0
          dd      mulAdd1_rem1
          dd      mulAdd1_rem2
          dd      mulAdd1_rem3
          nop
          align   8
          nop
          nop
          nop                             ; padding: align the loop properly

  ;; Invariant at every entry point below: edx holds the carry word and
  ;; ebp counts down from len in steps of 4.  Each step adds twice (carry
  ;; in, then the old out word), so the carry out is collected twice.
  mulAdd1_rem0:
          sub     ebp,4                   ; U
          jbe     SHORT mulAdd1_finish    ; V  nothing left after word 0
  mulAdd1_quad:
          mov     eax,[esi+4]             ; U
          mov     ebx,edx                 ; V  keep the carry across the mul
          add     esi,16                  ; U  step both pointers one group
          add     edi,16                  ; V
          mul     ecx                     ; NP
          add     eax,ebx                 ; U  fold in the previous carry
          mov     ebx,[edi-12]            ; V  old accumulator word
          adc     edx,0                   ; U
          add     ebx,eax                 ; V  accumulate
          adc     edx,0                   ; U
          mov     [edi-12],ebx            ; V
  mulAdd1_rem3:
          mov     eax,[esi-8]             ; U
          mov     ebx,edx                 ; V  keep the carry across the mul
          mul     ecx                     ; NP
          add     eax,ebx                 ; U  fold in the previous carry
          mov     ebx,[edi-8]             ; V  old accumulator word
          adc     edx,0                   ; U
          add     ebx,eax                 ; V  accumulate
          adc     edx,0                   ; U
          mov     [edi-8],ebx             ; V
  mulAdd1_rem2:
          mov     eax,[esi-4]             ; U
          mov     ebx,edx                 ; V  keep the carry across the mul
          mul     ecx                     ; NP
          add     eax,ebx                 ; U  fold in the previous carry
          mov     ebx,[edi-4]             ; V  old accumulator word
          adc     edx,0                   ; U
          add     ebx,eax                 ; V  accumulate
          adc     edx,0                   ; U
          mov     [edi-4],ebx             ; V
  mulAdd1_rem1:
          mov     eax,[esi]               ; U
          mov     ebx,edx                 ; V  keep the carry across the mul
          mul     ecx                     ; NP
          add     eax,ebx                 ; U  fold in the previous carry
          mov     ebx,[edi]               ; V  old accumulator word
          adc     edx,0                   ; U
          add     ebx,eax                 ; V  accumulate
          adc     edx,0                   ; U
          mov     [edi],ebx               ; V
          sub     ebp,4                   ; U
          ja      SHORT mulAdd1_quad      ; V  more words remain

  mulAdd1_finish:
          pop     ebx                     ; U
          pop     ebp                     ; V
          mov     eax,edx                 ; U  return the final carry word
          pop     edi                     ; V
          pop     esi                     ; U
          ret                             ; NP
  _lbnMulAdd1_32 endp
  ;; ---------------------------------------------------------------------
  ;; _lbnMulSub1_32 -- multiply-subtract: out -= in * k.
  ;;
  ;; Stack arguments at entry (near call, 32-bit offsets):
  ;;   [esp+4]   out  pointer to the number being reduced (read and written)
  ;;   [esp+8]   in   source pointer
  ;;   [esp+12]  len  number of 32-bit words to process
  ;;   [esp+16]  k    32-bit multiplier
  ;;
  ;; Effect: for i = 0..len-1, out[i] receives the low 32 bits of
  ;; out[i] - (low half of in[i]*k + carry).  Exactly len words of out are
  ;; written.
  ;;
  ;; Returns: eax = the carry/borrow word left over after the last step
  ;; (high half of the last product plus the carries and borrows folded
  ;; into it).
  ;;
  ;; Registers: esi, edi, ebp and ebx are saved and restored; ecx and edx
  ;; are overwritten.
  ;;
  ;; Assumes len >= 1: word 0 is processed unconditionally before any
  ;; length test.
  ;;
  ;; FIX: the prologue used to be followed by a second, stale copy of
  ;; itself (push esi / push edi / push ebp plus reloads of in, out, len
  ;; and k from offsets that are wrong once four registers are already
  ;; saved).  Those three extra pushes were never popped, so `ret` used a
  ;; saved register as its return address, and the pointers were loaded
  ;; from the wrong stack slots.  The duplicate is removed; the prologue
  ;; now matches _lbnMulAdd1_32 and balances the four pops at ms32_done.
  ;;
  ;; The trailing U / V / NP tags are the original author's per-instruction
  ;; scheduling notes (presumably Pentium U-pipe, V-pipe and non-pairable).
  ;; ---------------------------------------------------------------------
          align   16
  _lbnMulSub1_32 proc near
          push    esi                     ; U
          mov     esi,[esp+12]            ; V  load in
          push    edi                     ; U
          mov     edi,[esp+12]            ; V  load out
          push    ebp                     ; U
          mov     ebp,[esp+24]            ; V  load len
          push    ebx                     ; U
          mov     ecx,[esp+32]            ; V  load k

  ;; First multiply step has no carry in.  The borrow produced by the sub
  ;; is added into edx by the adc; lea in between does not touch flags.
          mov     eax,[esi]               ; U
          mov     ebx,[edi]               ; V
          mul     ecx                     ; NP first multiply
          sub     ebx,eax                 ; U
          lea     eax,[ebp*4-4]           ; V  loop unrolling
          adc     edx,0                   ; U  fold the borrow into the carry
          and     eax,12                  ; V  loop unrolling: 4*((len-1) mod 4)
          mov     [edi],ebx               ; U
          add     esi,eax                 ; V  loop unrolling
          add     edi,eax                 ; U  loop unrolling
          jmp     DWORD PTR ms32_jumptable[eax] ; NP loop unrolling

          align   4
  ms32_jumptable:
          dd      ms32_case0
          dd      ms32_case1
          dd      ms32_case2
          dd      ms32_case3
          nop
          align   8
          nop
          nop
          nop                             ; padding ahead of the loop

  ;; Invariant at every entry point below: edx holds the carry word and
  ;; ebp counts down from len in steps of 4.
  ms32_case0:
          sub     ebp,4                   ; U
          jbe     SHORT ms32_done         ; V
  ms32_loop:
          mov     eax,[esi+4]             ; U
          mov     ebx,edx                 ; V  Remember carry for later
          add     esi,16                  ; U
          add     edi,16                  ; V
          mul     ecx                     ; NP
          add     eax,ebx                 ; U  Add carry in from previous word
          mov     ebx,[edi-12]            ; V
          adc     edx,0                   ; U
          sub     ebx,eax                 ; V
          adc     edx,0                   ; U  fold the borrow into the carry
          mov     [edi-12],ebx            ; V
  ms32_case3:
          mov     eax,[esi-8]             ; U
          mov     ebx,edx                 ; V  Remember carry for later
          mul     ecx                     ; NP
          add     eax,ebx                 ; U  Add carry in from previous word
          mov     ebx,[edi-8]             ; V
          adc     edx,0                   ; U
          sub     ebx,eax                 ; V
          adc     edx,0                   ; U  fold the borrow into the carry
          mov     [edi-8],ebx             ; V
  ms32_case2:
          mov     eax,[esi-4]             ; U
          mov     ebx,edx                 ; V  Remember carry for later
          mul     ecx                     ; NP
          add     eax,ebx                 ; U  Add carry in from previous word
          mov     ebx,[edi-4]             ; V
          adc     edx,0                   ; U
          sub     ebx,eax                 ; V
          adc     edx,0                   ; U  fold the borrow into the carry
          mov     [edi-4],ebx             ; V
  ms32_case1:
          mov     eax,[esi]               ; U
          mov     ebx,edx                 ; V  Remember carry for later
          mul     ecx                     ; NP
          add     eax,ebx                 ; U  Add carry in from previous word
          mov     ebx,[edi]               ; V
          adc     edx,0                   ; U
          sub     ebx,eax                 ; V
          adc     edx,0                   ; U  fold the borrow into the carry
          mov     [edi],ebx               ; V
          sub     ebp,4                   ; U
          ja      SHORT ms32_loop         ; V

  ms32_done:
          pop     ebx                     ; U
          pop     ebp                     ; V
          mov     eax,edx                 ; U  return the final carry/borrow word
          pop     edi                     ; V
          pop     esi                     ; U
          ret                             ; NP
  _lbnMulSub1_32 endp
  324. ;; Two-word by one-word divide. Stores quotient, returns remainder.
  325. ;; BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
  326. ;; 4 8 12 16
  ;; ---------------------------------------------------------------------
  ;; _lbnDiv21_32 -- divide the 64-bit value nh:nl by the 32-bit value d.
  ;;
  ;; Stack arguments at entry (no registers are pushed, so these offsets
  ;; hold for the whole routine):
  ;;   [esp+4]   q   where the 32-bit quotient is stored
  ;;   [esp+8]   nh  high word of the dividend
  ;;   [esp+12]  nl  low word of the dividend
  ;;   [esp+16]  d   divisor
  ;;
  ;; Returns: eax = remainder; *q = quotient.
  ;; Registers: eax, ecx and edx are overwritten; nothing is saved.
  ;;
  ;; The `div` instruction raises a divide error when d is zero or the
  ;; quotient does not fit in 32 bits (nh >= d); nothing here checks for
  ;; that, so it is left to the caller.
  ;; ---------------------------------------------------------------------
          align   4
  _lbnDiv21_32 proc near
          mov     ecx,[esp+4]             ; ecx = q
          mov     edx,[esp+8]             ; edx = nh
          mov     eax,[esp+12]            ; eax = nl
          div     DWORD PTR [esp+16]      ; eax = quotient, edx = remainder
          mov     [ecx],eax               ; *q = quotient
          mov     eax,edx                 ; hand back the remainder
          ret
  _lbnDiv21_32 endp
  337. ;; Multi-word by one-word remainder.
  338. ;; This speeds up key generation. It's not worth unrolling and so on;
  339. ;; using 32-bit divides is enough of a speedup.
  340. ;;
  341. ;; The modulus (in ecx) is often 16 bits. Given that the dividend is 32
  342. ;; bits, the chances of saving the first divide because the high word of the
  343. ;; dividend is less than the modulus are low enough it's not worth taking
  344. ;; the cycles to test for it.
  345. ;;
  346. ;; unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
  347. ;; 4 8 12
  ;; ---------------------------------------------------------------------
  ;; _lbnModQ_32 -- remainder of a multi-word number divided by one word.
  ;;
  ;; Stack arguments at entry:
  ;;   [esp+4]   n    pointer to the words of the number
  ;;   [esp+8]   len  number of 32-bit words
  ;;   [esp+12]  d    divisor
  ;;
  ;; Returns: eax = n mod d.
  ;;
  ;; Method: words are read from n[len-1] down to n[0].  Each `div` divides
  ;; (previous remainder):(current word) by d; the quotient in eax is
  ;; discarded and the remainder in edx feeds the next step.  Since that
  ;; remainder is always below d, only a zero divisor can fault.
  ;;
  ;; Registers: ebp and esi are saved and restored; ecx and edx are
  ;; overwritten.
  ;;
  ;; Assumes len >= 1: the loop body runs before the count is tested, so
  ;; a zero len would wrap the counter instead of returning immediately.
  ;;
  ;; The trailing U / V / NP tags are the original author's per-instruction
  ;; scheduling notes (presumably Pentium U-pipe, V-pipe and non-pairable).
  ;; ---------------------------------------------------------------------
          align   4
  _lbnModQ_32 proc near
          mov     eax,[esp+4]             ; U  eax = n
          push    ebp                     ; V
          mov     ebp,[esp+12]            ; U  ebp = len (word counter)
          push    esi                     ; V
          lea     esi,[ebp*4+eax-4]       ; U  esi = &n[len-1]
          mov     ecx,[esp+20]            ; V  ecx = d
          xor     edx,edx                 ; U  remainder starts at zero
  modq32_next_word:
          mov     eax,[esi]               ; U  next lower word of n
          sub     esi,4                   ; V  walk down towards n[0]
          div     ecx                     ; NP edx = edx:eax mod ecx
          dec     ebp                     ; U
          jnz     SHORT modq32_next_word  ; V
          pop     esi                     ; U
          mov     eax,edx                 ; V  return the remainder
          pop     ebp                     ; U
          ret                             ; NP
  _lbnModQ_32 endp
  368. end