2
0

lbn8086.asm 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038
  1. ;;; Copyright (c) 1995, Colin Plumb.
  2. ;;; For licensing and other legal details, see the file legal.c.
  3. ;;;
  4. ;;; Assembly primitives for bignum library, 80x86 family.
  5. ;;;
  6. ;;; Several primitives are included here. Only lbnMulAdd1 is *really*
  7. ;;; critical, but once that's written, lnmMul1 and lbnSub1 are quite
  8. ;;; easy to write as well, so they are included here as well.
  9. ;;; lbnDiv21 and lbnModQ are so easy to write that they're included, too.
  10. ;;;
  11. ;;; All functions here are for large code, large data.
  12. ;;; All use standard "cdecl" calling convention: arguments pushed on the
  13. ;;; stack (ss:sp) right to left (the leftmost agrument at the lowest address)
  14. ;;; and popped by the caller, return values in ax or dx:ax, and register
  15. ;;; usage as follows:
  16. ;;;
  17. ;;; Callee-save (preserved by callee if needed):
  18. ;;; ss, esp, cs, eip, ds, esi, edi, ebp, high byte of FLAGS except DF,
  19. ;;; all other registers (CRx, DRx, TRx, IDT, GDT, LDT, TR, etc.).
  20. ;;; Caller-save (may be corrupted by callee):
  21. ;;; es, eax, ebx, ecx, edx, low byte of flags (SF, ZF, AF, PF, CF)
  22. ;;;
  23. ;;; The direction flag (DF) is either preserved or cleared.
  24. ;;; I'm not sure what the calling convention is for fs and gs. This
  25. ;;; code never alters them.
  26. ;; Not all of this code has to be '386 code, but STUPID FUCKING MASM (5.0)
  27. ;; gives an error if you change in the middle of a segment. Rather than
  28. ;; fight the thing, just enable '386 instructions everywhere. (And lose
  29. ;; the error checking.)
  30. .386
  31. _TEXT segment para public use16 'CODE' ; 16-byte aligned because '486 cares
  32. assume cs:_TEXT
  33. public _lbnMulN1_16
  34. public _lbnMulAdd1_16
  35. public _lbnMulSub1_16
  36. public _lbnDiv21_16
  37. public _lbnModQ_16
  38. public _lbnMulN1_32
  39. public _lbnMulAdd1_32
  40. public _lbnMulSub1_32
  41. public _lbnDiv21_32
  42. public _lbnModQ_32
  43. public _not386
  44. ;; Prototype:
  45. ;; BNWORD16
  46. ;; lbnMulAdd_16(BNWORD16 *out, BNWORD16 *in, unsigned len, BNWORD16 k)
  47. ;;
  48. ;; Multiply len words of "in" by k and add to len words of "out";
  49. ;; return the len+1st word of carry. All pointers are to the least-
  50. ;; significant ends of the appropriate arrays. len is guaraneed > 0.
  51. ;;
  52. ;; This 16-bit code is optimized for an 8086/80286. It will not be run
  53. ;; on 32-bit processors except for debugging during development.
  54. ;;
  55. ;; NOTE that it may be possible to assume that the direction flag is clear
  56. ;; on entry; this would avoid the need for the cld instructions. Hoewever,
  57. ;; the Microsoft C libraries require that the direction flag be clear.
  58. ;; Thus, lbnModQ_16 clears it before returning.
  59. ;;
  60. ;; Stack frame:
  61. ;; +--------+ bp+18
  62. ;; | k |
  63. ;; +--------+ bp+16
  64. ;; | len |
  65. ;; +--------+ bp+14
  66. ;; | |
  67. ;; +- in -+
  68. ;; | |
  69. ;; +--------+ bp+10
  70. ;; | |
  71. ;; +- out -+
  72. ;; | |
  73. ;; +--------+ bp+6
  74. ;; | |
  75. ;; +-return-+
  76. ;; | |
  77. ;; +--------+ bp+2
  78. ;; | old bp |
  79. ;; +--------+ bp
  80. ;;
  81. ;; Register usage for lbnMul1_16:
  82. ;; ds:[si] in
  83. ;; es:[di] out
  84. ;; bp k
  85. ;; cx loop counter (len/4)
  86. ;; dx,ax high,low parts of product
  87. ;; bx carry from previous multiply iteration
  88. ;;
  89. ;; Register usage for lbnMulAdd1_16 and lbnMulSub1_16:
  90. ;; ds:[si] in
  91. ;; es:[bx+si] out
  92. ;; bp k
  93. ;; cx loop counter (len/4)
  94. ;; dx,ax high,low parts of product
  95. ;; di carry from previous multiply iteration
  96. ;;
  97. ;; The reson for the difference is that straight mul can use stosw, but
  98. ;; the multiply and add or multiply and subtract add the result in, so
  99. ;; they have to reference es:[di] to add it in.
  100. ;;
  101. ;; The options are either "add ax,es:[di]; stosw" or "add es:[di],ax;
  102. ;; add di,2"; both take 10 cycles on an 80286, 27 on an 8086 and 35 on
  103. ;; an 8088 although the former is preferred since it's one byte smaller.
  104. ;; However, using [bx+si] is even faster; "add es:[bx+si],ax" takes
  105. ;; 7 cycles on an 80286, 25 on an 8086 and 33 on an 8088, as well as
  106. ;; being the smallest. (Of course, stosw, at 3 on an 80286, 11 on an
  107. ;; 8086 amd 15 on an 8088 wins easily in the straight multiply case over
  108. ;; mov es:[bx+si],ax, which takes 3/18/22 cycles and is larger to boot.)
  109. ;;
  110. ;; Most of these register assignments are driven by the 8086's instruction
  111. ;; set. The only really practical variation would be to put the multiplier
  112. ;; k into bx or di and use bp for carry, but if someone can make a faster
  113. ;; Duff's device using a lookup table, bx and di are useful because indexing
  114. ;; off them is more flexible than bp.
  115. ;;
  116. ;; Overview of code:
  117. ;;
  118. ;; len is guaranteed to be at least 1, so do the first multiply (with no
  119. ;; carry in) unconditionally. Then go to a min loop unrolled 4 times,
  120. ;; jumping into the middle using a variant of Duff's device.
  121. ;;
  122. ;; The loop is constructed using the loop instruction, which does
  123. ;; "} while (--cnt)". This means that we have to divide the count
  124. ;; by 4, and increment it so it doesn't start at 0. To gain a little
  125. ;; bit more efficiency, we actually increment the count by 2, so the
  126. ;; minimum possible value is 3, which will be shifted down to produce 0.
  127. ;; usually in Duff's device, if the number of iterations is a multiple
  128. ;; of the unrolling factor, you branch to just before the loop conditional
  129. ;; and let it handle the case of 0. Here, we have a special test for 0
  130. ;; at the head of the loop and fall through into the top of the loop
  131. ;; if it passes.
  132. ;;
  133. ;; Basically, with STEP being a multiply step, it's:
  134. ;;
  135. ;; STEP;
  136. ;; count += 2;
  137. ;; mod4 = count % 4;
  138. ;; count /= 4;
  139. ;; switch(mod4) {
  140. ;; case 3:
  141. ;; if (count) {
  142. ;; do {
  143. ;; STEP;
  144. ;; case 2:
  145. ;; STEP;
  146. ;; case 1:
  147. ;; STEP;
  148. ;; case 0:
  149. ;; STEP;
  150. ;; } while (--count);
  151. ;; }
  152. ;; }
  153. ;;
  154. ;; The switch() is actually done by two levels of branch instructions
  155. ;; rather than a lookup table.
  156. _lbnMulN1_16 proc far
  157. push bp
  158. mov bp,sp
  159. push ds
  160. push si
  161. push di
  162. cld
  163. les di,[bp+6] ; out
  164. lds si,[bp+10] ; in
  165. mov cx,[bp+14] ; len
  166. mov bp,[bp+16] ; k
  167. ;; First multiply step has no carry in
  168. lodsw
  169. mul bp
  170. stosw
  171. ;; The switch() for Duff's device starts here
  172. ;; Note: this *is* faster than a jump table for an 8086 and '286.
  173. ;; 8086: jump table: 44 cycles; this: 27/29/31/41
  174. ;; 80286: jump table: 25 cycles; this: 17/17/20/22
  175. shr cx,1
  176. jc SHORT m16_odd
  177. inc cx
  178. shr cx,1
  179. jc SHORT m16_case2
  180. jmp SHORT m16_case0
  181. nop ; To align loop
  182. m16_odd:
  183. inc cx
  184. shr cx,1
  185. jnc SHORT m16_case1
  186. jz SHORT m16_done ; Avoid entire loop in this case
  187. m16_loop:
  188. lodsw
  189. mov bx,dx ; Remember carry for later
  190. mul bp
  191. add ax,bx ; Add carry in from previous word
  192. adc dx,0
  193. stosw
  194. m16_case2:
  195. lodsw
  196. mov bx,dx ; Remember carry for later
  197. mul bp
  198. add ax,bx ; Add carry in from previous word
  199. adc dx,0
  200. stosw
  201. m16_case1:
  202. lodsw
  203. mov bx,dx ; Remember carry for later
  204. mul bp
  205. add ax,bx ; Add carry in from previous word
  206. adc dx,0
  207. stosw
  208. m16_case0:
  209. lodsw
  210. mov bx,dx ; Remember carry for later
  211. mul bp
  212. add ax,bx ; Add carry in from previous word
  213. adc dx,0
  214. stosw
  215. loop m16_loop
  216. m16_done:
  217. mov ax,dx
  218. stosw ; Store last word
  219. pop di
  220. pop si
  221. pop ds
  222. pop bp
  223. ret
  224. _lbnMulN1_16 endp
  225. align 2
  226. _lbnMulAdd1_16 proc far
  227. push bp
  228. mov bp,sp
  229. push ds
  230. push si
  231. push di
  232. cld
  233. les bx,[bp+6] ; out
  234. lds si,[bp+10] ; in
  235. mov cx,[bp+14] ; len
  236. mov bp,[bp+16] ; k
  237. ;; First multiply step has no carry in
  238. lodsw
  239. mul bp
  240. add es:[bx],ax ; This time, store in [bx] directly
  241. adc dx,0
  242. sub bx,si ; Prepare to use [bx+si].
  243. ;; The switch() for Duff's device starts here
  244. ;; Note: this *is* faster than a jump table for an 8086 and '286.
  245. ;; 8086: jump table: 44 cycles; this: 27/29/31/41
  246. ;; 80286: jump table: 25 cycles; this: 17/17/20/22
  247. shr cx,1
  248. jc SHORT ma16_odd
  249. inc cx
  250. shr cx,1
  251. jc SHORT ma16_case2
  252. jmp SHORT ma16_case0
  253. ma16_odd:
  254. inc cx
  255. shr cx,1
  256. jnc SHORT ma16_case1
  257. jz SHORT ma16_done ; Avoid entire loop in this case
  258. ma16_loop:
  259. lodsw
  260. mov di,dx ; Remember carry for later
  261. mul bp
  262. add ax,di ; Add carry in from previous word
  263. adc dx,0
  264. add es:[bx+si],ax
  265. adc dx,0
  266. ma16_case2:
  267. lodsw
  268. mov di,dx ; Remember carry for later
  269. mul bp
  270. add ax,di ; Add carry in from previous word
  271. adc dx,0
  272. add es:[bx+si],ax
  273. adc dx,0
  274. ma16_case1:
  275. lodsw
  276. mov di,dx ; Remember carry for later
  277. mul bp
  278. add ax,di ; Add carry in from previous word
  279. adc dx,0
  280. add es:[bx+si],ax
  281. adc dx,0
  282. ma16_case0:
  283. lodsw
  284. mov di,dx ; Remember carry for later
  285. mul bp
  286. add ax,di ; Add carry in from previous word
  287. adc dx,0
  288. add es:[bx+si],ax
  289. adc dx,0
  290. loop ma16_loop
  291. ma16_done:
  292. mov ax,dx
  293. pop di
  294. pop si
  295. pop ds
  296. pop bp
  297. ret
  298. _lbnMulAdd1_16 endp
  299. align 2
  300. _lbnMulSub1_16 proc far
  301. push bp
  302. mov bp,sp
  303. push ds
  304. push si
  305. push di
  306. cld
  307. les bx,[bp+6] ; out
  308. lds si,[bp+10] ; in
  309. mov cx,[bp+14] ; len
  310. mov bp,[bp+16] ; k
  311. ;; First multiply step has no carry in
  312. lodsw
  313. mul bp
  314. sub es:[bx],ax ; This time, store in [bx] directly
  315. adc dx,0
  316. sub bx,si ; Prepare to use [bx+si].
  317. ;; The switch() for Duff's device starts here
  318. ;; Note: this *is* faster than a jump table for an 8086 and '286.
  319. ;; 8086: jump table: 44 cycles; this: 27/29/31/41
  320. ;; 80286: jump table: 25 cycles; this: 17/17/20/22
  321. shr cx,1
  322. jc SHORT ms16_odd
  323. inc cx
  324. shr cx,1
  325. jc SHORT ms16_case2
  326. jmp SHORT ms16_case0
  327. ms16_odd:
  328. inc cx
  329. shr cx,1
  330. jnc SHORT ms16_case1
  331. jz SHORT ms16_done ; Avoid entire loop in this case
  332. ms16_loop:
  333. lodsw
  334. mov di,dx ; Remember carry for later
  335. mul bp
  336. add ax,di ; Add carry in from previous word
  337. adc dx,0
  338. sub es:[bx+si],ax
  339. adc dx,0
  340. ms16_case2:
  341. lodsw
  342. mov di,dx ; Remember carry for later
  343. mul bp
  344. add ax,di ; Add carry in from previous word
  345. adc dx,0
  346. sub es:[bx+si],ax
  347. adc dx,0
  348. ms16_case1:
  349. lodsw
  350. mov di,dx ; Remember carry for later
  351. mul bp
  352. add ax,di ; Add carry in from previous word
  353. adc dx,0
  354. sub es:[bx+si],ax
  355. adc dx,0
  356. ms16_case0:
  357. lodsw
  358. mov di,dx ; Remember carry for later
  359. mul bp
  360. add ax,di ; Add carry in from previous word
  361. adc dx,0
  362. sub es:[bx+si],ax
  363. adc dx,0
  364. loop ms16_loop
  365. ms16_done:
  366. mov ax,dx
  367. pop di
  368. pop si
  369. pop ds
  370. pop bp
  371. ret
  372. _lbnMulSub1_16 endp
  373. ;; Two-word by one-word divide. Stores quotient, returns remainder.
  374. ;; BNWORD16 lbnDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d)
  375. ;; 4 8 10 12
  376. align 2
  377. _lbnDiv21_16 proc far
  378. mov cx,bp ; bp NOT pushed; note change in offsets
  379. mov bp,sp
  380. mov dx,[bp+8]
  381. mov ax,[bp+10]
  382. div WORD PTR [bp+12]
  383. les bx,[bp+4]
  384. mov es:[bx],ax
  385. mov ax,dx
  386. mov bp,cx
  387. ret
  388. nop ; To align loop in lbnModQ properly
  389. _lbnDiv21_16 endp
  390. ;; Multi-word by one-word remainder.
  391. ;; BNWORD16 lbnModQ_16(BNWORD16 *q, unsigned len, unsigned d)
  392. ;; 6 10 12
  393. _lbnModQ_16 proc far
  394. push bp
  395. mov bp,sp
  396. push ds
  397. mov bx,si
  398. mov cx,10[bp] ; load len
  399. lds si,6[bp] ; load q
  400. std ; loop MSW to LSW
  401. add si,cx
  402. mov bp,12[bp] ; load d
  403. add si,cx
  404. xor dx,dx ; Set up for first divide
  405. sub si,2 ; Adjust pointer to point to MSW
  406. lodsw ; Load first word
  407. cmp ax,bp ; See if we can skip first divide
  408. jnc SHORT modq16_inner ; No such luck
  409. mov dx,ax ; Yes! Modulus > input, so remainder = input
  410. dec cx ; Do loop
  411. jz SHORT modq16_done
  412. modq16_loop:
  413. lodsw
  414. modq16_inner:
  415. div bp
  416. loop modq16_loop
  417. modq16_done:
  418. pop ds
  419. mov ax,dx ; Return remainder
  420. pop bp
  421. mov si,bx
  422. cld ; Microsoft C's libraries assume this
  423. ret
  424. _lbnModQ_16 endp
  425. ;; Similar, but using 32-bit operations.
  426. ;;
  427. ;; The differences are that the switch() in Duff's device is done using
  428. ;; a jump table, and lods is not used because it's slower than load and
  429. ;; increment. The pointers are only updated once per loop; offset
  430. ;; addressing modes are used, since they're no slower. [di] is used
  431. ;; instead of [bx+si] because the extra increment of di take only one
  432. ;; cycle per loop a '486, while [bx+si] takes one extra cycle per multiply.
  433. ;;
  434. ;; The register assignments are also slightly different:
  435. ;;
  436. ;; es:[si] in
  437. ;; ds:[di] out
  438. ;; ecx k
  439. ;; bp loop counter (len/4)
  440. ;; edx,eax high,low parts of product
  441. ;; ebx carry word from previous multiply iteration
  442. ;;
  443. ;; The use of bp for a loop counter lets all the 32-bit values go
  444. ;; in caller-save registers, so there's no need to do any 32-bit
  445. ;; saves and restores. Using ds:di for the destination saves one
  446. ;; segment override in the lbnMulN1_32 code, since there's one more
  447. ;; store to [di] than load from es:[si].
  448. ;;
  449. ;; Given the number of 32-bit references that this code uses, optimizing
  450. ;; it for the Pentium is interesting, because the Pentium has a very
  451. ;; inefficient implementation of prefix bytes. Each prefix byte, with
  452. ;; the exception of 0x0f *>> on conditional branch instructions ONLY <<*
  453. ;; is a 1-cycle non-pairiable instruction. Which has the effect of
  454. ;; forcing the instruction it's on into the U pipe. But this code uses
  455. ;; *lots* of prefix bytes, notably the 0x66 operand size override.
  456. ;;
  457. ;; For example "add [di],eax" is advised against in Intel's optimization
  458. ;; papers, because it takes 3 cycles and 2 of them are not pairable.
  459. ;; But any longer sequence would have a prefix byte on every instruction,
  460. ;; resulting in even more non-pairable cycles. Also, only two instructions
  461. ;; in the multiply kernel can go in the V pipe (the increments of si and
  462. ;; di), and they're already there, so the pairable cycles would be wasted.
  463. ;;
  464. ;; Things would be *quite* different in native 32-bit mode.
  465. ;;
  466. ;; All instructions that could go in the V pipe that aren't there are
  467. ;; marked.
  468. ;;
  469. ;; The setup code is quite intricately interleaved to get the best possible
  470. ;; performance out of a Pentium. If you want to follow the code,
  471. ;; pretend that the sections actually come in the following order:
  472. ;; 1) prologue (push registers)
  473. ;; 2) load (fetch arguments)
  474. ;; 3) first multiply
  475. ;; 4) loop unrolling
  476. ;;
  477. ;; The loop unrolling setup consists of taking the count, adjusting
  478. ;; it to account for the first multiply, and splitting it into
  479. ;; two parts: the high bits are a loop count, while the low bits are
  480. ;; used to find the right entry in the Duff's device jump table and
  481. ;; to adjust the initial data pointers.
  482. ;;
  483. ;; Known slack: There is one instruction in the prologue and one in
  484. ;; the epilogue that could go in the V pipe if I could find a U-pipe
  485. ;; instruction to pair them with, but all the U-pipe instructions
  486. ;; are already paired, so it looks difficult.
  487. ;;
  488. ;; There is a cycle of Address Generation Interlock in the lbnMulN1_32
  489. ;; code on the Pentium (not on a '486). I can't figure out how to
  490. ;; get rid of it without wasting time elsewhere. The problem is that
  491. ;; the load of bx needs to be done as soon as possible to let it
  492. ;; be set up in time for the switch(). The other problem is the
  493. ;; epilogue code which can waste time if the order of the pushed
  494. ;; registers is diddled with so that ds doesn't come between si and di.
  495. ;;
  496. ;; The increment of si after the last load is redundant, and the
  497. ;; copy of the high word of the product to the carry after the last
  498. ;; multiply is likewise unnecessary.
  499. ;;
  500. ;; In these cases, the operations were done that way in order to remove
  501. ;; cycles from the loop on the '486 and/or Pentium, even though it costs
  502. ;; a few overhead cycles on a '386.
  503. ;; The increment fo si has to be done early because a load based on si
  504. ;; is the first thing in any given multiply step, and the address
  505. ;; generation interlock on the '486 and Pentium requires that a full
  506. ;; cycle (i.e. possibly two instructions on a Pentium) pass between
  507. ;; incrementing a register and using it in an address.
  508. ;; This saves one cycle per multiply on a '486 and Pentium, and costs
  509. ;; 2 cycles per call to the function on a '386 and 1 cycle on a '486.
  510. ;;
  511. ;; The carry word is copied where it is so that the decrement of the loop
  512. ;; counter happens in the V pipe. The instruction between the decrement
  513. ;; of the loop counter and the branch should be a U-pipe instruction that
  514. ;; doesn't affect the flags. Thus, the "mov" was rotated down from
  515. ;; the top of the loop to fill the slot.
  516. ;; This is a bit more marginal: it saves one cycle per loop iteration on
  517. ;; a Pentium, and costs 2 cycles per call on a '386, '486 or Pentium.
  518. ;;
  519. ;; The same logic applies to the copy of the carry and increment of si
  520. ;; before the test, in case 0, for skipping the loop entirely.
  521. ;; It makes no difference in speed if the loop is executed, but
  522. ;; incrementing si before saves an address generation interlock cycle
  523. ;; On a '486 and Pentium in the case that the loop is executed.
  524. ;; And the loop is executed more often than not.
  525. ;;
  526. ;; Given that just one multiply on a '386 takes 12 to 41 cycles (with the
  527. ;; average being very much at the high end of that) 4 cycles of additional
  528. ;; overhead per call is not a big deal.
  529. ;;
  530. ;; On a Pentium, it would actually be easier to *not* unroll the loop
  531. ;; at all, since the decrement and compare are completely hidden
  532. ;; in the V-pipe and it wouldn't cost anything to do them more often.
  533. ;; That would save the setup for the unrolling and Duff's device at the
  534. ;; beginning. But the overhead for that is pretty minor: ignoring what's
  535. ;; hidden in the V pipe, it's two cycles plus the indirect jump.
  536. ;; Not too much, and special-casing the pentium is quite a hassle.
  537. ;; (For starters, you have to detect it, and since you're probably in
  538. ;; V86 mode, without access to the EFLAGS register to test the CPUID bit.)
  539. align 16
  540. _lbnMulN1_32 proc far
  541. push bp ; U prologue ** Could be V
  542. mov bp,sp ; V prologue
  543. push si ; U prologue ** Could be V
  544. mov bx,[bp+14] ; U load len ** Could be V (AGI!)r
  545. push ds ; NP prologue
  546. les si,[bp+10] ; NP load in
  547. mov ecx,[bp+16] ; U load k
  548. dec bx ; V loop unrolling
  549. shl bx,2 ; U loop unrolling
  550. push di ; V prologue
  551. lds di,[bp+6] ; NP load out
  552. mov bp,bx ; U loop unrolling ** Could be V
  553. and bx,12 ; V loop unrolling
  554. ;; First multiply step has no carry in.
  555. mov eax,es:[si] ; U first multiply
  556. add si,bx ; V loop unrolling
  557. mul ecx ; NP first multiply
  558. mov [di],eax ; U first multiply
  559. add di,bx ; V loop unrolling
  560. ;; The switch() for Duff's device. This jump table is (slightly!) faster
  561. ;; than a bunch of branches on a '386 and '486, and is probably better yet
  562. ;; on higher processors.
  563. jmp WORD PTR cs:m32_jumptable[bx] ; NP loop unrolling
  564. align 2
  565. m32_jumptable:
  566. dw OFFSET m32_case0, 0
  567. dw OFFSET m32_case1, 0
  568. dw OFFSET m32_case2, 0
  569. dw OFFSET m32_case3, 0, 0, 0, 0 ; Get loop aligned properly
  570. m32_case0:
  571. add si,16 ; U Fix up si ** Could be V
  572. test bp,bp ; V
  573. mov ebx,edx ; U Remember carry for later
  574. jbe SHORT m32_done ; V Avoid entire loop if loop count is 0
  575. m32_loop:
  576. mov eax,es:[si-12] ; U
  577. add di, 16 ; V
  578. mul ecx ; NP
  579. add eax,ebx ; U Add carry in from previous word
  580. adc edx,0 ; U
  581. mov [di-12],eax ; U
  582. m32_case3:
  583. mov ebx,edx ; U Remember carry for later
  584. mov eax,es:[si-8] ; U
  585. mul ecx ; NP
  586. add eax,ebx ; U Add carry in from previous word
  587. adc edx,0 ; U
  588. mov [di-8],eax ; U
  589. m32_case2:
  590. mov ebx,edx ; U Remember carry for later
  591. mov eax,es:[si-4] ; U
  592. mul ecx ; NP
  593. add eax,ebx ; U Add carry in from previous word
  594. adc edx,0 ; U
  595. mov [di-4],eax ; U
  596. m32_case1:
  597. mov ebx,edx ; U Remember carry for later
  598. mov eax,es:[si] ; U
  599. mul ecx ; NP
  600. add eax,ebx ; U Add carry in from previous word
  601. adc edx,0 ; U
  602. add si,16 ; V
  603. mov [di],eax ; U
  604. sub bp,16 ; V
  605. mov ebx,edx ; U Remember carry for later
  606. ja m32_loop ; V
  607. m32_done:
  608. mov [di+4],edx ; U
  609. pop di ; V
  610. pop ds ; NP
  611. pop si ; U ** Could be V
  612. pop bp ; V
  613. ret ; NP
  614. _lbnMulN1_32 endp
  615. align 16
  616. _lbnMulAdd1_32 proc far
  617. push bp ; U prologue ** Could be V
  618. mov bp,sp ; V prologue
  619. push ds ; NP prologue
  620. mov ecx,[bp+16] ; U load k
  621. mov bx,[bp+14] ; V load len
  622. push di ; U prologue ** Could be V
  623. dec bx ; V loop unrolling
  624. lds di,[bp+6] ; NP load out
  625. shl bx,2 ; U loop unrolling
  626. push si ; V prologue
  627. les si,[bp+10] ; NP load in
  628. mov bp,bx ; U loop unrolling ** Could be V
  629. and bx,12 ; V loop unrolling
  630. ;; First multiply step has no carry in.
  631. mov eax,es:[si] ; U first multiply
  632. add si,bx ; V loop unrolling
  633. mul ecx ; NP first multiply
  634. add [di],eax ; U first multiply
  635. adc edx,0 ; U first multiply
  636. add di,bx ; V loop unrolling
  637. ;; The switch() for Duff's device. This jump table is (slightly!) faster
  638. ;; than a bunch of branches on a '386 and '486, and is probably better yet
  639. ;; on higher processors.
  640. jmp WORD PTR cs:ma32_jumptable[bx] ; NP loop unrolling
  641. align 2
  642. ma32_jumptable:
  643. dw OFFSET ma32_case0, 0
  644. dw OFFSET ma32_case1, 0
  645. dw OFFSET ma32_case2, 0
  646. dw OFFSET ma32_case3, 0, 0 ; To get loop aligned properly
  647. ma32_case0:
  648. add si,16 ; U Fix up si ** Could be V
  649. test bp,bp ; V
  650. mov ebx,edx ; U Remember carry for later
  651. jbe SHORT ma32_done ; V Avoid entire loop if loop count is 0
  652. ma32_loop:
  653. mov eax,es:[si-12] ; U
  654. add di, 16 ; V
  655. mul ecx ; NP
  656. add eax,ebx ; U Add carry in from previous word
  657. adc edx,0 ; U
  658. add [di-12],eax ; U
  659. adc edx,0 ; U
  660. ma32_case3:
  661. mov ebx,edx ; U Remember carry for later
  662. mov eax,es:[si-8] ; U
  663. mul ecx ; NP
  664. add eax,ebx ; U Add carry in from previous word
  665. adc edx,0 ; U
  666. add [di-8],eax ; U
  667. adc edx,0 ; U
  668. ma32_case2:
  669. mov ebx,edx ; U Remember carry for later
  670. mov eax,es:[si-4] ; U
  671. mul ecx ; NP
  672. add eax,ebx ; U Add carry in from previous word
  673. adc edx,0 ; U
  674. add [di-4],eax ; U
  675. adc edx,0 ; U
  676. ma32_case1:
  677. mov ebx,edx ; U Remember carry for later
  678. mov eax,es:[si] ; U
  679. mul ecx ; NP
  680. add eax,ebx ; U Add carry in from previous word
  681. adc edx,0 ; U
  682. add si,16 ; V
  683. add [di],eax ; U
  684. adc edx,0 ; U
  685. sub bp,16 ; V
  686. mov ebx,edx ; U Remember carry for later
  687. ja ma32_loop ; V
  688. ma32_done:
  689. pop si ; U ** Could be V
  690. pop di ; V
  691. mov ax,dx ; U return value low ** Could be V
  692. pop ds ; NP
  693. shr edx,16 ; U return value high
  694. pop bp ; V
  695. ret ; NP
  696. _lbnMulAdd1_32 endp
  697. align 16
  698. _lbnMulSub1_32 proc far
  699. push bp ; U prologue ** Could be V
  700. mov bp,sp ; V prologue
  701. push ds ; NP prologue
  702. mov ecx,[bp+16] ; U load k
  703. mov bx,[bp+14] ; V load len
  704. push di ; U prologue ** Could be V
  705. dec bx ; V loop unrolling
  706. lds di,[bp+6] ; NP load out
  707. shl bx,2 ; U loop unrolling
  708. push si ; V prologue
  709. les si,[bp+10] ; NP load in
  710. mov bp,bx ; U loop unrolling ** Could be V
  711. and bx,12 ; V loop unrolling
  712. ;; First multiply step has no carry in.
  713. mov eax,es:[si] ; U first multiply
  714. add si,bx ; V loop unrolling
  715. mul ecx ; NP first multiply
  716. sub [di],eax ; U first multiply
  717. adc edx,0 ; U first multiply
  718. add di,bx ; V loop unrolling
  719. ;; The switch() for Duff's device. This jump table is (slightly!) faster
  720. ;; than a bunch of branches on a '386 and '486, and is probably better yet
  721. ;; on higher processors.
  722. jmp WORD PTR cs:ms32_jumptable[bx] ; NP loop unrolling
  723. align 2
  724. ms32_jumptable:
  725. dw OFFSET ms32_case0, 0
  726. dw OFFSET ms32_case1, 0
  727. dw OFFSET ms32_case2, 0
  728. dw OFFSET ms32_case3, 0, 0 ; To get loop aligned properly
  729. ms32_case0:
  730. add si,16 ; U Fix up si ** Could be V
  731. test bp,bp ; V
  732. mov ebx,edx ; U Remember carry for later
  733. jbe SHORT ms32_done ; V Avoid entire loop if loop count is 0
  734. ms32_loop:
  735. mov eax,es:[si-12] ; U
  736. add di, 16 ; V
  737. mul ecx ; NP
  738. add eax,ebx ; U Add carry in from previous word
  739. adc edx,0 ; U
  740. sub [di-12],eax ; U
  741. adc edx,0 ; U
  742. ms32_case3:
  743. mov ebx,edx ; U Remember carry for later
  744. mov eax,es:[si-8] ; U
  745. mul ecx ; NP
  746. add eax,ebx ; U Add carry in from previous word
  747. adc edx,0 ; U
  748. sub [di-8],eax ; U
  749. adc edx,0 ; U
  750. ms32_case2:
  751. mov ebx,edx ; U Remember carry for later
  752. mov eax,es:[si-4] ; U
  753. mul ecx ; NP
  754. add eax,ebx ; U Add carry in from previous word
  755. adc edx,0 ; U
  756. sub [di-4],eax ; U
  757. adc edx,0 ; U
  758. ms32_case1:
  759. mov ebx,edx ; U Remember carry for later
  760. mov eax,es:[si] ; U
  761. mul ecx ; NP
  762. add eax,ebx ; U Add carry in from previous word
  763. adc edx,0 ; U
  764. add si,16 ; V
  765. sub [di],eax ; U
  766. adc edx,0 ; U
  767. sub bp,16 ; V
  768. mov ebx,edx ; U Remember carry for later
  769. ja ms32_loop ; V
  770. ms32_done:
  771. pop si ; U ** Could be V
  772. pop di ; V
  773. mov ax,dx ; U return value low ** Could be V
  774. pop ds ; NP
  775. shr edx,16 ; U return value high
  776. pop bp ; V
  777. ret ; NP
  778. _lbnMulSub1_32 endp
  779. ;; Just for interest's sake, here's a completely Pentium-optimized version.
  780. ;; In addition to being smaller, it takes 8 + (8+mul_time)*n cycles, as
  781. ;; compared to the 10 + jmp_time + (8+mul_time)*n cycles for the loop above.
  782. ;; (I don't know how long a 32x32->64 bit multiply or an indirect jump
  783. ;; take on a Pentium, so plug those numbers in.)
  784. ; align 2
  785. ; nop ; To align loop nicely
  786. ;P_lbnMulAdd1_32 proc far
  787. ;
  788. ; push bp ; U prologue ** Could be V
  789. ; mov bp,sp ; V prologue
  790. ; push ds ; NP prologue
  791. ; mov ecx,[bp+16] ; U load k
  792. ; push si ; V prologue
  793. ; lds si,[bp+10] ; NP load in
  794. ; mov eax,[si] ; U first multiply
  795. ; push di ; V prologue
  796. ; mul ecx ; NP first multiply
  797. ; les di,[bp+6] ; NP load out
  798. ; add es:[di],eax ; U first multiply
  799. ; mov bp,[bp+14] ; V load len
  800. ; adc edx,0 ; U first multiply
  801. ; dec bp ; V
  802. ; mov ebx,edx ; U Remember carry for later
  803. ; je Pma32_done ; V
  804. ;Pma32_loop:
  805. ; mov eax,[si+4] ; U
  806. ; add di,4 ; V
  807. ; mul ecx ; NP
  808. ; add eax,ebx ; U Add carry in from previous word
  809. ; adc edx,0 ; U
  810. ; add si,4 ; V
  811. ; add es:[di],eax ; U
  812. ; adc edx,0 ; U
  813. ; dec bp ; V
  814. ; mov ebx,edx ; U Remember carry for later
  815. ; jne Pma32_loop ; V
  816. ;Pma32_done:
  817. ; pop di ; U ** Could be V
  818. ; pop si ; V
  819. ; pop ds ; NP
  820. ; mov ax,dx ; U return value low ** Could be V
  821. ; pop bp ; V
  822. ; shr edx,16 ; U return value high
  823. ; ret ; NP
  824. ;
  825. ;P_lbnMulAdd1_32 endp
  826. ;; Two-word by one-word divide. Stores quotient, returns remainder.
  827. ;; BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
  828. ;; 4 8 12 16
  829. align 16
  830. _lbnDiv21_32 proc far
  831. mov cx,bp ; U bp NOT pushed; offsets differ
  832. mov bp,sp ; V
  833. ; AGI
  834. mov edx,[bp+8] ; U
  835. mov eax,[bp+12] ; U
  836. div DWORD PTR [bp+16] ; NP
  837. les bx,[bp+4] ; NP
  838. mov es:[bx],eax ; U
  839. mov ax,dx ; V
  840. shr edx,16 ; U
  841. mov bp,cx ; V
  842. ret ; NP
  843. nop
  844. nop
  845. nop
  846. nop ; Get lbnModQ_32 aligned properly
  847. _lbnDiv21_32 endp
  848. ;; Multi-word by one-word remainder.
  849. ;; This speeds up key generation. It's not worth unrolling and so on;
  850. ;; using 32-bit divides is enough of a speedup.
  851. ;;
  852. ;; bp is used as a counter so that all the 32-bit values can be in
  853. ;; caller-save registers (eax, ecx, edx). bx is needed as a pointer.
  854. ;;
  855. ;; The modulus (in ebp) is 16 bits. Given that the dividend is 32 bits,
  856. ;; the chances of saving the first divide because the high word of the
  857. ;; dividend is less than the modulus are low enough it's not worth taking
  858. ;; the cycles to test for it.
  859. ;;
  860. ;; unsigned lbnModQ_32(BNWORD16 *q, unsigned len, unsigned d)
  861. ;; 6 10 12
  862. _lbnModQ_32 proc far
  863. xor ecx,ecx ; U Clear ecx (really, the high half)
  864. push bp ; V
  865. mov edx,ecx ; U Clear high word for first divide
  866. mov bp,sp ; V
  867. push ds ; NP
  868. lds ax,[bp+6] ; NP Load dividend pointer
  869. mov bx,[bp+10] ; U Load count ** Could be V
  870. sub ax,4 ; V Offset dividend pointer
  871. mov cx,[bp+12] ; U Load modulus ** Could be V
  872. mov bp,bx ; V Copy count
  873. shl bx,2 ; U Shift index
  874. add bx,ax ; U Add base ** Could be V
  875. ; lea bx,[eax+ebp*4-4]; U Move pointer to high word
  876. modq32_loop:
  877. mov eax,[bx] ; U
  878. sub bx,4 ; V
  879. div ecx ; NP
  880. dec bp ; U ** Could be V
  881. jnz modq32_loop ; V
  882. modq32_done:
  883. pop ds ; NP
  884. mov ax,dx ; U ** Could be V
  885. pop bp ; V
  886. ret ; NP
  887. _lbnModQ_32 endp
  888. ;; int not386(void) returns 0 on a 32-bit (386 or better) processor;
  889. ;; non-zero if an 80286 or lower. The Z flag is set to reflect
  890. ;; ax on return. This is only called once, so it doesn't matter how
  891. ;; it's aligned.
  892. _not386 proc far
  893. ;;
  894. ;; This first test detects 80x86 for x < 2. On the 8086 and '186,
  895. ;; "push sp" does "--sp; sp[0] = sp". On all later processors, it does
  896. ;; "sp[-1] = sp; --sp".
  897. ;;
  898. push sp
  899. pop ax
  900. sub ax,sp
  901. jne SHORT return
  902. ;; This test is the key one. It will probably detect 8086, V30 and 80186
  903. ;; as well as 80286, but I haven't had access to test it on any of those,
  904. ;; so it's protected by the well-known test above. It has been tested
  905. ;; on the 80286, 80386, 80486, Pentium and AMD tested it on their K5.
  906. ;; I have not been able to confirm effectiveness on the P6 yet, although
  907. ;; someone I spoke to at Intel said it should work.
  908. ;;
  909. ;; This test uses the fact that the '386 and above have a barrel shifter
  910. ;; to do shifts, while the '286 does left shifts by releated adds.
  911. ;; That means that on the '286, the auxilliary carry gets a copy of
  912. ;; bit 4 of the shift output, while on the '386 and up, it's trashed
  913. ;; (as it happens, set to 1) independent of the result. (It's documented
  914. ;; as undefined.)
  915. ;;
  916. ;; We do two shifts, which should produce different auxilliary carries
  917. ;; on a '286 and XOR them to see if they are different. Even on a
  918. ;; future processor that does something different with the aux carry
  919. ;; flag, it probably does something data-independent, so this will still
  920. ;; work. Note that all flags except aux carry are defined for shl
  921. ;; output and will be the same for both cases.
  922. mov al,4
  923. shl al,1 ; Expected to produce ac = 0 on a '286
  924. lahf
  925. shl al,1 ; Expected to produce ac = 1 on a '286
  926. mov al,ah
  927. lahf
  928. xor al,ah ; Xor the flags together to detect the difference
  929. mov ah,al ; Clear ah if al is clear, leave Z flag alone
  930. return:
  931. ret
  932. _not386 endp
  933. _TEXT ends
  934. end