2
0

lbn68360.s 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
  1. * Copyright (c) 1995 Colin Plumb. All rights reserved.
  2. * For licensing and other legal details, see the file legal.c.
  3. *
  4. * lbn68360.s - 32-bit bignum primitives for 683xx processors.
  5. *
  6. * This code is using InterTools calling convention, which is a bit odd.
  7. * One minor note is that the default variable sizes are
  8. * char = unsigned 8, short = 8 (in violation of ANSI!),
  9. * int = 16, long = 32. Longs (including on the stack) are 16-bit aligned.
  10. * Arguments are padded to 16 bits.
  11. * A6 is used as a frame pointer, and globals are indexed off A5.
  12. * Return values are passed in D0 or A0 (or FP0), depending on type.
  13. * D0, D1, A0 and A4 (!) are volatile across function calls. A1
  14. * must be preserved!
  15. *
  16. * This code assumes 16-bit ints. Code for 32-bit ints is commented out
  17. * with "**".
  18. *
  19. * Regardless of UINT_MAX, only bignums up to 64K words (2 million bits)
  20. * are supported. (68k hackers will recognize this as a consequence of
  21. * using dbra.) This could be extended easily if anyone cares.
  22. *
  23. * These primitives use little-endian word order.
  24. * (The order of bytes within words is irrelevant to this issue.)
  25. * The Metrowerks C compiler (1.2.2) produces bad 68k code for the
  26. * following input, which happens to be the inner loop of lbnSub1,
  27. * so it has been rewritten in assembly, even though it is not terribly
  28. * speed-critical. (Optimizer on or off does not matter.)
  29. *
  30. * unsigned
  31. * decrement(unsigned *num, unsigned len)
  32. * {
  33. * do {
  34. * if ((*num++)-- != 0)
  35. * return 0;
  36. * } while (--len);
  37. * return 1;
  38. * }
  * BNWORD32 lbnSub1_32(BNWORD32 *num, unsigned len, BNWORD32 borrow)
  *
  * Subtract the 32-bit value "borrow" from the bignum at "num" (least
  * significant word first) and ripple the resulting borrow upward only
  * as far as it propagates.  The first word is always modified, so the
  * caller must pass len >= 1.
  *
  * Stack on entry (16-bit ints): 4(sp) = num, 8(sp) = len, 10(sp) = borrow
  * Returns in d0: 1 if a borrow ran off the top word, otherwise 0.
  * Registers written: d0, a0.
  *
  * The return value is rebuilt from the X flag, so every path into
  * sub1_exit must arrive with X holding the final borrow.
          SECTION S_lbnSub1_32,,"code"
          XDEF    _lbnSub1_32
  _lbnSub1_32:
          move.l  10(sp),d0               * d0 = borrow
  **      move.l  12(sp),d0               * d0 = borrow (32-bit ints)
          movea.l 4(sp),a0                * a0 = num
          sub.l   d0,(a0)+                * num[0] -= borrow, X = borrow out
          bcc     sub1_exit               * nothing to propagate: X clear, return 0
          move.w  8(sp),d0                * d0 = len (move leaves X alone)
  **      move.w  10(sp),d0               * d0 = len (32-bit ints)
          subq.w  #2,d0                   * dbcc count for the len-1 words left
          bcs     sub1_exit               * no words left: subq itself set X, return 1
  sub1_ripple:
          subq.l  #1,(a0)+                * push the borrow into the next word
          dbcc    d0,sub1_ripple          * leave when absorbed or out of words
  sub1_exit:
          moveq.l #0,d0                   * moveq does not touch X
          addx.w  d0,d0                   * d0 = 0 + 0 + X
          rts
  * BNWORD32 lbnAdd1_32(BNWORD32 *num, unsigned len, BNWORD32 carry)
  *
  * Add the 32-bit value "carry" to the bignum at "num" (least
  * significant word first) and ripple the resulting carry upward only
  * as far as it propagates.  The first word is always modified, so the
  * caller must pass len >= 1.
  *
  * Stack on entry (16-bit ints): 4(sp) = num, 8(sp) = len, 10(sp) = carry
  * Returns in d0: 1 if a carry ran off the top word, otherwise 0.
  * Registers written: d0, a0.
  *
  * The return value is rebuilt from the X flag, so every path into
  * add1_exit must arrive with X holding the final carry.
          SECTION S_lbnAdd1_32,,"code"
          XDEF    _lbnAdd1_32
  _lbnAdd1_32:
          move.l  10(sp),d0               * d0 = carry
  **      move.l  12(sp),d0               * d0 = carry (32-bit ints)
          movea.l 4(sp),a0                * a0 = num
          add.l   d0,(a0)+                * num[0] += carry, X = carry out
          bcc     add1_exit               * nothing to propagate: X clear, return 0
          move.w  8(sp),d0                * d0 = len (move leaves X alone)
  **      move.w  10(sp),d0               * d0 = len (32-bit ints)
          subq.w  #2,d0                   * dbcc count for the len-1 words left
          bcs     add1_exit               * no words left: subq itself set X, return 1
  add1_ripple:
          addq.l  #1,(a0)+                * push the carry into the next word
          dbcc    d0,add1_ripple          * leave when absorbed or out of words
  add1_exit:
          moveq.l #0,d0                   * moveq does not touch X
          addx.w  d0,d0                   * d0 = 0 + 0 + X
          rts
  * void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
  *
  * Multiply the len-word bignum at "in" by the single word k and store
  * the result at "out": len product words followed by one final carry
  * word (len+1 words written).  The first word is processed
  * unconditionally, so the caller must pass len >= 1.
  *
  * Stack after the register save (16-bit ints):
  *   20(sp) = out, 24(sp) = in, 28(sp) = len, 30(sp) = k
  *
  * Register roles:
  *   a0 = read pointer into in      a4 = write pointer into out
  *   d2 = k                         d4 = constant 0, addend for addx
  *   d3 = low half of each product  d5 = dbra counter
  *   d0/d1 = high half of a product; they take turns being the carry
  *           into the next word, so the loop is unrolled by two.
          SECTION S_lbnMulN1_32,,"code"
          XDEF    _lbnMulN1_32
  _lbnMulN1_32:
          movem.l d2-d5,-(sp)             * 16 bytes: arguments now start at 20(sp)
          move.l  20(sp),a4               * a4 = out
          move.l  24(sp),a0               * a0 = in
          move.w  28(sp),d5               * d5 = len
          move.l  30(sp),d2               * d2 = k
  **      move.w  30(sp),d5               * d5 = len (32-bit ints)
  **      move.l  32(sp),d2               * d2 = k   (32-bit ints)
          moveq.l #0,d4                   * d4 = 0 for the addx instructions

          move.l  (a0)+,d3                * first word: d1:d3 = in[0] * k
          mulu.l  d2,d1:d3                * dc.w 0x4c02, 0x3401
          move.l  d3,(a4)+                * low half out, high half stays in d1

          subq.w  #1,d5                   * words still to do
          lsr.w   #1,d5                   * d5 = pairs, C = one odd word extra
          bcs.s   mn1_carry_in_d1         * odd count: start in the second half
          beq.s   mn1_one_word            * even and zero: len was 1
          subq.w  #1,d5                   * dbra runs count+1 times
          move.l  d1,d0                   * first half expects its carry in d0
  mn1_carry_in_d0:
          move.l  (a0)+,d3
          mulu.l  d2,d1:d3                * dc.w 0x4c02, 0x3401
          add.l   d0,d3                   * add carry from the previous word
          addx.l  d4,d1                   * fold that carry-out into the high half
          move.l  d3,(a4)+
  mn1_carry_in_d1:
          move.l  (a0)+,d3
          mulu.l  d2,d0:d3                * dc.w 0x4c02, 0x3400
          add.l   d1,d3                   * add carry from the previous word
          addx.l  d4,d0                   * fold that carry-out into the high half
          move.l  d3,(a4)+
          dbra    d5,mn1_carry_in_d0

          move.l  d0,(a4)                 * final carry word
          movem.l (sp)+,d2-d5
          rts
  mn1_one_word:
          move.l  d1,(a4)                 * single-word input: carry is still in d1
          movem.l (sp)+,d2-d5
          rts
  * BNWORD32
  * lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
  *
  * Add k times the len-word bignum at "in" into the len words at "out"
  * and return the word that carries out of the top.  The first word is
  * processed unconditionally, so the caller must pass len >= 1.
  *
  * Stack after the register save (16-bit ints):
  *   20(sp) = out, 24(sp) = in, 28(sp) = len, 30(sp) = k
  * Returns in d0: the carry word out of out[len-1].
  *
  * Register roles:
  *   a0 = read pointer into in      a4 = read/write pointer into out
  *   d2 = k                         d4 = constant 0, addend for addx
  *   d3 = low half of each product  d5 = dbra counter
  *   d0/d1 = high half of a product; they take turns being the carry
  *           into the next word, so the loop is unrolled by two.
  * Each word takes two adds (incoming carry, then memory), and the
  * carry-out of each is folded into the high half with its own addx.
          SECTION S_lbnMulAdd1_32,,"code"
          XDEF    _lbnMulAdd1_32
  _lbnMulAdd1_32:
          movem.l d2-d5,-(sp)             * 16 bytes: arguments now start at 20(sp)
          move.l  20(sp),a4               * a4 = out
          move.l  24(sp),a0               * a0 = in
          move.w  28(sp),d5               * d5 = len
          move.l  30(sp),d2               * d2 = k
  **      move.w  30(sp),d5               * d5 = len (32-bit ints)
  **      move.l  32(sp),d2               * d2 = k   (32-bit ints)
          moveq.l #0,d4                   * d4 = 0 for the addx instructions

          move.l  (a0)+,d3                * first word: d1:d3 = in[0] * k
          mulu.l  d2,d1:d3                * dc.w 0x4c02, 0x3401
          add.l   d3,(a4)+                * out[0] += low half
          addx.l  d4,d1                   * carry-out joins the high half

          subq.w  #1,d5                   * words still to do
          lsr.w   #1,d5                   * d5 = pairs, C = one odd word extra
          bcs.s   ma1_carry_in_d1         * odd count: start in the second half
          beq.s   ma1_one_word            * even and zero: len was 1
          subq.w  #1,d5                   * dbra runs count+1 times
          move.l  d1,d0                   * first half expects its carry in d0
  ma1_carry_in_d0:
          move.l  (a0)+,d3
          mulu.l  d2,d1:d3                * dc.w 0x4c02, 0x3401
          add.l   d0,d3                   * add carry from the previous word
          addx.l  d4,d1
          add.l   d3,(a4)+                * accumulate into out
          addx.l  d4,d1
  ma1_carry_in_d1:
          move.l  (a0)+,d3
          mulu.l  d2,d0:d3                * dc.w 0x4c02, 0x3400
          add.l   d1,d3                   * add carry from the previous word
          addx.l  d4,d0
          add.l   d3,(a4)+                * accumulate into out
          addx.l  d4,d0
          dbra    d5,ma1_carry_in_d0

          movem.l (sp)+,d2-d5             * the loop ends with the carry in d0
          rts
  ma1_one_word:
          move.l  d1,d0                   * single-word input: return the carry from d1
          movem.l (sp)+,d2-d5
          rts
  * BNWORD32
  * lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
  *
  * Subtract k times the len-word bignum at "in" from the len words at
  * "out" and return the word that is borrowed from above the top.  The
  * first word is processed unconditionally, so the caller must pass
  * len >= 1.
  *
  * Stack after the register save (16-bit ints):
  *   20(sp) = out, 24(sp) = in, 28(sp) = len, 30(sp) = k
  * Returns in d0: the borrow word out of out[len-1].
  *
  * Register roles:
  *   a0 = read pointer into in      a4 = read/write pointer into out
  *   d2 = k                         d4 = constant 0, addend for addx
  *   d3 = low half of each product  d5 = dbra counter
  *   d0/d1 = high half of a product; they take turns being the amount
  *           owed by the next word, so the loop is unrolled by two.
  * The borrow out of each memory subtract is ADDED (addx) to the high
  * half, because it increases what the next word has to give up.
          SECTION S_lbnMulSub1_32,,"code"
          XDEF    _lbnMulSub1_32
  _lbnMulSub1_32:
          movem.l d2-d5,-(sp)             * 16 bytes: arguments now start at 20(sp)
          move.l  20(sp),a4               * a4 = out
          move.l  24(sp),a0               * a0 = in
          move.w  28(sp),d5               * d5 = len
          move.l  30(sp),d2               * d2 = k
  **      move.w  30(sp),d5               * d5 = len (32-bit ints)
  **      move.l  32(sp),d2               * d2 = k   (32-bit ints)
          moveq.l #0,d4                   * d4 = 0 for the addx instructions

          move.l  (a0)+,d3                * first word: d1:d3 = in[0] * k
          mulu.l  d2,d1:d3                * dc.w 0x4c02, 0x3401
          sub.l   d3,(a4)+                * out[0] -= low half
          addx.l  d4,d1                   * borrow-out joins the high half

          subq.w  #1,d5                   * words still to do
          lsr.w   #1,d5                   * d5 = pairs, C = one odd word extra
          bcs.s   ms1_carry_in_d1         * odd count: start in the second half
          beq.s   ms1_one_word            * even and zero: len was 1
          subq.w  #1,d5                   * dbra runs count+1 times
          move.l  d1,d0                   * first half expects its carry in d0
  ms1_carry_in_d0:
          move.l  (a0)+,d3
          mulu.l  d2,d1:d3                * dc.w 0x4c02, 0x3401
          add.l   d0,d3                   * add what the previous word still owes
          addx.l  d4,d1
          sub.l   d3,(a4)+                * take it out of out
          addx.l  d4,d1
  ms1_carry_in_d1:
          move.l  (a0)+,d3
          mulu.l  d2,d0:d3                * dc.w 0x4c02, 0x3400
          add.l   d1,d3                   * add what the previous word still owes
          addx.l  d4,d0
          sub.l   d3,(a4)+                * take it out of out
          addx.l  d4,d0
          dbra    d5,ms1_carry_in_d0

          movem.l (sp)+,d2-d5             * the loop ends with the borrow in d0
          rts
  ms1_one_word:
          move.l  d1,d0                   * single-word input: return the borrow from d1
          movem.l (sp)+,d2-d5
          rts
  * BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
  *
  * Divide the 64-bit value nh:nl by d with a single 64/32 divu.l.
  * The 32-bit quotient is stored through q; the remainder is returned.
  *
  * Stack on entry: 4(sp) = q, 8(sp) = nh, 12(sp) = nl, 16(sp) = d
  * Returns in d0: the remainder.
  * Registers written: d0, d1, a0.
  *
  * NOTE(review): no check is made that the quotient fits in 32 bits
  * (nh < d) or that d is nonzero - presumably the callers guarantee
  * both; confirm against the divu.l overflow and divide-by-zero rules.
          SECTION S_lbnDiv21_32,,"code"
          XDEF    _lbnDiv21_32
  _lbnDiv21_32:
          move.l  4(sp),a0                * a0 = q
          move.l  12(sp),d1               * d1 = nl, low half of the dividend
          move.l  8(sp),d0                * d0 = nh, high half of the dividend
          divu.l  16(sp),d0:d1            * dc.w 0x4c6f, 0x1400, 16
          move.l  d1,(a0)                 * *q = quotient; remainder is left in d0
          rts
  * unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
  *
  * Return the len-word bignum at "n" modulo the small divisor d.
  * Words are consumed from the most significant (n[len-1]) down to
  * n[0], and the running remainder in d0 becomes the high half of the
  * next 64/32 divide.  The top word is read unconditionally, so the
  * caller must pass len >= 1.
  *
  * Stack after d2 is pushed (16-bit ints):
  *   8(sp) = n, 12(sp) = len, 14(sp) = d
  * Returns in d0: n mod d.
  * d2 is saved on the stack; d3 is parked in a4, which the calling
  * convention treats as volatile.  Also writes d1 and a0.
  *
  * The "no more words" test after subq.w uses the carry (borrow) flag,
  * not the sign: len is an unsigned 16-bit count, and a sign test would
  * skip the loop for every len of 0x8002 or more.
          SECTION S_lbnModQ_32,,"code"
          XDEF    _lbnModQ_32
  _lbnModQ_32:
          move.l  4(sp),a0                * a0 = n (read before sp moves)
          move.l  d2,-(sp)                * save d2; arguments shift up by 4
          move.l  d3,a4                   * save d3 in a4
          moveq.l #0,d1                   * clear high words so the 16-bit
          moveq.l #0,d2                   * loads below are zero-extended
          move.w  12(sp),d1               * d1 = len
          move.w  14(sp),d2               * d2 = d
  **      move.l  12(sp),d1               * d1 = len (32-bit ints)
  **      move.l  16(sp),d2               * d2 = d   (32-bit ints)
          lea     -4(a0,d1.L*4),a0        * dc.w 0x41f0, 0x1cfc  a0 -> n[len-1]
  * First time, divide 32/32 - may be faster than 64/32
          move.l  (a0),d3
          divul.l d2,d0:d3                * dc.w 0x4c02, 0x3000  d0 = n[len-1] mod d
          subq.w  #2,d1                   * dbra count for the len-1 words left
          bcs     mq32_done               * borrow: len was 1, nothing left
  mq32_loop:
          move.l  -(a0),d3                * next lower word
          divu.l  d2,d0:d3                * dc.w 0x4c02,0x3400  d0 = (d0:d3) mod d
          dbra    d1,mq32_loop
  mq32_done:
          move.l  (sp)+,d2                * restore d2
          move.l  a4,d3                   * restore d3
          rts
  245. end