lbn68020.c 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309
  1. /*
  2. * Copyright (c) 1995 Colin Plumb. All rights reserved.
  3. * For licensing and other legal details, see the file legal.c.
  4. *
  5. * lbn68020.c - 32-bit bignum primitives for the 68020+ (or 683xx) processors.
  6. *
  7. * This was written for Metrowerks C, and while it should be reasonably
  8. * portable, NOTE that Metrowerks lets a callee trash a0, a1, d0, d1, and d2.
  9. * Some 680x0 compilers make d2 callee-save, so instructions to save it
  10. * will have to be added.
  11. *
  12. * This code supports 16 or 32-bit ints, based on UINT_MAX.
  13. * Regardless of UINT_MAX, only bignums up to 64K words (2 million bits)
  14. * are supported. (68k hackers will recognize this as a consequence of
  15. * using dbra.)
  16. *
  17. * These primitives use little-endian word order.
  18. * (The order of bytes within words is irrelevant to this issue.)
  19. *
  20. * TODO: Schedule this for the 68040's pipeline. (When I get a 68040 manual.)
  21. */
  22. #include <limits.h>
  23. #include "lbn.h" /* Should include lbn68020.h */
  24. /*
  25. * The Metrowerks C compiler (1.2.2) produces bad 68k code for the
  26. * following input, which happens to be the inner loop of lbnSub1,
  27. * so a few less than critical routines have been recoded in assembly
  28. * to avoid the bug. (Optimizer on or off does not matter.)
  29. *
  30. * unsigned
  31. * decrement(unsigned *num, unsigned len)
  32. * {
  33. * do {
  34. * if ((*num++)-- != 0)
  35. * return 0;
  36. * } while (--len);
  37. * return 1;
  38. * }
  39. */
  40. asm BNWORD32
  41. lbnSub1_32(BNWORD32 *num, unsigned len, BNWORD32 borrow)
  42. {
  43. movea.l 4(sp),a0 /* num */
  44. #if UINT_MAX == 0xffff
  45. move.l 10(sp),d0 /* borrow */
  46. #else
  47. move.l 12(sp),d0 /* borrow */
  48. #endif
  49. sub.l d0,(a0)+
  50. bcc done
  51. #if UINT_MAX == 0xffff
  52. move.w 8(sp),d0 /* len */
  53. #else
  54. move.w 10(sp),d0 /* len */
  55. #endif
  56. subq.w #2,d0
  57. bcs done
  58. loop:
  59. subq.l #1,(a0)+
  60. dbcc d0,loop
  61. done:
  62. moveq.l #0,d0
  63. addx.w d0,d0
  64. rts
  65. }
  66. asm BNWORD32
  67. lbnAdd1_32(BNWORD32 *num, unsigned len, BNWORD32 carry)
  68. {
  69. movea.l 4(sp),a0 /* num */
  70. #if UINT_MAX == 0xffff
  71. move.l 10(sp),d0 /* carry */
  72. #else
  73. move.l 12(sp),d0 /* carry */
  74. #endif
  75. add.l d0,(a0)+
  76. bcc done
  77. #if UINT_MAX == 0xffff
  78. move.w 8(sp),d0 /* len */
  79. #else
  80. move.w 10(sp),d0 /* len */
  81. #endif
  82. subq.w #2,d0
  83. bcs done
  84. loop:
  85. addq.l #1,(a0)+
  86. dbcc d0,loop
  87. done:
  88. moveq.l #0,d0
  89. addx.w d0,d0
  90. rts
  91. }
  /*
   * lbnMulN1_32 - multiply the len-word number at "in" by the single word k
   * and store the result, len+1 words, at "out" (lowest word first).
   *
   * Register use:
   *   a0 = in pointer, a1 = out pointer, d2 = k
   *   d3 = current input word, then the low half of its product
   *   d0, d1 = high half of the product; they alternate as the carry that
   *            is added into the following word
   *   d4 = constant 0, so "addx.l d4,Dn" adds only the X flag
   *   d5 = dbra loop counter (only the low 16 bits of len are read)
   *
   * The first word is multiplied outside the loop.  The loop body handles
   * two words per pass; when an odd number of words remains it is entered
   * half way down, at m32_even.
   */
  92. asm void
  93. lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
  94. {
  95. machine 68020
  96. movem.l d3-d5,-(sp) /* Save d3-d5: 12 bytes, so arguments start at 16(sp) */
  97. moveq.l #0,d4 /* d4 = 0 for the addx instructions below */
  98. move.l 16(sp),a1 /* out */
  99. move.l 20(sp),a0 /* in */
  100. #if UINT_MAX == 0xffff
  101. move.w 24(sp),d5 /* len (2-byte int) */
  102. move.l 26(sp),d2 /* k */
  103. #else
  104. move.w 26(sp),d5 /* len (low word of a 4-byte int) */
  105. move.l 28(sp),d2 /* k */
  106. #endif
  107. move.l (a0)+,d3 /* First multiply: d3 = in[0] */
  108. mulu.l d2,d1:d3 /* d1:d3 = k * d3 (dc.w 0x4c02, 0x3401) */
  109. move.l d3,(a1)+ /* Store low half; d1 is the carry into the next word */
  110. subq.w #1,d5 /* Setup for loop unrolling: d5 = words still to do */
  111. lsr.w #1,d5 /* d5 = pairs of words; C = 1 if one odd word is left over */
  112. bcs.s m32_even /* Odd count: enter at the second half, carry is in d1 */
  113. beq.s m32_short /* Zero words left (len was 1): just store the carry */
  114. subq.w #1,d5 /* Set up software pipeline properly: dbra makes d5+1 passes */
  115. move.l d1,d0 /* The first half of the loop expects its carry in d0 */
  116. m32_loop:
  117. move.l (a0)+,d3
  118. mulu.l d2,d1:d3 /* d1:d3 = k * d3 (dc.w 0x4c02, 0x3401) */
  119. add.l d0,d3 /* Add the carry from the previous word */
  120. addx.l d4,d1 /* Propagate the carry of that add into the high half */
  121. move.l d3,(a1)+
  122. m32_even:
  123. move.l (a0)+,d3
  124. mulu.l d2,d0:d3 /* d0:d3 = k * d3 (dc.w 0x4c02, 0x3400) */
  125. add.l d1,d3 /* Add the carry from the previous word */
  126. addx.l d4,d0 /* Propagate the carry of that add into the high half */
  127. move.l d3,(a1)+
  128. dbra d5,m32_loop
  129. move.l d0,(a1) /* Store the final carry as the top word */
  130. movem.l (sp)+,d3-d5 /* Restore saved registers */
  131. rts
  132. m32_short:
  133. move.l d1,(a1) /* Single-word case: the high half is the top word */
  134. movem.l (sp)+,d3-d5 /* Restore saved registers */
  135. rts
  136. }
  /*
   * lbnMulAdd1_32 - multiply the len-word number at "in" by the single word
   * k and add the product into the len words at "out", in place.
   * Returns, in d0, the carry word out of the top of the addition.
   *
   * Register use:
   *   a0 = in pointer, a1 = out pointer, d2 = k
   *   d3 = current input word, then the low half of its product
   *   d0, d1 = high half of the product; they alternate as the carry that
   *            is added into the following word
   *   d4 = constant 0, so "addx.l d4,Dn" adds only the X flag
   *   d5 = dbra loop counter (only the low 16 bits of len are read)
   *
   * The first word is handled outside the loop.  The loop body handles two
   * words per pass; when an odd number of words remains it is entered half
   * way down, at ma32_even, which also leaves the final carry in d0.
   */
  137. asm BNWORD32
  138. lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
  139. {
  140. machine 68020
  141. movem.l d3-d5,-(sp) /* Save d3-d5: 12 bytes, so arguments start at 16(sp) */
  142. moveq.l #0,d4 /* d4 = 0 for the addx instructions below */
  143. move.l 16(sp),a1 /* out */
  144. move.l 20(sp),a0 /* in */
  145. #if UINT_MAX == 0xffff
  146. move.w 24(sp),d5 /* len (2-byte int) */
  147. move.l 26(sp),d2 /* k */
  148. #else
  149. move.w 26(sp),d5 /* len (low word of a 4-byte int) */
  150. move.l 28(sp),d2 /* k */
  151. #endif
  152. move.l (a0)+,d3 /* First multiply: d3 = in[0] */
  153. mulu.l d2,d1:d3 /* d1:d3 = k * d3 (dc.w 0x4c02, 0x3401) */
  154. add.l d3,(a1)+ /* out[0] += low half */
  155. addx.l d4,d1 /* Fold the carry of that add into the high half */
  156. subq.w #1,d5 /* Setup for loop unrolling: d5 = words still to do */
  157. lsr.w #1,d5 /* d5 = pairs of words; C = 1 if one odd word is left over */
  158. bcs.s ma32_even /* Odd count: enter at the second half, carry is in d1 */
  159. beq.s ma32_short /* Zero words left (len was 1): return the carry */
  160. subq.w #1,d5 /* Set up software pipeline properly: dbra makes d5+1 passes */
  161. move.l d1,d0 /* The first half of the loop expects its carry in d0 */
  162. ma32_loop:
  163. move.l (a0)+,d3
  164. mulu.l d2,d1:d3 /* d1:d3 = k * d3 (dc.w 0x4c02, 0x3401) */
  165. add.l d0,d3 /* Add the carry from the previous word */
  166. addx.l d4,d1 /* Propagate the carry of that add into the high half */
  167. add.l d3,(a1)+ /* Accumulate into the output word */
  168. addx.l d4,d1 /* Propagate the carry of the accumulate as well */
  169. ma32_even:
  170. move.l (a0)+,d3
  171. mulu.l d2,d0:d3 /* d0:d3 = k * d3 (dc.w 0x4c02, 0x3400) */
  172. add.l d1,d3 /* Add the carry from the previous word */
  173. addx.l d4,d0 /* Propagate the carry of that add into the high half */
  174. add.l d3,(a1)+ /* Accumulate into the output word */
  175. addx.l d4,d0 /* Propagate the carry of the accumulate as well */
  176. dbra d5,ma32_loop
  177. movem.l (sp)+,d3-d5 /* Restore saved registers; final carry is already in d0 */
  178. rts
  179. ma32_short:
  180. move.l d1,d0 /* Single-word case: return the carry held in d1 */
  181. movem.l (sp)+,d3-d5 /* Restore saved registers */
  182. rts
  183. }
  /*
   * lbnMulSub1_32 - multiply the len-word number at "in" by the single word
   * k and subtract the product from the len words at "out", in place.
   * Returns, in d0, the borrow word out of the top of the subtraction.
   *
   * Register use:
   *   a0 = in pointer, a1 = out pointer, d2 = k
   *   d3 = current input word, then the low half of its product
   *   d0, d1 = high half of the product plus any borrow; they alternate as
   *            the amount carried into the following word
   *   d4 = constant 0, so "addx.l d4,Dn" adds only the X flag
   *   d5 = dbra loop counter (only the low 16 bits of len are read)
   *
   * A borrow from "sub.l d3,(a1)+" sets X, and the addx that follows adds it
   * to the running carry word, so it is subtracted from the next word.
   *
   * The first word is handled outside the loop.  The loop body handles two
   * words per pass; when an odd number of words remains it is entered half
   * way down, at ms32_even, which also leaves the final borrow in d0.
   */
  184. asm BNWORD32
  185. lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
  186. {
  187. machine 68020
  188. movem.l d3-d5,-(sp) /* Save d3-d5: 12 bytes, so arguments start at 16(sp) */
  189. moveq.l #0,d4 /* d4 = 0 for the addx instructions below */
  190. move.l 16(sp),a1 /* out */
  191. move.l 20(sp),a0 /* in */
  192. #if UINT_MAX == 0xffff
  193. move.w 24(sp),d5 /* len (2-byte int) */
  194. move.l 26(sp),d2 /* k */
  195. #else
  196. move.w 26(sp),d5 /* len (low word of a 4-byte int) */
  197. move.l 28(sp),d2 /* k */
  198. #endif
  199. move.l (a0)+,d3 /* First multiply: d3 = in[0] */
  200. mulu.l d2,d1:d3 /* d1:d3 = k * d3 (dc.w 0x4c02, 0x3401) */
  201. sub.l d3,(a1)+ /* out[0] -= low half */
  202. addx.l d4,d1 /* Fold the borrow of that subtract into the high half */
  203. subq.w #1,d5 /* Setup for loop unrolling: d5 = words still to do */
  204. lsr.w #1,d5 /* d5 = pairs of words; C = 1 if one odd word is left over */
  205. bcs.s ms32_even /* Odd count: enter at the second half, carry is in d1 */
  206. beq.s ms32_short /* Zero words left (len was 1): return the borrow */
  207. subq.w #1,d5 /* Set up software pipeline properly: dbra makes d5+1 passes */
  208. move.l d1,d0 /* The first half of the loop expects its carry in d0 */
  209. ms32_loop:
  210. move.l (a0)+,d3
  211. mulu.l d2,d1:d3 /* d1:d3 = k * d3 (dc.w 0x4c02, 0x3401) */
  212. add.l d0,d3 /* Add the carry from the previous word */
  213. addx.l d4,d1 /* Propagate the carry of that add into the high half */
  214. sub.l d3,(a1)+ /* Subtract from the output word */
  215. addx.l d4,d1 /* Propagate the borrow of the subtract as well */
  216. ms32_even:
  217. move.l (a0)+,d3
  218. mulu.l d2,d0:d3 /* d0:d3 = k * d3 (dc.w 0x4c02, 0x3400) */
  219. add.l d1,d3 /* Add the carry from the previous word */
  220. addx.l d4,d0 /* Propagate the carry of that add into the high half */
  221. sub.l d3,(a1)+ /* Subtract from the output word */
  222. addx.l d4,d0 /* Propagate the borrow of the subtract as well */
  223. dbra d5,ms32_loop
  224. movem.l (sp)+,d3-d5 /* Restore saved registers; final borrow is already in d0 */
  225. rts
  226. ms32_short:
  227. move.l d1,d0 /* Single-word case: return the borrow held in d1 */
  228. movem.l (sp)+,d3-d5 /* Restore saved registers */
  229. rts
  230. }
  231. asm BNWORD32
  232. lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
  233. {
  234. machine 68020
  235. move.l 8(sp),d0
  236. move.l 12(sp),d1
  237. move.l 4(sp),a0
  238. divu.l 16(sp),d0:d1 /* dc.w 0x4c6f, 0x1400, 16 */
  239. move.l d1,(a0)
  240. rts
  241. }
  242. asm unsigned
  243. lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
  244. {
  245. machine 68020
  246. move.l 4(sp),a0 /* n */
  247. move.l d3,a1
  248. #if UINT_MAX == 0xffff
  249. moveq.l #0,d2
  250. move.w 8(sp),d1 /* len */
  251. move.w 10(sp),d2 /* d */
  252. #else
  253. move.w 10(sp),d1 /* len */
  254. move.l 12(sp),d2 /* d */
  255. #endif
  256. dc.w 0x41f0, 0x1cfc /* lea -4(a0,d1.L*4),a0 */
  257. /* First time, divide 32/32 - may be faster than 64/32 */
  258. move.l (a0),d3
  259. divul.l d2,d0:d3 /* dc.w 0x4c02, 0x3000 */
  260. subq.w #2,d1
  261. bmi mq32_done
  262. mq32_loop:
  263. move.l -(a0),d3
  264. divu.l d2,d0:d3 /* dc.w 0x4c02,0x3400 */
  265. dbra d1,mq32_loop
  266. mq32_done:
  267. move.l a1,d3
  268. rts
  269. }
  270. /* 45678901234567890123456789012345678901234567890123456789012345678901234567 */