lbn960jx.s 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. # Copyright (c) 1995 Colin Plumb. All rights reserved.
  2. # For licensing and other legal details, see the file legal.c.
  3. #
  4. # Assembly-language bignum primitives for the i960 Jx series.
  5. #
  6. # The Jx series is a fairly straightforward single-instruction-issue
  7. # implementation, with a 1-cycle-issue 4-cycle-latency non-pipelined
  8. # multiplier that we can use. Note also that loads which hit in the
  9. # cache have 2 cycles of latency and stores stall until all pending
  10. # loads are done.
  11. #
  12. # What is intensely annoying about the i960 is that it uses the same
  13. # flags for all conditional branches (even compare-and-branch sets the
  14. # flags) AND for the carry bit. Further, it is hard to manipulate
  15. # that bit.
  16. #
  17. # Calling conventions:
  18. # The r registers are all local, if you set them up. There's an alternative
  19. # calling convention that uses bal (branch and link) and doesn't set them up.
  20. # Currently, all of these functions are designed to work that way.
  21. # g0-g7 are argument registers and volatile across calls. return in g0-g3.
  22. # g8-g11 are extra argument registers, and volatile if used, but
  23. # preserved if not. Here, they are not.
  24. # g12 is used for PIC, and is preserved.
  25. # g13 is a pointer to a structure return value, if used, and is volatile.
  26. # g14 is magic, and is used as a return address in the branch-and-link
  27. # convention, and as a pointer to an argument block if the arguments
  28. # won't fit in registers, but is usually hardwired 0 and must be
  29. # returned set to zero (0).
  30. # g15 is the frame pointer, and shouldn't be messed with.
  31. # The AC (condition codes) are all volatile.
  32. # The fp registers are all volatile, but irrelevant.
  33. #
  34. # BNWORD32
  35. # lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
  36. # This adds "k" * "in" to "len" words of "out" and returns the word of
  37. # carry.
  38. #
  39. # For doing multiply-add, the 960 is a bit annoying because it uses
  40. # the same status bits for the carry flag and for the loop indexing
  41. # computation, and doesn't have an "add with carry out but not carry in"
  42. # instruction. Fortunately, we can arrange to have the loop indexing
  43. # leave the carry bit clear most of the time.
  44. #
  45. # The basic sequence of the loop is:
  46. # 1. Multiply k * *in++ -> high, low
  47. # 2. Addc carry word and carry bit to low
  48. # 3. Addc carry bit to high, producing carry word (note: cannot generate carry!)
  49. # 4. Addc low to *out++
  50. #
  51. # Note that the carry bit set in step 4 is used in step 2. The only place
  52. # in this loop that the carry flag isn't in use is between steps 3 and 4,
  53. # so we have to rotate the loop to place the loop indexing operations here.
  54. # (Which consist of a compare-and-decrement and a conditional branch.)
  55. # The loop above ignores the details of when to do loads and stores, which
  56. # have some flexibility, but must be carefully scheduled to avoid stalls.
  57. #
  58. # The first iteration has no carry word in, so it requires only steps 1 and 4,
  59. # and since we begin the loop with step 4, it boils down to just step 1
  60. # followed by the loop indexing (which clears the carry bit in preparation
  61. # for step 4).
  62. #
  63. # Arguments are passed as follows:
  64. # g0 - out pointer
  65. # g1 - in pointer
  66. # g2 - length
  67. # g3 - k
  68. # The other registers are used as follows.
  69. # g4 - low word of product
  70. # g5 - high word of product
  71. # g6 - current word of "out"
  72. # g7 - carry word
  73. # g13 - current word of "in"
  74. .globl _lbnMulAdd1_32 # Exported: adds k * in[] to len words of out[], returns the final carry word in g0
  75. _lbnMulAdd1_32: # NOTE(review): len == 0 is not handled (in[0]/out[0] are accessed unconditionally) - presumably callers guarantee len >= 1
  76. ld (g1),g13 # Fetch first *in
  77. addo g1,4,g1 # Increment in
  78. emul g13,g3,g4 # First multiply (step 1): g5:g4 = *in * k
  79. ld (g0),g6 # Fetch first *out
  80. chkbit 0,g2 # Test low bit of len: odd sets the equal (carry) flag, even leaves no flags set
  81. shro 1,g2,g2 # g2 = len / 2 = number of times the cmpdeco at the loop bottom is reached
  82. mov g5,g7 # Carry word = high word of the first product
  83. bno ma_loop1 # len even: enter at the second copy of the unrolled body, carry clear
  84. cmpo 0,g2 # len odd: was it 1 (g2 now 0)?
  85. be ma_done # Yes: only step 4 remains; equal (carry) flag is set here, ma_done clears it
  86. # len odd and > 1: the loop is entered with the carry bit clear
  87. ma_loop: # First copy of the twice-unrolled loop body
  88. ld (g1),g13 # Fetch next *in
  89. addc g4,g6,g6 # Add low to *out (step 4), generate carry
  90. emul g13,g3,g4 # Do multiply (step 1): new product in g5:g4
  91. st g6,(g0) # Write out *out
  92. addo g0,4,g0 # Increment out
  93. addo g1,4,g1 # Increment in
  94. ld (g0),g6 # Fetch next *out
  95. addc g7,g4,g4 # Add carry word and step-4 carry bit to new low (step 2)
  96. addc g5,0,g7 # Add carry bit to high (step 3) -> new carry word; carry ends up clear
  97. ma_loop1: # Second copy of the body; also the entry point for even len
  98. ld (g1),g13 # Fetch next *in
  99. addc g4,g6,g6 # Add low to *out (step 4), generate carry
  100. emul g13,g3,g4 # Do multiply (step 1): new product in g5:g4
  101. st g6,(g0) # Write out *out
  102. addo g0,4,g0 # Increment out
  103. addo g1,4,g1 # Increment in
  104. ld (g0),g6 # Fetch next *out
  105. addc g7,g4,g4 # Add carry word and step-4 carry bit to new low (step 2)
  106. addc g5,0,g7 # Add carry bit to high (step 3) -> new carry word; carry ends up clear
  107. cmpdeco 1,g2,g2 # Compare 1 with the remaining count, then decrement it
  108. bne ma_loop # Not the last pass: loop again (not-equal means the carry bit is clear)
  109. # When we come here, the equal (carry) flag is *set* by the preceding compare, and we still have to do step 4
  110. ma_done:
  111. cmpi 0,1 # Clear carry (equal flag), since 0 != 1
  112. addc g4,g6,g6 # Add low to *out (step 4), generate carry
  113. st g6,(g0) # Write out the last *out
  114. addc g7,0,g0 # Add carry bit and word to produce return value
  115. ret
  116. # Now, multiply N by 1 is similarly annoying. We only have one add in the
  117. # whole loop, which should just be able to leave its carry output in the
  118. # carry flag for the next iteration, but we need the condition codes to do
  119. # loop testing. *Sigh*.
  120. #
  121. # void
  122. # lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
  123. # This multiplies "len" words of "in" by "k" and stores the len+1 word
  124. # result in "out".
  125. #
  126. # To avoid having to do a move after the first iteration, for the first
  127. # step, g4/g5 is the product. For second step, g6/g7 is used for product
  128. # storage and g5 is the carry in. It alternates from then on.
  129. .globl _lbnMulN1_32 # Exported: stores k * in[] (len words) into out[] as len+1 words; no return value is set
  130. _lbnMulN1_32: # NOTE(review): len == 0 is not handled (in[0] is loaded unconditionally) - presumably callers guarantee len >= 1
  131. ld (g1),g13 # Fetch first *in
  132. addo g1,4,g1 # Increment in
  133. emul g13,g3,g4 # First multiply (step 1): g5:g4 = *in * k
  134. chkbit 0,g2 # Test low bit of len: odd sets the equal (carry) flag, even leaves no flags set
  135. shro 1,g2,g2 # g2 = len / 2 = number of times the cmpdeco at the loop bottom is reached
  136. bno m_loop1 # len even: jump to m_loop1 with the product in g4/g5 and carry clear
  137. mov g4,g6 # len odd: m_loop and m_done expect the result word in g6 ...
  138. cmpo 0,g2 # If counter was odd, was it 1 (now 0)?
  139. mov g5,g7 # ... and the carry word in g7
  140. be m_done # len == 1: just store g6 and g7 (m_done does no addc, so the set flag does not matter)
  141. # len odd and > 1: the loop is entered with the carry bit clear
  142. m_loop:
  143. # Result in g6, carry word in g7
  144. ld (g1),g13 # Fetch *in
  145. addo g1,4,g1 # Increment in
  146. emul g13,g3,g4 # Do multiply (step 1): new product in g5:g4
  147. st g6,(g0) # Write out the finished word
  148. addo g0,4,g0 # Increment out
  149. addc g7,g4,g4 # Add carry word to low (step 2); the carry bit coming in is clear
  150. # Step 3 is deferred: nothing below changes the flags before the addc in
  151. # m_loop1, which adds this carry bit together with g5. (Was: addc g5,0,g5)
  152. m_loop1:
  153. # Result in g4, carry word in g5 (plus a pending carry bit when reached from m_loop)
  154. ld (g1),g13 # Fetch *in
  155. addo g1,4,g1 # Increment in
  156. emul g13,g3,g6 # Do multiply (step 1): new product in g7:g6
  157. st g4,(g0) # Write out the finished word
  158. addo g0,4,g0 # Increment out
  159. addc g5,g6,g6 # Add carry word and pending carry bit to low (step 2)
  160. addc g7,0,g7 # Add carry bit to high (step 3)
  161. cmpdeco 1,g2,g2 # Compare 1 with the remaining count, then decrement it
  162. bne m_loop # Not the last pass: loop again (not-equal means the carry bit is clear)
  163. # When we come here, we have to store g6 and the carry word in g7.
  164. m_done:
  165. st g6,(g0) # Write out the last low word
  166. st g7,4(g0) # Write out the carry word as the following (highest) word of out
  167. ret
  168. # BNWORD32
  169. # lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
  170. # This subtracts "k" * "in" from "len" words of "out" and returns the word of
  171. # borrow.
  172. #
  173. # This is similar to multiply-add, but actually a bit more obnoxious,
  174. # because of the carry situation. The 960 uses a carry (rather than a borrow)
  175. # bit on subtracts, so the carry bit should be 1 for a subc to do the
  176. # same thing as an ordinary subo. So we use two carry chains: one from
  177. # the add of the low-order words to the high-order carry word, and a second,
  178. # which uses an extra register, to connect the subtracts. This avoids
  179. # the need to fiddle with inverting the bit in the usual case.
  180. #
  181. # Arguments are passed as follows:
  182. # g0 - out pointer
  183. # g1 - in pointer
  184. # g2 - length
  185. # g3 - k
  186. # The other registers are used as follows.
  187. # g4 - low word of product
  188. # g5 - high word of product
  189. # g6 - current word of "out"
  190. # g7 - carry word
  191. # g13 - current word of "in"
  192. # g14 - remembered carry bit
  193. .globl _lbnMulSub1_32 # Exported: subtracts k * in[] from len words of out[], returns the borrow word in g0
  194. _lbnMulSub1_32: # NOTE(review): len == 0 is not handled (in[0]/out[0] are accessed unconditionally) - presumably callers guarantee len >= 1
  195. ld (g1),g13 # Fetch first *in
  196. addo g1,4,g1 # Increment in
  197. emul g13,g3,g4 # First multiply (step 1): g5:g4 = *in * k
  198. ld (g0),g6 # Fetch first *out
  199. chkbit 0,g2 # Test low bit of len: odd sets the equal (carry) flag, even leaves no flags set
  200. mov 1,g14 # Remembered subtract carry = 1 (no borrow) for the first subc
  201. shro 1,g2,g2 # g2 = len / 2 = number of times the cmpdeco at the loop bottom is reached
  202. mov g5,g7 # Carry word = high word of the first product
  203. bno ms_loop1 # len even: enter at the second copy of the unrolled body
  204. cmpo 0,g2 # len odd: was it 1 (g2 now 0)?
  205. be ms_done # Yes: only step 4 remains; ms_done reloads the carry flag from g14
  206. # len odd and > 1: fall into the loop (the cmpi below sets the carry flag from g14)
  207. ms_loop: # First copy of the twice-unrolled loop body
  208. ld (g1),g13 # Fetch next *in
  209. cmpi g14,1 # Carry (equal) flag = remembered subtract carry (set iff g14 == 1)
  210. subc g4,g6,g6 # Subtract low from *out (step 4), gen. carry
  211. emul g13,g3,g4 # Do multiply (step 1): new product in g5:g4
  212. addc 0,0,g14 # g14 = carry, then clear carry
  213. st g6,(g0) # Write out *out
  214. addo g0,4,g0 # Increment out
  215. addo g1,4,g1 # Increment in
  216. ld (g0),g6 # Fetch next *out
  217. addc g7,g4,g4 # Add carry word to new low (step 2); carry in is clear after the addc 0,0
  218. addc g5,0,g7 # Add carry bit to high (step 3) -> new carry word
  219. ms_loop1: # Second copy of the body; also the entry point for even len
  220. ld (g1),g13 # Fetch next *in
  221. cmpi g14,1 # Set carry flag for subtract from g14 (set iff g14 == 1)
  222. subc g4,g6,g6 # Subtract low from *out (step 4), gen. carry
  223. emul g13,g3,g4 # Do multiply (step 1): new product in g5:g4
  224. addc 0,0,g14 # g14 = carry, then clear carry
  225. st g6,(g0) # Write out *out
  226. addo g0,4,g0 # Increment out
  227. addo g1,4,g1 # Increment in
  228. ld (g0),g6 # Fetch next *out
  229. addc g7,g4,g4 # Add carry word to new low (step 2); carry in is clear after the addc 0,0
  230. addc g5,0,g7 # Add carry bit to high (step 3) -> new carry word
  231. cmpdeco 1,g2,g2 # Compare 1 with the remaining count, then decrement it
  232. bne ms_loop # Not the last pass: loop again
  233. # When we come here, the equal (carry) flag is *set* by the preceding compare; the
  234. ms_done: # subtract chain's carry is in g14, and we still have to do step 4
  235. cmpi g14,1 # Reload carry (equal flag) from g14: set iff g14 == 1
  236. subc g4,g6,g6 # Subtract low from *out (step 4), generate carry
  237. st g6,(g0) # Write out the last *out
  238. subc 0,0,g14 # g14 = -1 if no carry (borrow), 0 if carry
  239. subo g14,g7,g0 # g0 = g7 - g14: add borrow bit to carry word to produce return value
  240. mov 0,g14 # Restore g14 to 0 for return
  241. ret