@ lbnarm.s - 32-bit bignum primitives for ARM processors with 32x32-bit multiply
@
@ This uses the standard ARM calling convention, which is that arguments
@ are passed, and results returned, in r0..r3.  r0..r3, r12 (IP) and r14 (LR)
@ are volatile across the function; all others are callee-save.
@ However, note that r14 (LR) is the return address, so it would be
@ wise to save it somewhere before trashing it.  Fortunately, there is
@ a neat trick available, in that you can pop LR from the stack straight
@ into r15 (PC), effecting a return at the same time.
@
@ Also, r13 (SP) is probably best left alone, and r15 (PC) is obviously
@ reserved by hardware.  Temps should use lr, then r4..r9 in order.

        .text
        .align  2

@ out[0..len] = in[0..len-1] * k
@ void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
        .global lbnMulN1_32
        .type   lbnMulN1_32, %function
lbnMulN1_32:
        stmfd   sp!, {r4, r5, lr}

        ldr     lr, [r1], #4            @ lr = *in++
        umull   r5, r4, lr, r3          @ (r4,r5) = lr * r3
        str     r5, [r0], #4            @ *out++ = r5

        movs    r2, r2, lsr #1
        bcc     m32_even
        mov     r5, r4                  @ Get carry in the right register
        beq     m32_done
m32_loop:
        @ Carry is in r5
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r4, #0
        umlal   r5, r4, lr, r3          @ (r4,r5) += lr * r3
        str     r5, [r0], #4            @ *out++ = r5

m32_even:
        @ Carry is in r4
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r5, #0
        umlal   r4, r5, lr, r3          @ (r5,r4) += lr * r3
        subs    r2, r2, #1
        str     r4, [r0], #4            @ *out++ = r4

        bne     m32_loop
m32_done:
        str     r5, [r0, #0]            @ Store carry word
        ldmfd   sp!, {r4, r5, pc}       @ Restore temps and return
        .size   lbnMulN1_32, .-lbnMulN1_32
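
@ For reference (not part of the original source), a minimal C sketch of what
@ lbnMulN1_32 computes, assuming BNWORD32 is a 32-bit unsigned type such as
@ uint32_t, that uint64_t is available, and that len >= 1 (the assembly above
@ also reads in[0] before checking the length).  The _ref suffix is only to
@ distinguish the sketch from the routine above:
@
@       #include <stdint.h>
@       typedef uint32_t BNWORD32;
@
@       void
@       lbnMulN1_32_ref(BNWORD32 *out, BNWORD32 const *in, unsigned len,
@               BNWORD32 k)
@       {
@               uint64_t t = 0;
@               unsigned i;
@
@               for (i = 0; i < len; i++) {
@                       t = (t >> 32) + (uint64_t)in[i] * k; /* carry + product */
@                       out[i] = (BNWORD32)t;                /* low word */
@               }
@               out[len] = (BNWORD32)(t >> 32);              /* final carry word */
@       }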

@ out[0..len-1] += in[0..len-1] * k, return carry
@ BNWORD32
@ lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
        .global lbnMulAdd1_32
        .type   lbnMulAdd1_32, %function
lbnMulAdd1_32:
        stmfd   sp!, {r4, r5, lr}

        ldr     lr, [r1], #4            @ lr = *in++
        ldr     r5, [r0, #0]            @ r5 = *out
        mov     r4, #0
        umlal   r5, r4, lr, r3          @ (r4,r5) += lr * r3
        str     r5, [r0], #4            @ *out++ = r5

        movs    r2, r2, lsr #1
        bcc     ma32_even
        beq     ma32_done
ma32_loop:
        @ Carry is in r4
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r5, #0
        umlal   r4, r5, lr, r3          @ (r5,r4) += lr * r3
        ldr     lr, [r0, #0]            @ lr = *out
        adds    lr, lr, r4              @ lr += product.low
        str     lr, [r0], #4            @ *out++ = lr
        adc     r4, r5, #0              @ Compute carry and move back to r4

ma32_even:
        @ Another unrolled copy
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r5, #0
        umlal   r4, r5, lr, r3          @ (r5,r4) += lr * r3
        ldr     lr, [r0, #0]            @ lr = *out
        adds    lr, lr, r4              @ lr += product.low
        adc     r4, r5, #0              @ Compute carry and move back to r4
        str     lr, [r0], #4            @ *out++ = lr

        subs    r2, r2, #1
        bne     ma32_loop
ma32_done:
        mov     r0, r4                  @ Return carry
        ldmfd   sp!, {r4, r5, pc}
        .size   lbnMulAdd1_32, .-lbnMulAdd1_32
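
@ Not part of the original source: a rough C equivalent of lbnMulAdd1_32,
@ under the same assumptions as the sketch after lbnMulN1_32 (BNWORD32 ==
@ uint32_t, len >= 1).  The adds/adc pair in the loop above corresponds to
@ adding *out into the low product word and folding the carry into the high
@ word:
@
@       BNWORD32
@       lbnMulAdd1_32_ref(BNWORD32 *out, BNWORD32 const *in, unsigned len,
@               BNWORD32 k)
@       {
@               uint64_t t;
@               BNWORD32 carry = 0;
@               unsigned i;
@
@               for (i = 0; i < len; i++) {
@                       t = (uint64_t)in[i] * k + carry + out[i];
@                       out[i] = (BNWORD32)t;           /* low word back to out */
@                       carry = (BNWORD32)(t >> 32);    /* high word is the carry */
@               }
@               return carry;
@       }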

@@@ This is a bit messy... punt for now...

@ out[0..len-1] -= in[0..len-1] * k, return carry (borrow)
@ BNWORD32
@ lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
        .global lbnMulSub1_32
        .type   lbnMulSub1_32, %function
lbnMulSub1_32:
        stmfd   sp!, {r4, r5, lr}

        mov     r4, #0
        mov     r5, #0
        ldr     lr, [r1], #4            @ lr = *in++
        umull   r4, r5, lr, r3          @ (r5,r4) = lr * r3
        ldr     lr, [r0, #0]            @ lr = *out
        subs    lr, lr, r4              @ lr -= product.low
        str     lr, [r0], #4            @ *out++ = lr
        addcc   r5, r5, #1              @ Propagate borrow

        movs    r2, r2, lsr #1
        bcc     ms32_even
        mov     r4, r5                  @ Get borrow in the right register
        beq     ms32_done
ms32_loop:
        @ Borrow is in r4
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r5, #0
        umlal   r4, r5, lr, r3          @ (r5,r4) += lr * r3
        ldr     lr, [r0, #0]            @ lr = *out
        subs    lr, lr, r4              @ lr -= product.low
        str     lr, [r0], #4            @ *out++ = lr
        addcc   r5, r5, #1              @ Propagate borrow

ms32_even:
        @ Borrow is in r5
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r4, #0
        umlal   r5, r4, lr, r3          @ (r4,r5) += lr * r3
        ldr     lr, [r0, #0]            @ lr = *out
        subs    lr, lr, r5              @ lr -= product.low
        str     lr, [r0], #4            @ *out++ = lr
        addcc   r4, r4, #1              @ Propagate borrow

        subs    r2, r2, #1
        bne     ms32_loop
ms32_done:
        mov     r0, r4                  @ Return borrow
        ldmfd   sp!, {r4, r5, pc}
        .size   lbnMulSub1_32, .-lbnMulSub1_32
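
@ Not part of the original source: a rough C equivalent of lbnMulSub1_32,
@ under the same assumptions (BNWORD32 == uint32_t, len >= 1).  The addcc
@ instructions above play the role of the "borrow++" below: if subtracting
@ the low product word underflows, one more is owed against the next word:
@
@       BNWORD32
@       lbnMulSub1_32_ref(BNWORD32 *out, BNWORD32 const *in, unsigned len,
@               BNWORD32 k)
@       {
@               uint64_t t;
@               BNWORD32 borrow = 0;
@               unsigned i;
@
@               for (i = 0; i < len; i++) {
@                       t = (uint64_t)in[i] * k + borrow;
@                       borrow = (BNWORD32)(t >> 32);
@                       if (out[i] < (BNWORD32)t)  /* subtraction will underflow */
@                               borrow++;
@                       out[i] -= (BNWORD32)t;
@               }
@               return borrow;
@       }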

@@
@@ It's possible to eliminate the store traffic by doing the multiplies
@@ in a different order, forming all the partial products in one column
@@ at a time.  But it requires a 32x32 + 64 -> 65-bit MAC.  The ARM has
@@ the MAC, but no carry out.
@@
@@ The question is, is it faster to do the add directly (3 instructions),
@@ or can we compute the carry out in 1 instruction (+1 to do the add)?
@@ Well... it takes at least 1 instruction to copy the original accumulator
@@ out of the way, and 1 to do a compare, so no.
@@
@@ Now, the overall loop... this is an nxn->2n multiply.  For i = 0..n-1,
@@ column i sums i+1 products (plus the carry in from the previous
@@ column).  For i = n..2*n-1, it sums 2*n-1-i products, plus the
@@ previous carry.
@@
@@ This "non-square" structure makes things more complicated.
@@
@@ void
@@ lbnMulX_32(BNWORD32 *prod, BNWORD32 const *num1, BNWORD32 const *num2,
@@      unsigned len)
@       .global lbnMulX_32
@       .type   lbnMulX_32, %function
@lbnMulX_32:
@       stmfd   sp!, {r4, r5, r6, r7, lr}
@
@       mov     r4, #0
@       mov     r5, #0
@       mov     r0, r4
@       ldmfd   sp!, {r4, r5, pc}
@       .size   lbnMulX_32, .-lbnMulX_32
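
@ Not part of the original source: a rough C sketch of the column-at-a-time
@ (product-scanning) nxn->2n multiply described in the comments above,
@ assuming BNWORD32 == uint32_t and len >= 1.  The "compare against the old
@ accumulator" carry detection discussed above shows up as the acc < prev
@ test; that is what a 32x32 + 64 -> 65-bit MAC would otherwise provide:
@
@       void
@       lbnMulX_32_ref(BNWORD32 *prod, BNWORD32 const *num1,
@               BNWORD32 const *num2, unsigned len)
@       {
@               uint64_t acc = 0;       /* low 64 bits of the current column */
@               BNWORD32 over = 0;      /* overflow (bits 64+) of the column */
@               unsigned i, j, lo, hi;
@
@               for (i = 0; i < 2*len - 1; i++) {
@                       /* Column i sums num1[j] * num2[i-j] for all valid j */
@                       lo = (i < len) ? 0 : i - len + 1;
@                       hi = (i < len) ? i : len - 1;
@                       for (j = lo; j <= hi; j++) {
@                               uint64_t prev = acc;
@                               acc += (uint64_t)num1[j] * num2[i - j];
@                               if (acc < prev)         /* carry out of bit 63 */
@                                       over++;
@                       }
@                       prod[i] = (BNWORD32)acc;        /* low word is final */
@                       acc = (acc >> 32) | ((uint64_t)over << 32);
@                       over = 0;
@               }
@               prod[2*len - 1] = (BNWORD32)acc;        /* top word */
@       }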