@ lbnarm.s - 32-bit bignum primitives for ARM processors with 32x32-bit multiply
@
@ This uses the standard ARM calling convention: arguments are passed,
@ and results returned, in r0..r3.  r0..r3, r12 (IP) and r14 (LR) are
@ volatile across a function call; all others are callee-save.
@ However, note that r14 (LR) holds the return address, so it would be
@ wise to save it somewhere before trashing it.  Fortunately, there is
@ a neat trick available: you can pop the saved LR from the stack straight
@ into r15 (PC), effecting a return at the same time.
@
@ Also, r13 (SP) is probably best left alone, and r15 (PC) is obviously
@ reserved by hardware.  Temps should use lr, then r4..r9, in that order.
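@
@ For example, the prologue/epilogue pattern used throughout this file
@ (a minimal sketch of the pop-LR-into-PC trick described above):
@
@       stmfd   sp!, {r4, r5, lr}       @ save temps and the return address
@       ...                             @ body is free to trash r4, r5, lr
@       ldmfd   sp!, {r4, r5, pc}       @ restore temps and return in one step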
        .text
        .align  2

@ out[0..len] = in[0..len-1] * k
@ void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
        .global lbnMulN1_32
        .type   lbnMulN1_32, %function
lbnMulN1_32:
        stmfd   sp!, {r4, r5, lr}
        ldr     lr, [r1], #4            @ lr = *in++
        umull   r5, r4, lr, r3          @ (r4,r5) = lr * r3
        str     r5, [r0], #4            @ *out++ = r5
        movs    r2, r2, lsr #1          @ r2 = len/2; carry = len & 1
        bcc     m32_even                @ len was even: enter loop halfway
        mov     r5, r4                  @ Get carry in the right register
        beq     m32_done                @ len == 1: just store the carry
m32_loop:
        @ carry is in r5
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r4, #0
        umlal   r5, r4, lr, r3          @ (r4,r5) += lr * r3
        str     r5, [r0], #4            @ *out++ = r5
m32_even:
        @ carry is in r4
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r5, #0
        umlal   r4, r5, lr, r3          @ (r5,r4) += lr * r3
        subs    r2, r2, #1
        str     r4, [r0], #4            @ *out++ = r4
        bne     m32_loop
m32_done:
        str     r5, [r0, #0]            @ store carry
        ldmfd   sp!, {r4, r5, pc}
        .size   lbnMulN1_32, .-lbnMulN1_32
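
@ For reference, a C sketch of the same semantics (illustrative only, not
@ part of this file; it assumes BNWORD32 is a 32-bit unsigned type and
@ BNWORD64 a 64-bit one):
@
@       void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in,
@                        unsigned len, BNWORD32 k)
@       {
@               BNWORD64 t = 0;
@               while (len--) {
@                       t += (BNWORD64)*in++ * k;       /* add product to carry */
@                       *out++ = (BNWORD32)t;           /* store low word */
@                       t >>= 32;                       /* keep carry */
@               }
@               *out = (BNWORD32)t;                     /* out[len] = final carry */
@       }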

@ out[0..len-1] += in[0..len-1] * k, return carry
@ BNWORD32
@ lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
        .global lbnMulAdd1_32
        .type   lbnMulAdd1_32, %function
lbnMulAdd1_32:
        stmfd   sp!, {r4, r5, lr}
        ldr     lr, [r1], #4            @ lr = *in++
        ldr     r5, [r0, #0]            @ r5 = *out
        mov     r4, #0
        umlal   r5, r4, lr, r3          @ (r4,r5) += lr * r3
        str     r5, [r0], #4            @ *out++ = r5
        movs    r2, r2, lsr #1          @ r2 = len/2; carry = len & 1
        bcc     ma32_even               @ len was even: enter loop halfway
        beq     ma32_done               @ len == 1: just return the carry
ma32_loop:
        @ carry is in r4
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r5, #0
        umlal   r4, r5, lr, r3          @ (r5,r4) += lr * r3
        ldr     lr, [r0, #0]            @ lr = *out
        adds    lr, lr, r4              @ lr += product.low
        str     lr, [r0], #4            @ *out++ = lr
        adc     r4, r5, #0              @ Compute carry and move back to r4
ma32_even:
        @ another unrolled copy
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r5, #0
        umlal   r4, r5, lr, r3          @ (r5,r4) += lr * r3
        ldr     lr, [r0, #0]            @ lr = *out
        adds    lr, lr, r4              @ lr += product.low
        adc     r4, r5, #0              @ Compute carry and move back to r4
        str     lr, [r0], #4            @ *out++ = lr
        subs    r2, r2, #1
        bne     ma32_loop
ma32_done:
        mov     r0, r4                  @ return the final carry
        ldmfd   sp!, {r4, r5, pc}
        .size   lbnMulAdd1_32, .-lbnMulAdd1_32
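
@ For reference, a C sketch of the same semantics (illustrative only;
@ same type assumptions as the sketch above):
@
@       BNWORD32 lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in,
@                              unsigned len, BNWORD32 k)
@       {
@               BNWORD64 t = 0;
@               while (len--) {
@                       t += (BNWORD64)*in++ * k + *out;
@                       *out++ = (BNWORD32)t;
@                       t >>= 32;
@               }
@               return (BNWORD32)t;     /* final carry word */
@       }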

@@@ This is a bit messy... punt for now...
@ out[0..len-1] -= in[0..len-1] * k, return carry (borrow)
@ BNWORD32
@ lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
        .global lbnMulSub1_32
        .type   lbnMulSub1_32, %function
lbnMulSub1_32:
        stmfd   sp!, {r4, r5, lr}
        ldr     lr, [r1], #4            @ lr = *in++
        umull   r4, r5, lr, r3          @ (r5,r4) = lr * r3
        ldr     lr, [r0, #0]            @ lr = *out
        subs    lr, lr, r4              @ lr -= product.low
        str     lr, [r0], #4            @ *out++ = lr
        addcc   r5, r5, #1              @ propagate borrow into product.high
        movs    r2, r2, lsr #1          @ r2 = len/2; carry = len & 1
        bcc     ms32_even               @ len was even: enter loop halfway
        mov     r4, r5                  @ Get borrow in the right register
        beq     ms32_done               @ len == 1: just return the borrow
ms32_loop:
        @ borrow is in r4
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r5, #0
        umlal   r4, r5, lr, r3          @ (r5,r4) += lr * r3
        ldr     lr, [r0, #0]            @ lr = *out
        subs    lr, lr, r4              @ lr -= product.low
        str     lr, [r0], #4            @ *out++ = lr
        addcc   r5, r5, #1              @ propagate borrow into product.high
ms32_even:
        @ borrow is in r5
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r4, #0
        umlal   r5, r4, lr, r3          @ (r4,r5) += lr * r3
        ldr     lr, [r0, #0]            @ lr = *out
        subs    lr, lr, r5              @ lr -= product.low
        str     lr, [r0], #4            @ *out++ = lr
        addcc   r4, r4, #1              @ propagate borrow into product.high
        subs    r2, r2, #1
        bne     ms32_loop
ms32_done:
        mov     r0, r4                  @ return the final borrow
        ldmfd   sp!, {r4, r5, pc}
        .size   lbnMulSub1_32, .-lbnMulSub1_32
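
@ For reference, a C sketch of the same semantics (illustrative only;
@ same type assumptions as the sketches above):
@
@       BNWORD32 lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in,
@                              unsigned len, BNWORD32 k)
@       {
@               BNWORD32 borrow = 0;
@               while (len--) {
@                       BNWORD64 t = (BNWORD64)*in++ * k + borrow;
@                       BNWORD32 lo = (BNWORD32)t;
@                       borrow = (BNWORD32)(t >> 32);
@                       if (*out < lo)
@                               borrow++;       /* subtraction underflowed */
@                       *out++ -= lo;
@               }
@               return borrow;
@       }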

@@
@@ It's possible to eliminate the store traffic by doing the multiplies
@@ in a different order, forming all the partial products in one column
@@ at a time.  But that requires a 32x32 + 64 -> 65-bit multiply-accumulate.
@@ The ARM has the 32x32 + 64 -> 64-bit MAC (umlal), but no carry out.
@@
@@ The question is, is it faster to do the add directly (3 instructions),
@@ or can we compute the carry out in 1 instruction (+1 to do the add)?
@@ Well... it takes at least 1 instruction to copy the original accumulator
@@ out of the way, and 1 more to do a compare, so no.
@@
@@ Now, the overall loop: this is an n x n -> 2n-word multiply.  Column i
@@ of the product sums i+1 partial products for i = 0..n-1 (plus the
@@ carry in from the previous column), and 2*n-1-i partial products,
@@ plus the previous carry, for i = n..2*n-1.
@@
@@ This "non-square" structure makes things more complicated.  A C sketch
@@ of the column-at-a-time ordering follows the prototype below.
@@
@@ void
@@ lbnMulX_32(BNWORD32 *prod, BNWORD32 const *num1, BNWORD32 const *num2,
@@            unsigned len)
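@@
@@ A C sketch of that column-at-a-time (Comba-style) ordering, illustrative
@@ only (it assumes len >= 1 and the BNWORD32/BNWORD64 types used in the
@@ sketches above; acc holds the low 64 bits of a column sum, over the
@@ bits above them):
@@
@@      void lbnMulX_32(BNWORD32 *prod, BNWORD32 const *num1,
@@                      BNWORD32 const *num2, unsigned len)
@@      {
@@              BNWORD64 acc = 0;       /* low 64 bits of column sum */
@@              BNWORD32 over = 0;      /* 65th and higher bits */
@@              unsigned i, j;
@@
@@              for (i = 0; i < 2*len - 1; i++) {
@@                      unsigned lo = i < len ? 0 : i - len + 1;
@@                      unsigned hi = i < len ? i : len - 1;
@@                      for (j = lo; j <= hi; j++) {
@@                              BNWORD64 t = (BNWORD64)num1[j] * num2[i-j];
@@                              acc += t;
@@                              if (acc < t)    /* 64-bit add wrapped */
@@                                      over++;
@@                      }
@@                      prod[i] = (BNWORD32)acc;
@@                      acc = (acc >> 32) | ((BNWORD64)over << 32);
@@                      over = 0;
@@              }
@@              prod[2*len - 1] = (BNWORD32)acc;
@@      }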
@       .global lbnMulX_32
@       .type   lbnMulX_32, %function
@lbnMulX_32:
@       stmfd   sp!, {r4, r5, r6, r7, lr}
@
@       mov     r4, #0
@       mov     r5, #0
@       mov     r0, r4
@       ldmfd   sp!, {r4, r5, r6, r7, pc}
@       .size   lbnMulX_32, .-lbnMulX_32