123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280 |
- * Copyright (c) 1995 Colin Plumb. All rights reserved.
- * For licensing and other legal details, see the file legal.c.
- *
- * lbn68360.c - 32-bit bignum primitives for 683xx processors.
- *
- * This code is using InterTools calling convention, which is a bit odd.
- * One minor note is that the default variable sizes are
- * char = unsigned 8, short = 8 (in violation of ANSI!),
- * int = 16, long = 32. Longs (including on the stack) are 16-bit aligned.
- * Arguments are apdded to 16 bits.
- * A6 is used as a frame pointer, and globals are indexed off A5.
- * Return valies are passes id D0 or A0 (or FP0), depending on type.
- * D0, D1, A0 and A4 (!) are volatile across function calls. A1
- * must be preserved!
- *
- * This code assumes 16-bit ints. Code for 32-bit ints is commented out
- * with "**".
- *
- * Regardless of UINT_MAX, only bignums up to 64K words (2 million bits)
- * are supported. (68k hackers will recognize this as a consequence of
- * using dbra.) This could be extended easily if anyone cares.
- *
- * These primitives use little-endian word order.
- * (The order of bytes within words is irrelevant to this issue.)
- * The Metrowerks C compiler (1.2.2) produces bad 68k code for the
- * following input, which happens to be the inner loop of lbnSub1,
- * so it has been rewritees in assembly, even though it is not terribly
- * speed-critical. (Optimizer on or off does not matter.)
- *
- * unsigned
- * decrement(unsigned *num, unsigned len)
- * {
- * do {
- * if ((*num++)-- != 0)
- * return 0;
- * } while (--len);
- * return 1;
- * }
- * BNWORD32 lbnSub1_32(BNWORD32 *num, unsigned len, BNWORD32 borrow)
- *
- * Subtracts "borrow" from the first (least significant) word of num and
- * then subtracts 1 from each following word for as long as the previous
- * word underflowed, touching at most len words. Returns in d0 the borrow
- * out of the last word processed: 1 if all len words underflowed, else 0.
- * The first word is accessed unconditionally, so len is assumed >= 1.
- *
- * The result is taken from the X flag at sub_done. Every path into
- * sub_done arrives with X holding the borrow to return, and moveq does
- * not modify X.
- SECTION S_lbnSub1_32,,"code"
- XDEF _lbnSub1_32
- _lbnSub1_32:
- movea.l 4(sp),a0 * num
- move.l 10(sp),d0 * borrow
- ** move.l 12(sp),d0 * borrow
- sub.l d0,(a0)+ * num[0] -= borrow; C and X = borrow out
- bcc sub_done * no borrow: X is clear, so 0 is returned
- move.w 8(sp),d0 * len
- ** move.w 10(sp),d0 * len
- subq.w #2,d0 * d0 = len-2 = dbcc count for the other len-1 words
- bcs sub_done * len was 1: subq borrowed, X is set, so 1 is returned
- sub_loop:
- subq.l #1,(a0)+ * propagate the borrow into the next word
- dbcc d0,sub_loop * stop when a word did not underflow, or words run out
- sub_done:
- moveq.l #0,d0 * clear d0 (X is left untouched)
- addx.w d0,d0 * d0 = 0 + 0 + X = borrow out
- rts
- * BNWORD32 lbnAdd1_32(BNWORD32 *num, unsigned len, BNWORD32 carry)
- *
- * Adds "carry" to the first (least significant) word of num and then
- * adds 1 to each following word for as long as the previous word
- * overflowed, touching at most len words. Returns in d0 the carry out of
- * the last word processed: 1 if all len words overflowed, else 0.
- * The first word is accessed unconditionally, so len is assumed >= 1.
- *
- * The result is taken from the X flag at add_done. Every path into
- * add_done arrives with X holding the carry to return, and moveq does
- * not modify X.
- SECTION S_lbnAdd1_32,,"code"
- XDEF _lbnAdd1_32
- _lbnAdd1_32:
- movea.l 4(sp),a0 * num
- move.l 10(sp),d0 * carry
- ** move.l 12(sp),d0 * carry
- add.l d0,(a0)+ * num[0] += carry; C and X = carry out
- bcc add_done * no carry: X is clear, so 0 is returned
- move.w 8(sp),d0 * len
- ** move.w 10(sp),d0 * len
- subq.w #2,d0 * d0 = len-2 = dbcc count for the other len-1 words
- bcs add_done * len was 1: subq borrowed, X is set, so 1 is returned
- add_loop:
- addq.l #1,(a0)+ * propagate the carry into the next word
- dbcc d0,add_loop * stop when a word did not overflow, or words run out
- add_done:
- moveq.l #0,d0 * clear d0 (X is left untouched)
- addx.w d0,d0 * d0 = 0 + 0 + X = carry out
- rts
- * void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
- *
- * Multiplies the len-word number at "in" by k and writes the result to
- * "out": len product words followed by one final carry word (len+1 words
- * in total). The first word is multiplied before any length test, so len
- * is assumed >= 1.
- *
- * Register use:
- * a4 = out pointer, a0 = in pointer, d2 = k
- * d3 = current input word, then low half of its product
- * d0, d1 = high half of a product, alternating as the carry that is
- * added into the next word
- * d4 = 0, so that "addx.l d4,Dn" adds only the X flag
- * d5 = loop counter; the loop is unrolled twice, so it counts pairs
- *
- * The stack offsets below include the 16 bytes pushed by movem.
- SECTION S_lbnMulN1_32,,"code"
- XDEF _lbnMulN1_32
- _lbnMulN1_32:
- movem.l d2-d5,-(sp) * 16 bytes of extra data
- moveq.l #0,d4 * constant zero for addx
- move.l 20(sp),a4 * out
- move.l 24(sp),a0 * in
- move.w 28(sp),d5 * len
- move.l 30(sp),d2 * k
- ** move.w 30(sp),d5 * len
- ** move.l 32(sp),d2 * k
- move.l (a0)+,d3 * First multiply
- mulu.l d2,d1:d3 * dc.w 0x4c02, 0x3401
- move.l d3,(a4)+ * store low half; d1 = carry into next word
- subq.w #1,d5 * Setup for loop unrolling
- lsr.w #1,d5 * d5 = (len-1)/2; C = (len-1) is odd; Z = d5 is zero
- bcs.s m32_even * odd count left: enter at the second half (carry in d1)
- beq.s m32_short * nothing left: only the carry word remains
- 
- subq.w #1,d5 * Set up software pipeline properly
- move.l d1,d0 * first half of the loop expects its carry in d0
- 
- m32_loop:
- move.l (a0)+,d3
- mulu.l d2,d1:d3 * dc.w 0x4c02, 0x3401
- add.l d0,d3 * low half + carry in; X = overflow
- addx.l d4,d1 * fold that overflow into the high half
- move.l d3,(a4)+
- m32_even:
- move.l (a0)+,d3
- mulu.l d2,d0:d3 * dc.w 0x4c02, 0x3400
- add.l d1,d3 * low half + carry in; X = overflow
- addx.l d4,d0 * fold that overflow into the high half
- move.l d3,(a4)+
- dbra d5,m32_loop
- 
- move.l d0,(a4) * final carry word
- movem.l (sp)+,d2-d5
- rts
- m32_short:
- move.l d1,(a4) * len == 1: carry word comes straight from the first multiply
- movem.l (sp)+,d2-d5
- rts
- * BNWORD32
- * lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
- *
- * Multiplies the len-word number at "in" by k and adds the product into
- * the len words at "out". Returns in d0 the carry word left over after
- * the last word. The first word is processed before any length test, so
- * len is assumed >= 1.
- *
- * Register use:
- * a4 = out pointer, a0 = in pointer, d2 = k
- * d3 = current input word, then low half of its product
- * d0, d1 = high half of a product, alternating as the carry that is
- * added into the next word
- * d4 = 0, so that "addx.l d4,Dn" adds only the X flag
- * d5 = loop counter; the loop is unrolled twice, so it counts pairs
- *
- * The stack offsets below include the 16 bytes pushed by movem.
- SECTION S_lbnMulAdd1_32,,"code"
- XDEF _lbnMulAdd1_32
- _lbnMulAdd1_32:
- movem.l d2-d5,-(sp) * 16 bytes of extra data
- moveq.l #0,d4 * constant zero for addx
- move.l 20(sp),a4 * out
- move.l 24(sp),a0 * in
- move.w 28(sp),d5 * len
- move.l 30(sp),d2 * k
- ** move.w 30(sp),d5 * len
- ** move.l 32(sp),d2 * k
- move.l (a0)+,d3 * First multiply
- mulu.l d2,d1:d3 * dc.w 0x4c02, 0x3401
- add.l d3,(a4)+ * out[0] += low half; X = carry out
- addx.l d4,d1 * fold that carry into the high half
- subq.w #1,d5 * Setup for loop unrolling
- lsr.w #1,d5 * d5 = (len-1)/2; C = (len-1) is odd; Z = d5 is zero
- bcs.s ma32_even * odd count left: enter at the second half (carry in d1)
- beq.s ma32_short * nothing left: return the carry in d1
- 
- subq.w #1,d5 * Set up software pipeline properly
- move.l d1,d0 * first half of the loop expects its carry in d0
- 
- ma32_loop:
- move.l (a0)+,d3
- mulu.l d2,d1:d3 * dc.w 0x4c02, 0x3401
- add.l d0,d3 * low half + carry in; X = overflow
- addx.l d4,d1 * fold that overflow into the high half
- add.l d3,(a4)+ * accumulate into out; X = carry out
- addx.l d4,d1 * fold that carry into the high half too
- ma32_even:
- move.l (a0)+,d3
- mulu.l d2,d0:d3 * dc.w 0x4c02, 0x3400
- add.l d1,d3 * low half + carry in; X = overflow
- addx.l d4,d0 * fold that overflow into the high half
- add.l d3,(a4)+ * accumulate into out; X = carry out
- addx.l d4,d0 * fold that carry into the high half too
- dbra d5,ma32_loop
- 
- movem.l (sp)+,d2-d5 * the loop always ends with the carry in d0
- rts
- ma32_short:
- move.l d1,d0 * len == 1: carry comes from the first multiply
- movem.l (sp)+,d2-d5
- rts
- * BNWORD32
- * lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
- *
- * Multiplies the len-word number at "in" by k and subtracts the product
- * from the len words at "out". Returns in d0 the borrow word left over
- * after the last word. The first word is processed before any length
- * test, so len is assumed >= 1.
- *
- * Register use:
- * a4 = out pointer, a0 = in pointer, d2 = k
- * d3 = current input word, then low half of its product
- * d0, d1 = high half of a product, alternating as the amount that is
- * carried into the next word's subtrahend
- * d4 = 0, so that "addx.l d4,Dn" adds only the X flag
- * d5 = loop counter; the loop is unrolled twice, so it counts pairs
- *
- * A borrow from "sub.l d3,(a4)+" is handled by adding 1 to the pending
- * high half, i.e. it is subtracted from the next word along with it.
- * The stack offsets below include the 16 bytes pushed by movem.
- SECTION S_lbnMulSub1_32,,"code"
- XDEF _lbnMulSub1_32
- _lbnMulSub1_32:
- movem.l d2-d5,-(sp) * 16 bytes of extra data
- moveq.l #0,d4 * constant zero for addx
- move.l 20(sp),a4 * out
- move.l 24(sp),a0 * in
- move.w 28(sp),d5 * len
- move.l 30(sp),d2 * k
- ** move.w 30(sp),d5 * len
- ** move.l 32(sp),d2 * k
- move.l (a0)+,d3 * First multiply
- mulu.l d2,d1:d3 * dc.w 0x4c02, 0x3401
- sub.l d3,(a4)+ * out[0] -= low half; X = borrow
- addx.l d4,d1 * add the borrow to the high half
- subq.w #1,d5 * Setup for loop unrolling
- lsr.w #1,d5 * d5 = (len-1)/2; C = (len-1) is odd; Z = d5 is zero
- bcs.s ms32_even * odd count left: enter at the second half (carry in d1)
- beq.s ms32_short * nothing left: return the borrow word in d1
- 
- subq.w #1,d5 * Set up software pipeline properly
- move.l d1,d0 * first half of the loop expects its carry in d0
- 
- ms32_loop:
- move.l (a0)+,d3
- mulu.l d2,d1:d3 * dc.w 0x4c02, 0x3401
- add.l d0,d3 * low half + carry in; X = overflow
- addx.l d4,d1 * fold that overflow into the high half
- sub.l d3,(a4)+ * subtract from out; X = borrow
- addx.l d4,d1 * add the borrow to the high half
- ms32_even:
- move.l (a0)+,d3
- mulu.l d2,d0:d3 * dc.w 0x4c02, 0x3400
- add.l d1,d3 * low half + carry in; X = overflow
- addx.l d4,d0 * fold that overflow into the high half
- sub.l d3,(a4)+ * subtract from out; X = borrow
- addx.l d4,d0 * add the borrow to the high half
- dbra d5,ms32_loop
- 
- movem.l (sp)+,d2-d5 * the loop always ends with the borrow word in d0
- rts
- 
- ms32_short:
- move.l d1,d0 * len == 1: borrow word comes from the first multiply
- movem.l (sp)+,d2-d5
- rts
- * BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
- *
- * Divides the 64-bit value nh:nl by d with a single 64/32 divu.l.
- * The 32-bit quotient is stored through q and the 32-bit remainder is
- * returned in d0. No check is made here for a zero divisor or for a
- * quotient that does not fit in 32 bits.
- SECTION S_lbnDiv21_32,,"code"
- XDEF _lbnDiv21_32
- _lbnDiv21_32:
- movea.l 4(sp),a0 * q
- move.l 12(sp),d1 * nl = low half of dividend
- move.l 8(sp),d0 * nh = high half of dividend
- divu.l 16(sp),d0:d1 * dc.w 0x4c6f, 0x1400, 16
- move.l d1,(a0) * *q = quotient; remainder stays in d0
- rts
- * unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
- *
- * Returns in d0 the remainder of the len-word number n divided by d.
- * n is stored least significant word first, so the division starts at
- * word len-1 and walks down; each step divides (previous remainder):(next
- * word) by d and keeps only the remainder. The top word is read
- * unconditionally, so len is assumed >= 1; d is assumed nonzero.
- *
- * len and d are 16-bit arguments and are zero-extended into d1 and d2.
- * d2 is saved on the stack (so argument offsets below are 4 larger) and
- * d3 is saved in a4.
- *
- * The test after "subq.w #2,d1" uses bcs (unsigned borrow) rather than
- * bmi: len-2 has bit 15 set for every len above 32769, and a signed
- * test would skip the loop for those lengths.
- SECTION S_lbnModQ_32,,"code"
- XDEF _lbnModQ_32
- _lbnModQ_32:
- move.l 4(sp),a0 * n
- move.l d2,-(sp) * save d2
- move.l d3,a4 * save d3 in a4
- moveq.l #0,d1
- moveq.l #0,d2
- move.w 12(sp),d1 * len
- move.w 14(sp),d2 * d
- ** move.l 12(sp),d1 * len
- ** move.l 16(sp),d2 * d
- lea -4(a0,d1.L*4),a0 * dc.w 0x41f0, 0x1cfc
- * First time, divide 32/32 - may be faster than 64/32
- move.l (a0),d3 * most significant word
- divul.l d2,d0:d3 * dc.w 0x4c42, 0x3000
- subq.w #2,d1 * d1 = len-2 = dbra count for the other len-1 words
- bcs mq32_done * len was 1: nothing more to divide
- mq32_loop:
- move.l -(a0),d3 * next lower word
- divu.l d2,d0:d3 * dc.w 0x4c42, 0x3400
- dbra d1,mq32_loop
- 
- mq32_done:
- move.l (sp)+,d2 * restore d2
- move.l a4,d3 * restore d3
- rts
- end
|