* Copyright (c) 1995  Colin Plumb.  All rights reserved.
* For licensing and other legal details, see the file legal.c.
*
* lbn68360.c - 32-bit bignum primitives for 683xx processors.
*
* This code is using InterTools calling convention, which is a bit odd.
* One minor note is that the default variable sizes are
* char = unsigned 8, short = 8 (in violation of ANSI!),
* int = 16, long = 32.  Longs (including on the stack) are 16-bit aligned.
* Arguments are padded to 16 bits.
* A6 is used as a frame pointer, and globals are indexed off A5.
* Return values are passed in D0 or A0 (or FP0), depending on type.
* D0, D1, A0 and A4 (!) are volatile across function calls.  A1
* must be preserved!
* 
* This code assumes 16-bit ints.  Code for 32-bit ints is commented out
* with "**".
*
* Regardless of UINT_MAX, only bignums up to 64K words (2 million bits)
* are supported.  (68k hackers will recognize this as a consequence of
* using dbra.)  This could be extended easily if anyone cares.
*
* These primitives use little-endian word order.
* (The order of bytes within words is irrelevant to this issue.)

* The Metrowerks C compiler (1.2.2) produces bad 68k code for the
* following input, which happens to be the inner loop of lbnSub1,
* so it has been rewritten in assembly, even though it is not terribly
* speed-critical.  (Optimizer on or off does not matter.)
* 
* unsigned
* decrement(unsigned *num, unsigned len)
* {
*      do {
*              if ((*num++)-- != 0)
*                      return 0;
*      } while (--len);
*      return 1;
* }

* BNWORD32 lbnSub1_32(BNWORD32 *num, unsigned len, BNWORD32 borrow)
*
* Subtracts "borrow" from num[0] and ripples any resulting borrow up
* through the remaining len-1 words.  The borrow out of the last word
* touched (0 or 1) is returned in d0.
* Stack on entry: 4(sp) = num, 8(sp) = len (16 bits), 10(sp) = borrow.
* Only d0 and a0 are modified.
	SECTION	S_lbnSub1_32,,"code"
	XDEF	_lbnSub1_32
_lbnSub1_32:
	move.l	10(sp),d0	* d0 = amount to subtract
**	move.l	12(sp),d0	* (offset when ints are 32 bits)
	movea.l	4(sp),a0	* a0 -> least significant word
	sub.l	d0,(a0)+	* num[0] -= borrow; C and X = borrow out
	bcc	sub1_exit	* nothing to propagate, X is clear
	move.w	8(sp),d0	* d0.w = len
**	move.w	10(sp),d0	* (offset when ints are 32 bits)
	subq.w	#2,d0		* dbcc counter for the len-1 words left
	bcs	sub1_exit	* no words left; subq left X set
sub1_ripple:
	subq.l	#1,(a0)+	* take the borrow from the next word
	dbcc	d0,sub1_ripple	* stop when the borrow dies or words run out
sub1_exit:
	moveq.l	#0,d0		* moveq leaves X untouched
	addx.w	d0,d0		* d0 = 0 + 0 + X
	rts

* BNWORD32 lbnAdd1_32(BNWORD32 *num, unsigned len, BNWORD32 carry)
*
* Adds "carry" to num[0] and ripples any resulting carry up through the
* remaining len-1 words.  The carry out of the last word touched (0 or 1)
* is returned in d0.
* Stack on entry: 4(sp) = num, 8(sp) = len (16 bits), 10(sp) = carry.
* Only d0 and a0 are modified.
	SECTION	S_lbnAdd1_32,,"code"
	XDEF	_lbnAdd1_32
_lbnAdd1_32:
	move.l	10(sp),d0	* d0 = amount to add
**	move.l	12(sp),d0	* (offset when ints are 32 bits)
	movea.l	4(sp),a0	* a0 -> least significant word
	add.l	d0,(a0)+	* num[0] += carry; C and X = carry out
	bcc	add1_exit	* nothing to propagate, X is clear
	move.w	8(sp),d0	* d0.w = len
**	move.w	10(sp),d0	* (offset when ints are 32 bits)
	subq.w	#2,d0		* dbcc counter for the len-1 words left
	bcs	add1_exit	* no words left; subq left X set
add1_ripple:
	addq.l	#1,(a0)+	* push the carry into the next word
	dbcc	d0,add1_ripple	* stop when the carry dies or words run out
add1_exit:
	moveq.l	#0,d0		* moveq leaves X untouched
	addx.w	d0,d0		* d0 = 0 + 0 + X
	rts

* void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
*
* Multiplies the len words at "in" by k and stores len+1 words at "out":
* one low word per input word, then the final carry word.
*
* Register use:
*   a4 = out pointer    a0 = in pointer     d2 = k
*   d3 = current input word / low half of the product
*   d0, d1 = high half of the product (carry), alternating each word
*   d4 = constant 0, so "addx.l d4,Dn" adds just the X flag
*   d5 = dbra loop counter
*
* The loop handles two words per pass.  The first word is done before the
* loop; the remaining len-1 words enter at m32_even when that count is odd
* and at m32_loop when it is even.  There is no check for len == 0.
	SECTION	S_lbnMulN1_32,,"code"
	XDEF	_lbnMulN1_32
_lbnMulN1_32:
	movem.l	d2-d5,-(sp)	* 16 bytes of extra data
	moveq.l	#0,d4
	move.l	20(sp),a4	* out
	move.l	24(sp),a0	* in
	move.w	28(sp),d5	* len
	move.l	30(sp),d2	* k
**	move.w	30(sp),d5	* len
**	move.l	32(sp),d2	* k

	move.l	(a0)+,d3	* First multiply
	mulu.l	d2,d1:d3	* dc.w    0x4c02, 0x3401
	move.l	d3,(a4)+

	subq.w	#1,d5		* Setup for loop unrolling
	lsr.w	#1,d5		* d5 = (len-1)/2, C = low bit of len-1
	bcs.s	m32_even	* odd number of words left: start in second half
	beq.s	m32_short	* no words left: just store the carry
	
	subq.w	#1,d5		* Set up software pipeline properly
	move.l	d1,d0		* m32_loop expects the incoming carry in d0
	
m32_loop:
	move.l	(a0)+,d3
	mulu.l	d2,d1:d3	* dc.w    0x4c02, 0x3401
	add.l	d0,d3		* add carry from the previous word
	addx.l	d4,d1		* fold the carry of that add into the high half
	move.l	d3,(a4)+
m32_even:

	move.l	(a0)+,d3
	mulu.l	d2,d0:d3	* dc.w    0x4c02, 0x3400
	add.l	d1,d3		* add carry from the previous word
	addx.l	d4,d0		* fold the carry of that add into the high half
	move.l	d3,(a4)+

	dbra	d5,m32_loop
	
	move.l	d0,(a4)		* loop always exits with the carry in d0
	movem.l	(sp)+,d2-d5
	rts
m32_short:
	move.l	d1,(a4)		* len was 1: carry is still in d1
	movem.l	(sp)+,d2-d5
	rts

* BNWORD32
* lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
*
* Adds in[0..len-1] * k into out[0..len-1] in place and returns the carry
* word out of the top in d0.
*
* Register use:
*   a4 = out pointer    a0 = in pointer     d2 = k
*   d3 = current input word / low half of the product
*   d0, d1 = high half of the product (carry), alternating each word
*   d4 = constant 0, so "addx.l d4,Dn" adds just the X flag
*   d5 = dbra loop counter
*
* The loop handles two words per pass.  The first word is done before the
* loop; the remaining len-1 words enter at ma32_even when that count is
* odd and at ma32_loop when it is even.  There is no check for len == 0.
	SECTION	S_lbnMulAdd1_32,,"code"
	XDEF	_lbnMulAdd1_32
_lbnMulAdd1_32:
	movem.l	d2-d5,-(sp)	* 16 bytes of extra data
	moveq.l	#0,d4
	move.l	20(sp),a4	* out
	move.l	24(sp),a0	* in
	move.w	28(sp),d5	* len
	move.l	30(sp),d2	* k
**	move.w	30(sp),d5	* len
**	move.l	32(sp),d2	* k

	move.l	(a0)+,d3	* First multiply
	mulu.l	d2,d1:d3	* dc.w    0x4c02, 0x3401
	add.l	d3,(a4)+	* out[0] += low half
	addx.l	d4,d1		* carry of that add goes into the high half

	subq.w	#1,d5	* Setup for loop unrolling
	lsr.w	#1,d5		* d5 = (len-1)/2, C = low bit of len-1
	bcs.s	ma32_even	* odd number of words left: start in second half
	beq.s	ma32_short	* no words left: return the carry in d1
	
	subq.w	#1,d5	* Set up software pipeline properly
	move.l	d1,d0		* ma32_loop expects the incoming carry in d0
	
ma32_loop:
	move.l	(a0)+,d3
	mulu.l	d2,d1:d3	* dc.w    0x4c02, 0x3401
	add.l	d0,d3		* add carry from the previous word
	addx.l	d4,d1		* fold the carry of that add into the high half
	add.l	d3,(a4)+	* accumulate into out
	addx.l	d4,d1		* and fold that carry in as well
ma32_even:

	move.l	(a0)+,d3
	mulu.l	d2,d0:d3	* dc.w    0x4c02, 0x3400
	add.l	d1,d3		* add carry from the previous word
	addx.l	d4,d0		* fold the carry of that add into the high half
	add.l	d3,(a4)+	* accumulate into out
	addx.l	d4,d0		* and fold that carry in as well

	dbra	d5,ma32_loop
	
* The loop always exits with the carry in d0, which is the return value.
	movem.l	(sp)+,d2-d5
	rts
ma32_short:
	move.l	d1,d0		* len was 1: move the carry to the return register
	movem.l	(sp)+,d2-d5
	rts

* BNWORD32
* lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
*
* Subtracts in[0..len-1] * k from out[0..len-1] in place and returns the
* borrow word out of the top in d0.
*
* Register use:
*   a4 = out pointer    a0 = in pointer     d2 = k
*   d3 = current input word / low half of the product
*   d0, d1 = high half of the product (pending borrow), alternating each word
*   d4 = constant 0, so "addx.l d4,Dn" adds just the X flag
*   d5 = dbra loop counter
*
* The loop handles two words per pass.  The first word is done before the
* loop; the remaining len-1 words enter at ms32_even when that count is
* odd and at ms32_loop when it is even.  There is no check for len == 0.
	SECTION	S_lbnMulSub1_32,,"code"
	XDEF	_lbnMulSub1_32
_lbnMulSub1_32:
	movem.l	d2-d5,-(sp)	* 16 bytes of extra data
	moveq.l	#0,d4
	move.l	20(sp),a4	* out
	move.l	24(sp),a0	* in
	move.w	28(sp),d5	* len
	move.l	30(sp),d2	* k
**	move.w	30(sp),d5	* len
**	move.l	32(sp),d2	* k

	move.l	(a0)+,d3	* First multiply
	mulu.l	d2,d1:d3	* dc.w    0x4c02, 0x3401
	sub.l	d3,(a4)+	* out[0] -= low half
	addx.l	d4,d1		* borrow of that subtract goes into the high half

	subq.w	#1,d5	* Setup for loop unrolling
	lsr.w	#1,d5		* d5 = (len-1)/2, C = low bit of len-1
	bcs.s	ms32_even	* odd number of words left: start in second half
	beq.s	ms32_short	* no words left: return the borrow in d1
	
	subq.w	#1,d5	* Set up software pipeline properly
	move.l	d1,d0		* ms32_loop expects the incoming borrow in d0
	
ms32_loop:
	move.l	(a0)+,d3
	mulu.l	d2,d1:d3	* dc.w	0x4c02, 0x3401
	add.l	d0,d3		* add pending borrow from the previous word
	addx.l	d4,d1		* fold the carry of that add into the high half
	sub.l	d3,(a4)+	* subtract from out
	addx.l	d4,d1		* and fold that borrow in as well
ms32_even:

	move.l	(a0)+,d3
	mulu.l	d2,d0:d3	* dc.w	0x4c02, 0x3400
	add.l	d1,d3		* add pending borrow from the previous word
	addx.l	d4,d0		* fold the carry of that add into the high half
	sub.l	d3,(a4)+	* subtract from out
	addx.l	d4,d0		* and fold that borrow in as well

	dbra	d5,ms32_loop
	
* The loop always exits with the borrow in d0, which is the return value.
	movem.l	(sp)+,d2-d5
	rts
	
ms32_short:
	move.l	d1,d0		* len was 1: move the borrow to the return register
	movem.l	(sp)+,d2-d5
	rts


* BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
*
* Divides the 64-bit value nh:nl by d with a single divu.l.  The 32-bit
* quotient is stored through q and the remainder is returned in d0.
* Stack on entry: 4(sp) = q, 8(sp) = nh, 12(sp) = nl, 16(sp) = d.
* No check is made here for d == 0 or for a quotient that does not fit
* in 32 bits.
	SECTION	S_lbnDiv21_32,,"code"
	XDEF	_lbnDiv21_32
_lbnDiv21_32:
	movea.l	4(sp),a0	* a0 = q
	move.l	12(sp),d1	* d1 = nl, low half of the dividend
	move.l	8(sp),d0	* d0 = nh, high half of the dividend
	divu.l	16(sp),d0:d1	* dc.w	0x4c6f, 0x1400, 16
	move.l	d1,(a0)		* *q = quotient; remainder stays in d0
	rts

* unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
*
* Returns, in d0, the remainder of n[0..len-1] divided by the 16-bit
* value d.  The words are consumed from the most significant (n[len-1])
* down to n[0], carrying the running remainder in d0.
*
* Register use:
*   a0 = pointer into n     d1 = len, then dbra counter
*   d2 = d (zero-extended)  d3 = current word / discarded quotient
*   d0 = running remainder
* d2 is saved on the stack and d3 is parked in a4, which is volatile in
* this calling convention.  There is no check for len == 0 or d == 0.
	SECTION	S_lbnModQ_32,,"code"
	XDEF	_lbnModQ_32
_lbnModQ_32:
	move.l	4(sp),a0	* n
	move.l	d2,-(sp)	* save d2; argument offsets below are +4
	move.l	d3,a4		* save d3 without touching the stack
	moveq.l	#0,d1		* clear upper halves so the 16-bit
	moveq.l	#0,d2		* arguments end up zero-extended
	move.w	12(sp),d1	* len
	move.w	14(sp),d2	* d
**	move.l	12(sp),d1	* len
**	move.l	16(sp),d2	* d
	lea  -4(a0,d1.L*4),a0	* dc.w	0x41f0, 0x1cfc

* First time, divide 32/32 - may be faster than 64/32
	move.l	(a0),d3
	divul.l	d2,d0:d3	* dc.w    0x4c42, 0x3000
	subq.w	#2,d1		* dbra counter for the len-1 words left
* Unsigned test, as in lbnSub1_32/lbnAdd1_32: carry is set only when len
* was below 2.  A signed test (bmi) would also skip the loop for every
* len of 0x8002 or more and return a wrong remainder.
	bcs	mq32_done

mq32_loop:
	move.l	-(a0),d3
	divu.l	d2,d0:d3	* dc.w    0x4c42,0x3400
	dbra	d1,mq32_loop

mq32_done:
	move.l	(sp)+,d2	* restore saved registers; d0 is the result
	move.l	a4,d3
	rts

	end