;;; Copyright (c) 1995, Colin Plumb.
;;; For licensing and other legal details, see the file legal.c.
;;;
;;; Assembly primitives for bignum library, 80386 family, 32-bit code.
;;;
;;; Several primitives are included here.  Only lbnMulAdd1 is *really*
;;; critical, but once that's written, lbnMulN1 and lbnMulSub1 are quite
;;; easy to write as well, so they are included here too.
;;; lbnDiv21 and lbnModQ are so easy to write that they're included, too.
;;;
;;; All functions here are for 32-bit flat mode, i.e. near code and
;;; near data, although the near offsets are 32 bits.
;;;
;;; The usual 80x86 calling conventions have AX, BX, CX and DX
;;; volatile, and SI, DI, SP and BP preserved across calls.
;;; This includes the "E"xtended forms of all of those registers.
;;;
;;; However, just to be confusing, recent 32-bit DOS compilers have
;;; quietly changed that to require EBX preserved across calls, too.
;;; Joy.

	.386

;_TEXT	segment	para public use32 'CODE'	; 16-byte aligned because 486 cares
;_TEXT	ends

ifdef @Version
if @Version le 510
FLAT	group	_TEXT
endif
else
FLAT	group	_TEXT
endif
	assume	cs:FLAT, ds:FLAT, ss:FLAT

_TEXT	segment	para public use32 'CODE'	; 16-byte aligned because 486 cares

	public	_lbnMulN1_32
	public	_lbnMulAdd1_32
	public	_lbnMulSub1_32
	public	_lbnDiv21_32
	public	_lbnModQ_32

;; Register usage:
;; eax - low half of product
;; ebx - carry to next iteration
;; ecx - multiplier (k)
;; edx - high half of product
;; esi - source pointer
;; edi - dest pointer
;; ebp - loop counter
;;
;; Stack frame (columns show the offsets after each successive push):
;; +--------+	esp+20	esp+24	esp+28	esp+32	esp+36
;; |    k   |
;; +--------+	esp+16	esp+20	esp+24	esp+28	esp+32
;; |   len  |
;; +--------+	esp+12	esp+16	esp+20	esp+24	esp+28
;; |   in   |
;; +--------+	esp+8	esp+12	esp+16	esp+20	esp+24
;; |   out  |
;; +--------+	esp+4	esp+8	esp+12	esp+16	esp+20
;; | return |
;; +--------+	esp	esp+4	esp+8	esp+12	esp+16
;; |   esi  |
;; +--------+		esp	esp+4	esp+8	esp+12
;; |   ebp  |
;; +--------+			esp	esp+4	esp+8
;; |   ebx  |
;; +--------+				esp	esp+4
;; |   edi  |
;; +--------+					esp

	align	16
_lbnMulN1_32	proc	near

	push	esi		; U
	mov	esi,[esp+12]	; V	load in
	push	ebp		; U
	mov	ebp,[esp+20]	; V	load len
	push	ebx		; U
	mov	ecx,[esp+28]	; V	load k
	push	edi		; U
	mov	edi,[esp+20]	; V	load out

;; First multiply step has no carry in.
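;; In C terms, each multiply step below computes one word of out[] plus a
;; carry word: p = (BNWORD64)in[i]*k + carry, where the low half of p goes
;; to out[i] (eax) and the high half becomes the next carry (edx).  A
;; minimal C sketch of the whole routine follows; the exact prototype is
;; inferred from the argument loads above, and BNWORD64 (a 64-bit unsigned
;; type) is an assumption made only for this illustration:
;;
;;	void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in,
;;	                 unsigned len, BNWORD32 k)
;;	{
;;		BNWORD32 carry = 0;		/* first step: no carry in */
;;		unsigned i;
;;
;;		for (i = 0; i < len; i++) {
;;			BNWORD64 p = (BNWORD64)in[i] * k + carry;
;;			out[i] = (BNWORD32)p;		/* low half (eax) */
;;			carry = (BNWORD32)(p >> 32);	/* high half (edx) */
;;		}
;;		out[len] = carry;	/* the final "mov [edi+4],edx" */
;;	}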
	mov	eax,[esi]	; U
	lea	ebx,[ebp*4-4]	; V	loop unrolling
	mul	ecx		; NP	first multiply
	mov	[edi],eax	; U
	and	ebx,12		; V	loop unrolling
	add	esi,ebx		; U	loop unrolling
	add	edi,ebx		; V	loop unrolling
	jmp	DWORD PTR m32_jumptable[ebx]	; NP	loop unrolling

	align	4
m32_jumptable:
	dd	m32_case0
	dd	m32_case1
	dd	m32_case2
	dd	m32_case3

	nop
	align	8
	nop
	nop
	nop			; Get loop nicely aligned

m32_case0:
	sub	ebp,4		; U
	jbe	SHORT m32_done	; V

m32_loop:
	mov	eax,[esi+4]	; U
	mov	ebx,edx		; V	Remember carry for later
	add	esi,16		; U
	add	edi,16		; V
	mul	ecx		; NP
	add	eax,ebx		; U	Add carry in from previous word
	adc	edx,0		; U
	mov	[edi-12],eax	; V
m32_case3:
	mov	eax,[esi-8]	; U
	mov	ebx,edx		; V	Remember carry for later
	mul	ecx		; NP
	add	eax,ebx		; U	Add carry in from previous word
	adc	edx,0		; U
	mov	[edi-8],eax	; V
m32_case2:
	mov	eax,[esi-4]	; U
	mov	ebx,edx		; V	Remember carry for later
	mul	ecx		; NP
	add	eax,ebx		; U	Add carry in from previous word
	adc	edx,0		; U
	mov	[edi-4],eax	; V
m32_case1:
	mov	eax,[esi]	; U
	mov	ebx,edx		; V	Remember carry for later
	mul	ecx		; NP
	add	eax,ebx		; U	Add carry in from previous word
	adc	edx,0		; U
	mov	[edi],eax	; V

	sub	ebp,4		; U
	ja	SHORT m32_loop	; V

m32_done:
	mov	[edi+4],edx	; U
	pop	edi		; V
	pop	ebx		; U
	pop	ebp		; V
	pop	esi		; U
	ret			; NP
_lbnMulN1_32	endp

	align	16
_lbnMulAdd1_32	proc	near

	push	esi		; U
	mov	esi,[esp+12]	; V	load in
	push	edi		; U
	mov	edi,[esp+12]	; V	load out
	push	ebp		; U
	mov	ebp,[esp+24]	; V	load len
	push	ebx		; U
	mov	ecx,[esp+32]	; V	load k

;; First multiply step has no carry in.
	mov	eax,[esi]	; U
	mov	ebx,[edi]	; V
	mul	ecx		; NP	first multiply
	add	ebx,eax		; U
	lea	eax,[ebp*4-4]	; V	loop unrolling
	adc	edx,0		; U
	and	eax,12		; V	loop unrolling
	mov	[edi],ebx	; U
	add	esi,eax		; V	loop unrolling
	add	edi,eax		; U	loop unrolling
	jmp	DWORD PTR ma32_jumptable[eax]	; NP	loop unrolling

	align	4
ma32_jumptable:
	dd	ma32_case0
	dd	ma32_case1
	dd	ma32_case2
	dd	ma32_case3

	nop
	align	8
	nop
	nop
	nop			; To align loop properly

ma32_case0:
	sub	ebp,4		; U
	jbe	SHORT ma32_done	; V

ma32_loop:
	mov	eax,[esi+4]	; U
	mov	ebx,edx		; V	Remember carry for later
	add	esi,16		; U
	add	edi,16		; V
	mul	ecx		; NP
	add	eax,ebx		; U	Add carry in from previous word
	mov	ebx,[edi-12]	; V
	adc	edx,0		; U
	add	ebx,eax		; V
	adc	edx,0		; U
	mov	[edi-12],ebx	; V
ma32_case3:
	mov	eax,[esi-8]	; U
	mov	ebx,edx		; V	Remember carry for later
	mul	ecx		; NP
	add	eax,ebx		; U	Add carry in from previous word
	mov	ebx,[edi-8]	; V
	adc	edx,0		; U
	add	ebx,eax		; V
	adc	edx,0		; U
	mov	[edi-8],ebx	; V
ma32_case2:
	mov	eax,[esi-4]	; U
	mov	ebx,edx		; V	Remember carry for later
	mul	ecx		; NP
	add	eax,ebx		; U	Add carry in from previous word
	mov	ebx,[edi-4]	; V
	adc	edx,0		; U
	add	ebx,eax		; V
	adc	edx,0		; U
	mov	[edi-4],ebx	; V
ma32_case1:
	mov	eax,[esi]	; U
	mov	ebx,edx		; V	Remember carry for later
	mul	ecx		; NP
	add	eax,ebx		; U	Add carry in from previous word
	mov	ebx,[edi]	; V
	adc	edx,0		; U
	add	ebx,eax		; V
	adc	edx,0		; U
	mov	[edi],ebx	; V

	sub	ebp,4		; U
	ja	SHORT ma32_loop	; V

ma32_done:
	pop	ebx		; U
	pop	ebp		; V
	mov	eax,edx		; U
	pop	edi		; V
	pop	esi		; U
	ret			; NP
_lbnMulAdd1_32	endp

	align	16
_lbnMulSub1_32	proc	near

	push	esi		; U
	mov	esi,[esp+12]	; V	load in
	push	edi		; U
	mov	edi,[esp+12]	; V	load out
	push	ebp		; U
	mov	ebp,[esp+24]	; V	load len
	push	ebx		; U
	mov	ecx,[esp+32]	; V	load k

;; First multiply step has no carry in.
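;; The recurrence below matches _lbnMulAdd1_32 above except that the low
;; half of the product is subtracted from out[i] instead of added, and the
;; word returned in eax is a borrow rather than a carry.  A minimal C
;; sketch, with the prototype inferred from the argument loads above and
;; with BNWORD64 (a 64-bit unsigned type) assumed only for illustration:
;;
;;	BNWORD32 lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in,
;;	                       unsigned len, BNWORD32 k)
;;	{
;;		BNWORD32 borrow = 0;		/* first step: no carry in */
;;		unsigned i;
;;
;;		for (i = 0; i < len; i++) {
;;			BNWORD64 p = (BNWORD64)in[i] * k + borrow;
;;			borrow = (BNWORD32)(p >> 32);	/* high half (edx) */
;;			if (out[i] < (BNWORD32)p)	/* "sub ebx,eax" ... */
;;				borrow++;		/* ... "adc edx,0" */
;;			out[i] -= (BNWORD32)p;
;;		}
;;		return borrow;		/* final edx, returned in eax */
;;	}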
	mov	eax,[esi]	; U
	mov	ebx,[edi]	; V
	mul	ecx		; NP	first multiply
	sub	ebx,eax		; U
	lea	eax,[ebp*4-4]	; V	loop unrolling
	adc	edx,0		; U
	and	eax,12		; V	loop unrolling
	mov	[edi],ebx	; U
	add	esi,eax		; V	loop unrolling
	add	edi,eax		; U	loop unrolling
	jmp	DWORD PTR ms32_jumptable[eax]	; NP	loop unrolling

	align	4
ms32_jumptable:
	dd	ms32_case0
	dd	ms32_case1
	dd	ms32_case2
	dd	ms32_case3

	nop
	align	8
	nop
	nop
	nop

ms32_case0:
	sub	ebp,4		; U
	jbe	SHORT ms32_done	; V

ms32_loop:
	mov	eax,[esi+4]	; U
	mov	ebx,edx		; V	Remember carry for later
	add	esi,16		; U
	add	edi,16		; V
	mul	ecx		; NP
	add	eax,ebx		; U	Add carry in from previous word
	mov	ebx,[edi-12]	; V
	adc	edx,0		; U
	sub	ebx,eax		; V
	adc	edx,0		; U
	mov	[edi-12],ebx	; V
ms32_case3:
	mov	eax,[esi-8]	; U
	mov	ebx,edx		; V	Remember carry for later
	mul	ecx		; NP
	add	eax,ebx		; U	Add carry in from previous word
	mov	ebx,[edi-8]	; V
	adc	edx,0		; U
	sub	ebx,eax		; V
	adc	edx,0		; U
	mov	[edi-8],ebx	; V
ms32_case2:
	mov	eax,[esi-4]	; U
	mov	ebx,edx		; V	Remember carry for later
	mul	ecx		; NP
	add	eax,ebx		; U	Add carry in from previous word
	mov	ebx,[edi-4]	; V
	adc	edx,0		; U
	sub	ebx,eax		; V
	adc	edx,0		; U
	mov	[edi-4],ebx	; V
ms32_case1:
	mov	eax,[esi]	; U
	mov	ebx,edx		; V	Remember carry for later
	mul	ecx		; NP
	add	eax,ebx		; U	Add carry in from previous word
	mov	ebx,[edi]	; V
	adc	edx,0		; U
	sub	ebx,eax		; V
	adc	edx,0		; U
	mov	[edi],ebx	; V

	sub	ebp,4		; U
	ja	SHORT ms32_loop	; V

ms32_done:
	pop	ebx		; U
	pop	ebp		; V
	mov	eax,edx		; U
	pop	edi		; V
	pop	esi		; U
	ret			; NP
_lbnMulSub1_32	endp


;; Two-word by one-word divide.  Stores quotient, returns remainder.
;; BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
;;                                4            8            12          16
	align	4
_lbnDiv21_32	proc	near

	mov	edx,[esp+8]	; U	Load nh
	mov	eax,[esp+12]	; V	Load nl
	mov	ecx,[esp+4]	; U	Load q
	div	DWORD PTR [esp+16]	; NP
	mov	[ecx],eax	; U	Store quotient
	mov	eax,edx		; V	Return remainder
	ret
_lbnDiv21_32	endp


;; Multi-word by one-word remainder.
;; This speeds up key generation.  It's not worth unrolling and so on;
;; using 32-bit divides is enough of a speedup.
;;
;; The modulus (in ecx) is often 16 bits.  Given that the dividend is 32
;; bits, the chances of saving the first divide because the high word of the
;; dividend is less than the modulus are low enough it's not worth taking
;; the cycles to test for it.
;;
;; unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
;;                                     4             8           12
	align	4
_lbnModQ_32	proc	near

	mov	eax,[esp+4]	; U	Load n
	push	ebp		; V
	mov	ebp,[esp+12]	; U	Load len
	push	esi		; V
	lea	esi,[ebp*4+eax-4]	; U
	mov	ecx,[esp+20]	; V	Load d
	xor	edx,edx		; U	Clear edx for first iteration
modq32_loop:
	mov	eax,[esi]	; U	Load new low word for divide
	sub	esi,4		; V
	div	ecx		; NP	edx = edx:eax % ecx
	dec	ebp		; U
	jnz	SHORT modq32_loop	; V

	pop	esi		; U
	mov	eax,edx		; V	Return remainder in eax
	pop	ebp		; U
	ret			; NP
_lbnModQ_32	endp

	end
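;; Reference notes (placed after "end", so they are never assembled):
;; hedged C sketches of the two divide routines above, using the prototypes
;; from the comments and, for illustration only, an assumed 64-bit BNWORD64.
;;
;;	/* _lbnDiv21_32: divide the two-word value nh:nl by d.  The div
;;	 * instruction requires nh < d; otherwise it faults with a divide
;;	 * overflow, so callers must guarantee that precondition. */
;;	BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
;;	{
;;		BNWORD64 n = ((BNWORD64)nh << 32) | nl;
;;		*q = (BNWORD32)(n / d);
;;		return (BNWORD32)(n % d);
;;	}
;;
;;	/* _lbnModQ_32: remainder of the len-word number n (least significant
;;	 * word first) modulo d, scanning from the top word down.  The
;;	 * assembly's dec/jnz loop assumes len >= 1. */
;;	unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
;;	{
;;		BNWORD32 rem = 0;
;;		while (len--)
;;			rem = (BNWORD32)((((BNWORD64)rem << 32) | n[len]) % d);
;;		return rem;
;;	}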