123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414 |
- ;;; Copyright (c) 1995, Colin Plumb.
- ;;; For licensing and other legal details, see the file legal.c.
- ;;;
- ;;; Assembly primitives for bignum library, 80386 family, 32-bit code.
- ;;;
- ;;; Several primitives are included here. Only lbnMulAdd1 is *really*
- ;;; critical, but once that's written, lbnMulN1 and lbnMulSub1 are quite
- ;;; easy to write as well, so they are included here as well.
- ;;; lbnDiv21 and lbnModQ are so easy to write that they're included, too.
- ;;;
- ;;; All functions here are for 32-bit flat mode. I.e. near code and
- ;;; near data, although the near offsets are 32 bits.
- ;;;
- ;;; The usual 80x86 calling conventions have AX, BX, CX and DX
- ;;; volatile, and SI, DI, SP and BP preserved across calls.
- ;;; This includes the "E"xtended forms of all of those registers.
- ;;;
- ;;; However, just to be confusing, recent 32-bit DOS compilers have
- ;;; quietly changed that to require EBX preserved across calls, too.
- ;;; Joy.
- .386 ; assemble for the 80386 instruction set (32-bit registers are used throughout)
- ;_TEXT segment para public use32 'CODE' ; 16-byte aligned because 486 cares
- ;_TEXT ends
- ifdef @Version ; FLAT is defined by hand only when @Version is undefined or <= 510
- if @Version le 510 ; NOTE(review): presumably later assembler versions already provide FLAT -- confirm
- FLAT group _TEXT
- endif
- else
- FLAT group _TEXT
- endif
- assume cs:FLAT, ds:FLAT, ss:FLAT ; code, data and stack are all addressed through the FLAT group
- _TEXT segment para public use32 'CODE' ; 16-byte aligned because 486 cares
- public _lbnMulN1_32 ; entry points exported from this file (names carry a leading underscore)
- public _lbnMulAdd1_32
- public _lbnMulSub1_32
- public _lbnDiv21_32
- public _lbnModQ_32
- ;; lbnMulN1_32(out, in, len, k): out[0..len] = in[0..len-1] * k. Register usage:
- ;; eax - low half of product
- ;; ebx - carry to next iteration (before the loop: unroll dispatch offset)
- ;; ecx - multiplier (k)
- ;; edx - high half of product
- ;; esi - source pointer
- ;; edi - dest pointer
- ;; ebp - loop counter
- ;;
- ;; Stack frame (columns: at entry, then after each push; a label is the address of the slot listed above it):
- ;; +--------+ esp+20 esp+24 esp+28 esp+32 esp+36
- ;; | k |
- ;; +--------+ esp+16 esp+20 esp+24 esp+28 esp+32
- ;; | len |
- ;; +--------+ esp+12 esp+16 esp+20 esp+24 esp+28
- ;; | in |
- ;; +--------+ esp+8 esp+12 esp+16 esp+20 esp+24
- ;; | out |
- ;; +--------+ esp+4 esp+8 esp+12 esp+16 esp+20
- ;; | return |
- ;; +--------+ esp esp+4 esp+8 esp+12 esp+16
- ;; | esi |
- ;; +--------+ esp esp+4 esp+8 esp+12
- ;; | ebp |
- ;; +--------+ esp esp+4 esp+8
- ;; | ebx |
- ;; +--------+ esp esp+4
- ;; | edi |
- ;; +--------+ esp
- align 16
- _lbnMulN1_32 proc near ; U/V/NP tags are the author's pairing notes (presumably Pentium U pipe, V pipe, not pairable)
- push esi ; U
- mov esi,[esp+12] ; V load in
- push ebp ; U
- mov ebp,[esp+20] ; V load len
- push ebx ; U
- mov ecx,[esp+28] ; V load k
- push edi ; U
- mov edi,[esp+20] ; V load out
- ;; First multiply step has no carry in. in[0] is read unconditionally, so len is presumably >= 1 (confirm with callers).
- mov eax,[esi] ; U eax = in[0]
- lea ebx,[ebp*4-4] ; V loop unrolling: ebx = (len-1)*4
- mul ecx ; NP first multiply: edx:eax = in[0]*k
- mov [edi],eax ; U out[0] = low half; edx carries into the next word
- and ebx,12 ; V loop unrolling: ebx = ((len-1) mod 4)*4, byte offset into the jump table
- add esi,ebx ; U loop unrolling: bias pointers so every entry case starts at word 1
- add edi,ebx ; V loop unrolling
- jmp DWORD PTR m32_jumptable[ebx] ; NP loop unrolling: enter the 4x unrolled loop at the matching step
- align 4
- m32_jumptable: ; indexed by ((len-1) mod 4)*4
- dd m32_case0
- dd m32_case1
- dd m32_case2
- dd m32_case3
- nop
- align 8
- nop
- nop
- nop ; Get loop nicely aligned
- m32_case0: ; (len-1) mod 4 == 0: whole groups of four words follow, or none at all
- sub ebp,4 ; U
- jbe SHORT m32_done ; V taken when len <= 4, which in this case means len == 1
- m32_loop:
- mov eax,[esi+4] ; U first word of this group of four
- mov ebx,edx ; V Remember carry for later
- add esi,16 ; U advance both pointers by four words
- add edi,16 ; V
- mul ecx ; NP
- add eax,ebx ; U Add carry in from previous word
- adc edx,0 ; U propagate the carry of that add into the high half
- mov [edi-12],eax ; V
- m32_case3: ; entry when three words remain before the next group boundary
- mov eax,[esi-8] ; U
- mov ebx,edx ; V Remember carry for later
- mul ecx ; NP
- add eax,ebx ; U Add carry in from previous word
- adc edx,0 ; U
- mov [edi-8],eax ; V
- m32_case2: ; entry when two words remain before the next group boundary
- mov eax,[esi-4] ; U
- mov ebx,edx ; V Remember carry for later
- mul ecx ; NP
- add eax,ebx ; U Add carry in from previous word
- adc edx,0 ; U
- mov [edi-4],eax ; V
- m32_case1: ; entry when one word remains before the next group boundary
- mov eax,[esi] ; U
- mov ebx,edx ; V Remember carry for later
- mul ecx ; NP
- add eax,ebx ; U Add carry in from previous word
- adc edx,0 ; U
- mov [edi],eax ; V
- sub ebp,4 ; U count down by one group of four
- ja SHORT m32_loop ; V repeat while the count stayed above zero (unsigned, no borrow)
- m32_done:
- mov [edi+4],edx ; U store the final carry one word past the last product word, i.e. out[len]
- pop edi ; V restore registers in reverse push order
- pop ebx ; U
- pop ebp ; V
- pop esi ; U
- ret ; NP
- _lbnMulN1_32 endp
- align 16
- _lbnMulAdd1_32 proc near ; (out, in, len, k): out[0..len-1] += in[0..len-1]*k; final carry word is returned in eax, not stored
- push esi ; U
- mov esi,[esp+12] ; V load in
- push edi ; U
- mov edi,[esp+12] ; V load out
- push ebp ; U
- mov ebp,[esp+24] ; V load len
- push ebx ; U
- mov ecx,[esp+32] ; V load k
- ;; First multiply step has no carry in. Word 0 is processed unconditionally, so len is presumably >= 1 (confirm with callers).
- mov eax,[esi] ; U eax = in[0]
- mov ebx,[edi] ; V ebx = out[0]
- mul ecx ; NP first multiply: edx:eax = in[0]*k
- add ebx,eax ; U out[0] + low half
- lea eax,[ebp*4-4] ; V loop unrolling: eax = (len-1)*4 (lea leaves the carry flag intact)
- adc edx,0 ; U fold the carry of the add above into the high half
- and eax,12 ; V loop unrolling: eax = ((len-1) mod 4)*4, byte offset into the jump table
- mov [edi],ebx ; U store updated out[0]
- add esi,eax ; V loop unrolling: bias pointers so every entry case starts at word 1
- add edi,eax ; U loop unrolling
- jmp DWORD PTR ma32_jumptable[eax] ; NP loop unrolling: enter the 4x unrolled loop at the matching step
- align 4
- ma32_jumptable: ; indexed by ((len-1) mod 4)*4
- dd ma32_case0
- dd ma32_case1
- dd ma32_case2
- dd ma32_case3
- nop
- align 8
- nop
- nop
- nop ; To align loop properly
- ma32_case0: ; (len-1) mod 4 == 0: whole groups of four words follow, or none at all
- sub ebp,4 ; U
- jbe SHORT ma32_done ; V taken when len <= 4, which in this case means len == 1
- ma32_loop:
- mov eax,[esi+4] ; U first word of this group of four
- mov ebx,edx ; V Remember carry for later
- add esi,16 ; U advance both pointers by four words
- add edi,16 ; V
- mul ecx ; NP
- add eax,ebx ; U Add carry in from previous word
- mov ebx,[edi-12] ; V load the out word (mov does not disturb the carry flag)
- adc edx,0 ; U carry out of the carry-in add
- add ebx,eax ; V accumulate the low half into the out word
- adc edx,0 ; U carry out of the accumulate
- mov [edi-12],ebx ; V
- ma32_case3: ; entry when three words remain before the next group boundary
- mov eax,[esi-8] ; U
- mov ebx,edx ; V Remember carry for later
- mul ecx ; NP
- add eax,ebx ; U Add carry in from previous word
- mov ebx,[edi-8] ; V
- adc edx,0 ; U
- add ebx,eax ; V
- adc edx,0 ; U
- mov [edi-8],ebx ; V
- ma32_case2: ; entry when two words remain before the next group boundary
- mov eax,[esi-4] ; U
- mov ebx,edx ; V Remember carry for later
- mul ecx ; NP
- add eax,ebx ; U Add carry in from previous word
- mov ebx,[edi-4] ; V
- adc edx,0 ; U
- add ebx,eax ; V
- adc edx,0 ; U
- mov [edi-4],ebx ; V
- ma32_case1: ; entry when one word remains before the next group boundary
- mov eax,[esi] ; U
- mov ebx,edx ; V Remember carry for later
- mul ecx ; NP
- add eax,ebx ; U Add carry in from previous word
- mov ebx,[edi] ; V
- adc edx,0 ; U
- add ebx,eax ; V
- adc edx,0 ; U
- mov [edi],ebx ; V
- sub ebp,4 ; U count down by one group of four
- ja SHORT ma32_loop ; V repeat while the count stayed above zero (unsigned, no borrow)
- ma32_done:
- pop ebx ; U restore registers in reverse push order
- pop ebp ; V
- mov eax,edx ; U return the final carry word
- pop edi ; V
- pop esi ; U
- ret ; NP
- _lbnMulAdd1_32 endp
- ;; lbnMulSub1_32(out, in, len, k): out[0..len-1] -= in[0..len-1] * k.
- ;; The final carry/borrow word is returned in eax; it is not stored to out.
- ;; Word 0 is processed unconditionally, so len is presumably required to
- ;; be at least 1 (confirm with callers).
- ;;
- ;; Register usage:
- ;; eax - low half of product (before the loop: unroll dispatch offset)
- ;; ebx - carry in from the previous word, then the out word being updated
- ;; ecx - multiplier (k)
- ;; edx - high half of product, accumulating carries and borrows
- ;; esi - source pointer
- ;; edi - dest pointer
- ;; ebp - loop counter
- ;;
- ;; Stack after the four pushes below:
- ;; esp+0 ebx, esp+4 ebp, esp+8 edi, esp+12 esi, esp+16 return,
- ;; esp+20 out, esp+24 in, esp+28 len, esp+32 k
- ;;
- ;; The prologue must push exactly the four registers that the epilogue
- ;; pops (esi, edi, ebp, ebx); any extra push leaves the stack unbalanced
- ;; and makes ret jump to a saved register value.
- align 16
- _lbnMulSub1_32 proc near
- push esi ; U
- mov esi,[esp+12] ; V load in
- push edi ; U
- mov edi,[esp+12] ; V load out
- push ebp ; U
- mov ebp,[esp+24] ; V load len
- push ebx ; U
- mov ecx,[esp+32] ; V load k
- ;; First multiply step has no carry in.
- mov eax,[esi] ; U eax = in[0]
- mov ebx,[edi] ; V ebx = out[0]
- mul ecx ; NP first multiply: edx:eax = in[0]*k
- sub ebx,eax ; U out[0] - low half
- lea eax,[ebp*4-4] ; V loop unrolling: eax = (len-1)*4 (lea leaves the carry flag intact)
- adc edx,0 ; U fold the borrow of the subtract above into the high half
- and eax,12 ; V loop unrolling: eax = ((len-1) mod 4)*4, byte offset into the jump table
- mov [edi],ebx ; U store updated out[0]
- add esi,eax ; V loop unrolling: bias pointers so every entry case starts at word 1
- add edi,eax ; U loop unrolling
- jmp DWORD PTR ms32_jumptable[eax] ; NP loop unrolling: enter the 4x unrolled loop at the matching step
- align 4
- ms32_jumptable:
- dd ms32_case0
- dd ms32_case1
- dd ms32_case2
- dd ms32_case3
- nop
- align 8
- nop
- nop
- nop ; To align loop properly
- ms32_case0:
- sub ebp,4 ; U
- jbe SHORT ms32_done ; V taken when len <= 4, which in this case means len == 1
- ms32_loop:
- mov eax,[esi+4] ; U first word of this group of four
- mov ebx,edx ; V Remember carry for later
- add esi,16 ; U advance both pointers by four words
- add edi,16 ; V
- mul ecx ; NP
- add eax,ebx ; U Add carry in from previous word
- mov ebx,[edi-12] ; V load the out word (mov does not disturb the carry flag)
- adc edx,0 ; U carry out of the carry-in add
- sub ebx,eax ; V subtract the low half from the out word
- adc edx,0 ; U borrow out of the subtract
- mov [edi-12],ebx ; V
- ms32_case3:
- mov eax,[esi-8] ; U
- mov ebx,edx ; V Remember carry for later
- mul ecx ; NP
- add eax,ebx ; U Add carry in from previous word
- mov ebx,[edi-8] ; V
- adc edx,0 ; U
- sub ebx,eax ; V
- adc edx,0 ; U
- mov [edi-8],ebx ; V
- ms32_case2:
- mov eax,[esi-4] ; U
- mov ebx,edx ; V Remember carry for later
- mul ecx ; NP
- add eax,ebx ; U Add carry in from previous word
- mov ebx,[edi-4] ; V
- adc edx,0 ; U
- sub ebx,eax ; V
- adc edx,0 ; U
- mov [edi-4],ebx ; V
- ms32_case1:
- mov eax,[esi] ; U
- mov ebx,edx ; V Remember carry for later
- mul ecx ; NP
- add eax,ebx ; U Add carry in from previous word
- mov ebx,[edi] ; V
- adc edx,0 ; U
- sub ebx,eax ; V
- adc edx,0 ; U
- mov [edi],ebx ; V
- sub ebp,4 ; U count down by one group of four
- ja SHORT ms32_loop ; V repeat while the count stayed above zero (unsigned, no borrow)
- ms32_done:
- pop ebx ; U restore registers in reverse push order
- pop ebp ; V
- mov eax,edx ; U return the final carry/borrow word
- pop edi ; V
- pop esi ; U
- ret ; NP
- _lbnMulSub1_32 endp
- ;; Two-word by one-word divide. Stores quotient through q, returns remainder in eax.
- ;; BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
- ;; Argument offsets from esp at entry: q = 4, nh = 8, nl = 12, d = 16
- align 4
- _lbnDiv21_32 proc near ; no registers are pushed, so the entry offsets above are used directly
- mov edx,[esp+8] ; U Load nh
- mov eax,[esp+12] ; V Load nl
- mov ecx,[esp+4] ; U Load q
- div DWORD PTR [esp+16] ; NP edx:eax / d; NOTE(review): DIV faults if the quotient overflows 32 bits, so callers presumably ensure nh < d -- confirm
- mov [ecx],eax ; U Store quotient
- mov eax,edx ; V Return remainder
- ret
- _lbnDiv21_32 endp
- ;; Multi-word by one-word remainder.
- ;; This speeds up key generation. It's not worth unrolling and so on;
- ;; using 32-bit divides is enough of a speedup.
- ;;
- ;; The modulus (in ecx) is often 16 bits. Given that the dividend words are
- ;; 32 bits, the chances of saving the first divide because the high word of the
- ;; dividend is less than the modulus are low enough it's not worth taking
- ;; the cycles to test for it.
- ;;
- ;; unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
- ;; Argument offsets from esp at entry: n = 4, len = 8, d = 12
- align 4
- _lbnModQ_32 proc near ; returns n[0..len-1] mod d in eax; ebp counts words, ecx holds d
- mov eax,[esp+4] ; U Load n
- push ebp ; V
- mov ebp,[esp+12] ; U Load len
- push esi ; V
- lea esi,[ebp*4+eax-4] ; U esi = &n[len-1], the word that is divided first (treated as most significant)
- mov ecx,[esp+20] ; V Load d
- xor edx,edx ; U Clear edx for first iteration
- modq32_loop:
- mov eax,[esi] ; U Load new low word for divide
- sub esi,4 ; V step down to the next lower word
- div ecx ; NP edx = edx:eax % ecx
- dec ebp ; U one word consumed; len == 0 is not handled (the counter would wrap)
- jnz SHORT modq32_loop ; V
- pop esi ; U
- mov eax,edx ; V Return remainder in eax
- pop ebp ; U
- ret ; NP
- _lbnModQ_32 endp
- _TEXT ends ; close the code segment opened by "_TEXT segment" before ending the module
- end
|