### Copyright (c) 1995, Colin Plumb.
### For licensing and other legal details, see the file legal.c.
###
### Assembly primitives for bignum library, 80386 family, 32-bit code.
###
### Several primitives are included here.  Only lbnMulAdd1 is *really*
### critical, but once that's written, lbnMulN1 and lbnMulSub1 are quite
### easy to write as well, so they are included here as well.
### lbnDiv21 and lbnModQ are so easy to write that they're included, too.
###
### All functions here are for 32-bit flat mode.  I.e. near code and
### near data, although the near offsets are 32 bits.
### Preserved registers are esp, ebp, esi, edi and ebx.  That last
### is needed by ELF for PIC, and differs from the IBM PC calling
### convention.
###
### The "# U", "# V" and "# NP" annotations on each instruction mark the
### intended Pentium pipe pairing: U pipe, V pipe, or non-pairable.
### Do not reorder instructions without re-checking the pairing and,
### more importantly, the carry-flag flow around mull/adcl.

# Different assemblers have different conventions here
align4=4	# could be 2 or 4
align8=8	# could be 3 or 8
align16=16	# could be 4 or 16

.text

# We declare each symbol with two names, to deal with ELF/a.out variances.
	.globl lbnMulN1_32
	.globl _lbnMulN1_32
	.globl lbnMulAdd1_32
	.globl _lbnMulAdd1_32
	.globl lbnMulSub1_32
	.globl _lbnMulSub1_32
	.globl lbnDiv21_32
	.globl _lbnDiv21_32
	.globl lbnModQ_32
	.globl _lbnModQ_32

## Register usage (for the three multiply primitives):
## %eax - low half of product
## %ebx - carry to next iteration
## %ecx - multiplier (k)
## %edx - high half of product
## %esi - source pointer
## %edi - dest pointer
## %ebp - loop counter
##
## Stack frame (columns show the offsets as successive registers
## are pushed on entry):
## +--------+ %esp+20  %esp+24  %esp+28  %esp+32  %esp+36
## |    k   |
## +--------+ %esp+16  %esp+20  %esp+24  %esp+28  %esp+32
## |   len  |
## +--------+ %esp+12  %esp+16  %esp+20  %esp+24  %esp+28
## |   in   |
## +--------+ %esp+8   %esp+12  %esp+16  %esp+20  %esp+24
## |   out  |
## +--------+ %esp+4   %esp+8   %esp+12  %esp+16  %esp+20
## | return |
## +--------+ %esp     %esp+4   %esp+8   %esp+12  %esp+16
## |  %esi  |
## +--------+          %esp     %esp+4   %esp+8   %esp+12
## |  %ebp  |
## +--------+                   %esp     %esp+4   %esp+8
## |  %ebx  |
## +--------+                            %esp     %esp+4
## |  %edi  |
## +--------+                                     %esp

## void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
##
## out[0..len] = in[0..len-1] * k.  (Result is len+1 words; the final
## carry word is stored at out[len].)  len must be >= 1.
##
## The loop is unrolled 4x, Duff's-device style: the first word is
## multiplied outside the loop (no carry in), then %ebx is set to
## 4*((len-1) mod 4) and used to index a jump table that enters the
## unrolled loop body at the right case, after advancing both pointers
## by the same amount.
	.align align16
lbnMulN1_32:
_lbnMulN1_32:
	pushl %esi		# U
	movl 12(%esp),%esi	# V load in
	pushl %ebp		# U
	movl 20(%esp),%ebp	# V load len
	pushl %ebx		# U
	movl 28(%esp),%ecx	# V load k
	pushl %edi		# U
	movl 20(%esp),%edi	# V load out

## First multiply step has no carry in.
	movl (%esi),%eax	# V
	leal -4(,%ebp,4),%ebx	# U loop unrolling: %ebx = (len-1)*4
	mull %ecx		# NP first multiply
	movl %eax,(%edi)	# U
	andl $12,%ebx		# V loop unrolling: 4*((len-1) mod 4)
	addl %ebx,%esi		# U loop unrolling
	addl %ebx,%edi		# V loop unrolling
	jmp *m32_jumptable(%ebx)	# NP loop unrolling: dispatch into loop

	.align align4
m32_jumptable:
	.long m32_case0
	.long m32_case1
	.long m32_case2
	.long m32_case3

	nop
	.align align8
	nop
	nop
	nop			# Get loop nicely aligned

m32_case0:
	subl $4,%ebp		# U
	jbe m32_done		# V

m32_loop:
	movl 4(%esi),%eax	# U
	movl %edx,%ebx		# V Remember carry for later
	addl $16,%esi		# U
	addl $16,%edi		# V
	mull %ecx		# NP
	addl %ebx,%eax		# U Add carry in from previous word
	adcl $0,%edx		# U
	movl %eax,-12(%edi)	# V
m32_case3:
	movl -8(%esi),%eax	# U
	movl %edx,%ebx		# V Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U Add carry in from previous word
	adcl $0,%edx		# U
	movl %eax,-8(%edi)	# V
m32_case2:
	movl -4(%esi),%eax	# U
	movl %edx,%ebx		# V Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U Add carry in from previous word
	adcl $0,%edx		# U
	movl %eax,-4(%edi)	# V
m32_case1:
	movl (%esi),%eax	# U
	movl %edx,%ebx		# V Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U Add carry in from previous word
	adcl $0,%edx		# U
	movl %eax,(%edi)	# V

	subl $4,%ebp		# U
	ja m32_loop		# V

m32_done:
	movl %edx,4(%edi)	# U Store final high-word carry at out[len]
	popl %edi		# V
	popl %ebx		# U
	popl %ebp		# V
	popl %esi		# U
	ret			# NP

## BNWORD32 lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len,
##                        BNWORD32 k)
##
## out[0..len-1] += in[0..len-1] * k; returns the final carry word.
## Same 4x unrolling and jump-table entry scheme as lbnMulN1_32, with an
## extra add of the existing out[] word (hence two adcl $0,%edx steps:
## one for the carry chain, one for the accumulate).
	.align align16
lbnMulAdd1_32:
_lbnMulAdd1_32:
	pushl %esi		# U
	movl 12(%esp),%esi	# V load in
	pushl %edi		# U
	movl 12(%esp),%edi	# V load out
	pushl %ebp		# U
	movl 24(%esp),%ebp	# V load len
	pushl %ebx		# U
	movl 32(%esp),%ecx	# V load k

## First multiply step has no carry in.
	movl (%esi),%eax	# V
	movl (%edi),%ebx	# U
	mull %ecx		# NP first multiply
	addl %eax,%ebx		# U
	leal -4(,%ebp,4),%eax	# V loop unrolling: %eax = (len-1)*4
	adcl $0,%edx		# U
	andl $12,%eax		# V loop unrolling: 4*((len-1) mod 4)
	movl %ebx,(%edi)	# U
	addl %eax,%esi		# V loop unrolling
	addl %eax,%edi		# U loop unrolling
	jmp *ma32_jumptable(%eax)	# NP loop unrolling: dispatch into loop

	.align align4
ma32_jumptable:
	.long ma32_case0
	.long ma32_case1
	.long ma32_case2
	.long ma32_case3

	.align align8
	nop
	nop
	nop			# To align loop properly

ma32_case0:
	subl $4,%ebp		# U
	jbe ma32_done		# V

ma32_loop:
	movl 4(%esi),%eax	# U
	movl %edx,%ebx		# V Remember carry for later
	addl $16,%esi		# U
	addl $16,%edi		# V
	mull %ecx		# NP
	addl %ebx,%eax		# U Add carry in from previous word
	movl -12(%edi),%ebx	# V
	adcl $0,%edx		# U
	addl %eax,%ebx		# V Accumulate into existing out[] word
	adcl $0,%edx		# U
	movl %ebx,-12(%edi)	# V
ma32_case3:
	movl -8(%esi),%eax	# U
	movl %edx,%ebx		# V Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U Add carry in from previous word
	movl -8(%edi),%ebx	# V
	adcl $0,%edx		# U
	addl %eax,%ebx		# V Accumulate into existing out[] word
	adcl $0,%edx		# U
	movl %ebx,-8(%edi)	# V
ma32_case2:
	movl -4(%esi),%eax	# U
	movl %edx,%ebx		# V Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U Add carry in from previous word
	movl -4(%edi),%ebx	# V
	adcl $0,%edx		# U
	addl %eax,%ebx		# V Accumulate into existing out[] word
	adcl $0,%edx		# U
	movl %ebx,-4(%edi)	# V
ma32_case1:
	movl (%esi),%eax	# U
	movl %edx,%ebx		# V Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U Add carry in from previous word
	movl (%edi),%ebx	# V
	adcl $0,%edx		# U
	addl %eax,%ebx		# V Accumulate into existing out[] word
	adcl $0,%edx		# U
	movl %ebx,(%edi)	# V

	subl $4,%ebp		# U
	ja ma32_loop		# V

ma32_done:
	popl %ebx		# U
	popl %ebp		# V
	movl %edx,%eax		# U Return final carry
	popl %edi		# V
	popl %esi		# U
	ret			# NP

## BNWORD32 lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len,
##                        BNWORD32 k)
##
## out[0..len-1] -= in[0..len-1] * k; returns the final borrow word.
## Identical structure to lbnMulAdd1_32, with subl in place of the
## accumulate addl (the adcl $0,%edx after the subl folds the borrow
## into the carry word for the next iteration).
	.align align16
lbnMulSub1_32:
_lbnMulSub1_32:
	pushl %esi		# U
	movl 12(%esp),%esi	# V load in
	pushl %edi		# U
	movl 12(%esp),%edi	# V load out
	pushl %ebp		# U
	movl 24(%esp),%ebp	# V load len
	pushl %ebx		# U
	movl 32(%esp),%ecx	# V load k

/* First multiply step has no carry in. */
	movl (%esi),%eax	# V
	movl (%edi),%ebx	# U
	mull %ecx		# NP first multiply
	subl %eax,%ebx		# U
	leal -4(,%ebp,4),%eax	# V loop unrolling: %eax = (len-1)*4
	adcl $0,%edx		# U
	andl $12,%eax		# V loop unrolling: 4*((len-1) mod 4)
	movl %ebx,(%edi)	# U
	addl %eax,%esi		# V loop unrolling
	addl %eax,%edi		# U loop unrolling
	jmp *ms32_jumptable(%eax)	# NP loop unrolling: dispatch into loop

	.align align4
ms32_jumptable:
	.long ms32_case0
	.long ms32_case1
	.long ms32_case2
	.long ms32_case3

	.align align8
	nop
	nop
	nop

ms32_case0:
	subl $4,%ebp		# U
	jbe ms32_done		# V

ms32_loop:
	movl 4(%esi),%eax	# U
	movl %edx,%ebx		# V Remember carry for later
	addl $16,%esi		# U
	addl $16,%edi		# V
	mull %ecx		# NP
	addl %ebx,%eax		# U Add carry in from previous word
	movl -12(%edi),%ebx	# V
	adcl $0,%edx		# U
	subl %eax,%ebx		# V Subtract from existing out[] word
	adcl $0,%edx		# U
	movl %ebx,-12(%edi)	# V
ms32_case3:
	movl -8(%esi),%eax	# U
	movl %edx,%ebx		# V Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U Add carry in from previous word
	movl -8(%edi),%ebx	# V
	adcl $0,%edx		# U
	subl %eax,%ebx		# V Subtract from existing out[] word
	adcl $0,%edx		# U
	movl %ebx,-8(%edi)	# V
ms32_case2:
	movl -4(%esi),%eax	# U
	movl %edx,%ebx		# V Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U Add carry in from previous word
	movl -4(%edi),%ebx	# V
	adcl $0,%edx		# U
	subl %eax,%ebx		# V Subtract from existing out[] word
	adcl $0,%edx		# U
	movl %ebx,-4(%edi)	# V
ms32_case1:
	movl (%esi),%eax	# U
	movl %edx,%ebx		# V Remember carry for later
	mull %ecx		# NP
	addl %ebx,%eax		# U Add carry in from previous word
	movl (%edi),%ebx	# V
	adcl $0,%edx		# U
	subl %eax,%ebx		# V Subtract from existing out[] word
	adcl $0,%edx		# U
	movl %ebx,(%edi)	# V

	subl $4,%ebp		# U
	ja ms32_loop		# V

ms32_done:
	popl %ebx		# U
	popl %ebp		# V
	movl %edx,%eax		# U Return final borrow
	popl %edi		# V
	popl %esi		# U
	ret			# NP

## Two-word by one-word divide.  Stores quotient, returns remainder.
## BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
##                                4          8           12          16
##
## NOTE(review): as with the divl instruction itself, the caller must
## ensure nh < d, or the divide faults on quotient overflow.
	.align align16
lbnDiv21_32:
_lbnDiv21_32:
	movl 8(%esp),%edx	# U Load nh
	movl 12(%esp),%eax	# V Load nl
	movl 4(%esp),%ecx	# U Load q
	divl 16(%esp)		# NP %edx:%eax / d -> quotient %eax, rem %edx
	movl %eax,(%ecx)	# U Store quotient
	movl %edx,%eax		# V Return remainder
	ret

## Multi-word by one-word remainder.
## This speeds up key generation.  It's not worth unrolling and so on;
## using 32-bit divides is enough of a speedup.
##
## The modulus (in %ecx) is often 16 bits.  Given that the dividend is 32
## bits, the chances of saving the first divide because the high word of the
## dividend is less than the modulus are low enough it's not worth taking
## the cycles to test for it.
##
## unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
##                              4                8            12
##
## Walks n[] from the most significant word (n[len-1]) down to n[0],
## carrying the running remainder in %edx through each divl.
	.align align16
lbnModQ_32:
_lbnModQ_32:
	movl 4(%esp),%eax	# U Load n
	pushl %ebp		# V
	movl 12(%esp),%ebp	# U Load len
	pushl %esi		# V
	leal -4(%eax,%ebp,4),%esi	# U %esi = &n[len-1], walk downward
	movl 20(%esp),%ecx	# V Load d
	xorl %edx,%edx		# U Clear MSW for first divide

modq32_loop:
	movl (%esi),%eax	# U
	subl $4,%esi		# V
	divl %ecx		# NP remainder accumulates in %edx
	decl %ebp		# U
	jnz modq32_loop		# V

	popl %esi		# U
	movl %edx,%eax		# V Return remainder
	popl %ebp		# U
	ret			# NP