lbnppc.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
  1. /*
  2. * Copyright (c) 1995 Colin Plumb. All rights reserved.
  3. * For licensing and other legal details, see the file legal.c.
  4. */
  5. #include "lbnppc.h"
  6. /*
  7. * lbnppc.c - Assembly primitives for the bignum library, PowerPC version.
  8. *
  9. * Copyright (c) 1995 Colin Plumb. All rights reserved.
  10. * For licensing and other legal details, see the file legal.c.
  11. *
  12. * Register usage during function calls is:
  13. * r0 - volatile
  14. * r1 - stack pointer, preserved
  15. * r2 - TOC pointer, preserved
  16. * r3 - First argument and return value register
  17. * r4-r10 - More argument registers, volatile
  18. * r11-r12 - Volatile
  19. * r13-r31 - Preserved
  20. * LR, CTR, XER and MQ are all volatile.
  21. * LR holds return address on entry.
  22. *
  23. * On the PPC 601, unrolling the loops more doesn't seem to speed things
  24. * up at all. I'd be curious if other chips differed.
  25. */
  26. #if __MWERKS__ < 0x800
  27. #include "ppcasm.h" /* PowerPC assembler */
  28. /*
  29. * MulN1 expects (*out, *in, len, k), count >= 1
  30. * r3 r4 r5 r6
  31. */
  /*
   * mulN1 - encoded instruction words for the MulN1 primitive: multiply the
   * len-word number at in (r4) by the single word k (r6), store the len
   * low-order result words starting at out (r3), then store the final carry
   * word right after them (len+1 words written in total). No value is
   * returned in r3.
   *
   * r5 arrives holding len; once it has been copied to CTR it is reused as
   * the running carry word. r7 and r8 are scratch.
   *
   * NOTE(review): the PPC_BC displacements (7 and -5) appear to be counted
   * in instructions relative to the branch itself, which matches the
   * positions of Loop and Label below - confirm against ppcasm.h before
   * inserting or removing any entry.
   */
  32. static const unsigned mulN1[] = {
  33. PPC_LWZ(7,4,0), /* r7 = in[0] */
  34. PPC_MULLW(8,7,6), /* r8 = low word of in[0]*k */
  35. PPC_MTCTR(5), /* CTR = len; r5 is free to hold the carry word from here on */
  36. PPC_ADDIC(0,0,0), /* r0 = r0 + 0 cannot carry, so CF is clear for the first ADDE */
  37. PPC_MULHWU(5,7,6), /* r5 = high word of in[0]*k, the first carry word */
  38. PPC_STW(8,3,0), /* out[0] = r8 */
  39. PPC_BC(18,31,7), /* Branch to Label if --ctr == 0 (len was 1) */
  40. /* Loop: one further input word per pass */
  41. PPC_LWZU(7,4,4), /* r7 = *++in */
  42. PPC_MULLW(8,7,6), /* r8 = low word of product */
  43. PPC_ADDE(8,8,5), /* r8 += carry word r5 + CF; CF = carry out of this add */
  44. PPC_STWU(8,3,4), /* *++out = r8 */
  45. PPC_MULHWU(5,7,6), /* r5 is high word of product, the next carry word */
  46. PPC_BC(16,31,-5), /* Branch to Loop if --ctr != 0 */
  47. /* Label: fold the pending carry bit and emit the top word */
  48. PPC_ADDZE(5,5), /* r5 += CF left by the last ADDE (or cleared by ADDIC) */
  49. PPC_STW(5,3,4), /* out[1] = r5, i.e. the word after the last one stored */
  50. PPC_BLR()
  51. };
  52. /*
  53. * MulAdd1 expects (*out, *in, len, k), count >= 1
  54. * r3 r4 r5 r6
  55. */
  /*
   * mulAdd1 - encoded instruction words for the MulAdd1 primitive: add
   * in[0..len-1] * k into out[0..len-1] in place (in = r4, out = r3,
   * k = r6) and leave the carry word out of the top in r3, the return
   * value register.
   *
   * r5 arrives holding len; after MTCTR it becomes the running carry word.
   * r0 holds the current out word, r7 the current in word, r8 the sum.
   * Two carries are tracked per word: the one from adding the previous
   * carry word (folded into r5 by ADDZE) and the one from adding the out
   * word (left in CF by ADDC and consumed by the next ADDE, or by the
   * final ADDZE).
   *
   * NOTE(review): the PPC_BC displacements (10 and -8) appear to be
   * instruction counts relative to the branch, matching Loop and Label
   * below - confirm against ppcasm.h before editing the sequence.
   */
  56. static unsigned const mulAdd1[] = {
  57. PPC_LWZ(7,4,0), /* r7 = in[0] */
  58. PPC_LWZ(0,3,0), /* r0 = out[0] */
  59. PPC_MULLW(8,7,6), /* r8 = low word of in[0]*k */
  60. PPC_MTCTR(5), /* CTR = len; r5 is free to hold the carry word from here on */
  61. PPC_MULHWU(5,7,6), /* r5 = high word of in[0]*k */
  62. PPC_ADDC(8,8,0), /* r8 = r8 + r0, CF = carry out */
  63. PPC_STW(8,3,0), /* out[0] = r8 */
  64. PPC_BC(18,31,10), /* Branch to Label if --ctr == 0 (len was 1) */
  65. /* Loop: one further word per pass */
  66. PPC_LWZU(7,4,4), /* r7 = *++in */
  67. PPC_LWZU(0,3,4), /* r0 = *++out */
  68. PPC_MULLW(8,7,6), /* r8 = low word of product */
  69. PPC_ADDE(8,8,5), /* r8 += carry word r5 + CF left by the previous ADDC */
  70. PPC_MULHWU(5,7,6), /* r5 is high word of product, the next carry word */
  71. PPC_ADDZE(5,5), /* r5 += CF from the ADDE above */
  72. PPC_ADDC(8,8,0), /* r8 = r8 + r0, CF = carry out (used on the next pass or at Label) */
  73. PPC_STW(8,3,0), /* *out = r8 (out was already advanced by the LWZU) */
  74. PPC_BC(16,31,-8), /* Branch to Loop if --ctr != 0 */
  75. /* Label: */
  76. PPC_ADDZE(3,5), /* r3 = r5 + CF: final carry word, returned to the caller */
  77. PPC_BLR()
  78. };
  79. /*
  80. * MulSub1 expects (*out, *in, len, k), count >= 1
  81. * r3 r4 r5 r6
  82. *
  83. * Multiply and subtract is rather a pain. If the subtract of the
  84. * low word of the product from out[i] generates a borrow, we want to
  85. * increment the carry word (initially in the range 0..0xfffffffe).
  86. * However, the PPC's carry bit CF is *clear* after a subtract that borrows, so
  87. * we want to add (1-CF) to the carry word. This is done using two
  88. * instructions:
  89. *
  90. * SUBFME, subtract from minus one extended. This computes
  91. * rD = ~rS + 0xffffffff + CF. Since rS is from 0 to 0xfffffffe,
  92. * ~rS is from 1 through 0xffffffff, and the sum with 0xffffffff+CF is
  93. * from 0 through 0xffffffff, setting the carry flag unconditionally, and
  94. * NOR, which is used as a bitwise invert NOT instruction.
  95. *
  96. * The SUBFME performs the computation rD = ~rS + 0xffffffff + CF,
  97. * = (-rS - 1) + (CF - 1) = -(rS - CF + 1) - 1 = ~(rS + 1-CF),
  98. * which is the bitwise complement of the value we want.
  99. * We want to add the complement of that result to the low word of the
  100. * product, which is just what a subtract would do, if only we could get
  101. * the carry flag clear. But it's always set, except for SUBFE, and the
  102. * operation we just performed unconditionally *sets* the carry flag. Ugh.
  103. * So find the complement in a separate instruction.
  104. */
  /*
   * mulSub1 - encoded instruction words for the MulSub1 primitive: subtract
   * in[0..len-1] * k from out[0..len-1] in place (in = r4, out = r3,
   * k = r6) and leave the borrow word out of the top in r3, the return
   * value register.
   *
   * r5 arrives holding len; after MTCTR it becomes the running carry
   * (borrow) word. Between the SUBFME at the bottom of one word and the
   * NOR that follows it, r5 holds the bitwise COMPLEMENT of the carry
   * word. The NOR is placed at the top of the loop body and again at
   * Label so that a single SUBFME before each branch serves both the
   * loop path and the exit path.
   *
   * NOTE(review): the PPC_BC displacements (12 and -10) appear to be
   * instruction counts relative to the branch, matching Loop and Label
   * below - confirm against ppcasm.h before editing the sequence.
   */
  105. static unsigned const mulSub1[] = {
  106. PPC_LWZ(7,4,0), /* r7 = in[0] */
  107. PPC_LWZ(0,3,0), /* r0 = out[0] */
  108. PPC_MTCTR(5), /* CTR = len; r5 is free to hold the carry word from here on */
  109. PPC_MULLW(8,7,6), /* r8 = low word of in[0]*k */
  110. PPC_MULHWU(5,7,6), /* r5 = high word of in[0]*k */
  111. PPC_SUBFC(8,8,0), /* r8 = r0 - r8; CF is clear if this borrowed */
  112. PPC_STW(8,3,0), /* out[0] = r8 */
  113. PPC_SUBFME(5,5), /* r5 = ~(r5 + 1-CF): complement of the updated carry word */
  114. PPC_BC(18,31,12), /* Branch to Label if --ctr == 0 (len was 1) */
  115. /* Loop: one further word per pass */
  116. PPC_LWZU(7,4,4), /* r7 = *++in */
  117. PPC_LWZU(0,3,4), /* r0 = *++out */
  118. PPC_NOR(5,5,5), /* r5 = ~r5: undo the complement, giving the carry word */
  119. PPC_MULLW(8,7,6), /* r8 = low word of product */
  120. PPC_ADDC(8,8,5), /* r8 += carry word r5, CF = carry out */
  121. PPC_MULHWU(5,7,6), /* r5 is high word of product, the next carry word */
  122. PPC_ADDZE(5,5), /* r5 += CF from the ADDC above */
  123. PPC_SUBFC(8,8,0), /* r8 = r0 - r8; CF is clear if this borrowed */
  124. PPC_STW(8,3,0), /* *out = r8 (out was already advanced by the LWZU) */
  125. PPC_SUBFME(5,5), /* r5 = ~(r5 + 1-CF), completed by the NOR above or at Label */
  126. PPC_BC(16,31,-10), /* Branch to Loop if --ctr != 0 */
  127. /* Label: */
  128. PPC_NOR(3,5,5), /* r3 = ~r5: final borrow word, returned to the caller */
  129. PPC_BLR()
  130. };
  131. #if 0
  132. /*
  133. * Args: (BNWORD32 *n, BNWORD32 const *mod, unsigned mlen, BNWORD32 inv)
  134. * r3 r4 r5 r6
  135. * r7, r8 and r9 are the triple-width accumulator.
  136. * r0 and r10 are temporary registers.
  137. * r11 and r12 are temporary pointers into n and mod, respectively.
  138. * r2 (!) is another temporary register.
  139. */
  /*
   * montReduce - UNFINISHED draft, compiled out by the surrounding #if 0.
   * Presumably meant to become a Montgomery-reduction primitive, judging
   * by the name and the (n, mod, mlen, inv) argument list - not confirmed
   * by anything else in this file.
   *
   * It cannot be enabled as it stands:
   *  - several entries are bare "PPC_" or an unterminated "PPC_LWZU("
   *    placeholder;
   *  - the tail uses symbolic operands (count, x, xx, yy) that are not
   *    defined anywhere in this file;
   *  - r11 and r9 are used in the loop without any visible initialization;
   *  - there is no PPC_BLR at the end.
   */
  140. static unsigned const montReduce[] = {
  141. PPC_MTCTR(5), /* ??? (loads CTR from r5, which holds mlen on entry) */
  142. PPC_LWZ(7,3,0), /* Load low word of n into r7 */
  143. PPC_LWZ(10,4,0), /* Fetch low word of mod */
  144. PPC_MULLW(0,7,6), /* Invert r7 into r0 (r0 = low word of r7 * inv) */
  145. PPC_STW(0,3,0), /* Store back for future use */
  146. PPC_MULHWU(8,10,7), /* Get high word of whatnot */
  147. PPC_MULLW(10,10,7), /* Get low word of it */
  148. PPC_ADDC(7,7,10), /* Add low word of product to r7 */
  149. PPC_ADDZE(8,8), /* Add carry to high word */
  150. PPC_ /* placeholder: instruction never filled in */
  151. PPC_MULHW(8,7,6), /* NOTE(review): MULHW here, MULHWU everywhere else - confirm intent */
  152. PPC_ADDC(7,7,0), /* Add inverse back to r7 */
  153. PPC_ADDZE(8,8),
  154. PPC_ /* placeholder: instruction never filled in */
  155. PPC_LWZU( /* placeholder: operands never filled in */
  156. /* Loop: */
  157. PPC_LWZU(0,11,4),
  158. PPC_LWZU(10,23,-4), /* NOTE(review): base register 23 is in the preserved r13-r31 range; the header comment names r12 as the mod pointer */
  159. PPC_MULLW(2,0,10), /* r2 used as a temporary, as warned in the comment above */
  160. PPC_ADDC(7,7,2),
  161. PPC_MULHWU(0,0,10),
  162. PPC_ADDE(8,8,0),
  163. PPC_ADDZE(9,9),
  164. PPC_BC(16,31,-7), /* Branch to Loop if --ctr != 0 */
  165. PPC_ADDIC_(count,-1), /* "count" is a symbolic placeholder, not a register number */
  166. PPC_LWZU(0,x,4), /* "x" is a symbolic placeholder */
  167. PPC_ADDC(0,7,0),
  168. PPC_STW(0,x,0),
  169. PPC_ADDZE(7,8),
  170. PPC_ADDZE(8,9),
  171. PPC_LI(9,0),
  172. PPC_BC(xx,2,yy), /* "xx" and "yy" are symbolic placeholders */
  173. };
  174. #endif
  175. /*
  176. * Three overlapped transition vectors for three functions.
  177. * A PowerPC transition vector for a (potentially) inter-module
  178. * jump or call consists of two words, an instruction address
  179. * and a Table Of Contents (TOC) pointer, which is loaded into
  180. * r2. Since none of the routines here have global variables,
  181. * they don't need a TOC pointer, so the value is unimportant.
  182. * This array places an uninteresting 32-bit value after each address.
  183. */
  /*
   * lbnPPC_tv - overlapped transition vectors for the three routines.
   * Each pair of adjacent entries {lbnPPC_tv[i], lbnPPC_tv[i+1]} is read as
   * one {code address, TOC value} transition vector, so the entry order
   * and the trailing 0 must not change. The "TOC value" of the first two
   * vectors is simply the address of the next routine; it is never used
   * because the routines reference no globals.
   * NOTE(review): presumably lbnppc.h turns &lbnPPC_tv[i] into callable
   * function pointers - confirm there before reordering or resizing.
   */
  184. unsigned const * const lbnPPC_tv[] = {
  185. mulN1, /* [0]: code address of vector 0 */
  186. mulAdd1, /* [1]: TOC slot of vector 0, code address of vector 1 */
  187. mulSub1, /* [2]: TOC slot of vector 1, code address of vector 2 */
  188. 0 /* [3]: TOC slot of vector 2 (value unimportant) */
  189. };
  190. #else /* __MWERKS >= 0x800 */
  191. /*
  192. * MulN1 expects (*out, *in, len, k), count >= 1
  193. * r3 r4 r5 r6
  194. */
  /*
   * lbnMulN1_32 - multiply the len-word number at in by the single word k.
   * The len low-order result words are stored starting at out, followed
   * by the final carry word, so len+1 words are written. Requires
   * len >= 1. Returns nothing.
   *
   * After mtctr, len no longer holds the count: it is reused as the
   * running carry word. r0, r7 and r8 are scratch (all volatile).
   *
   * The prologue used to issue "mulhwu len,r7,k" twice with identical
   * operands; the second copy recomputed the same value and has been
   * removed, matching the pre-encoded mulN1 table used for older
   * compilers. mulhwu does not touch CF, so the carry chain is unchanged.
   */
  195. asm void
  196. lbnMulN1_32(register unsigned *out, register unsigned const *in,
  197. register unsigned len, register unsigned k)
  198. {
  199. lwz r7,0(in) /* r7 = in[0] */
  200. mtctr len /* CTR = len; len is free to hold the carry word from here on */
  201. mullw r8,r7,k /* r8 = low word of in[0]*k */
  202. addic r0,r0,0 /* r0 + 0 cannot carry, so CF is clear for the first adde */
  203. mulhwu len,r7,k /* len = high word of in[0]*k, the first carry word */
  204. stw r8,0(out) /* out[0] = r8 */
  206. bdz- label /* Branch to label if --ctr == 0 (len was 1) */
  207. loop:
  208. lwzu r7,4(in) /* r7 = *++in */
  209. mullw r8,r7,k /* r8 = low word of product */
  210. adde r8,r8,len /* r8 += carry word len + CF; CF = carry out of this add */
  211. stwu r8,4(out) /* *++out = r8 */
  212. mulhwu len,r7,k /* len is high word of product, the next carry word */
  213. bdnz+ loop /* Branch to loop if --ctr != 0 */
  214. label:
  215. addze len,len /* Fold the pending carry bit into the carry word */
  216. stw len,4(out) /* out[1] = carry word, i.e. the word after the last one stored */
  217. blr
  218. }
  219. /*
  220. * MulAdd1 expects (*out, *in, len, k), count >= 1
  221. * r3 r4 r5 r6
  222. */
  /*
   * lbnMulAdd1_32 - add in[0..len-1] * k into out[0..len-1] in place and
   * return, in r3, the carry word out of the top. Requires len >= 1.
   *
   * After mtctr, len no longer holds the count: it is reused as the
   * running carry word. r0 holds the current out word, r7 the current in
   * word, r8 the sum. Two carries are tracked per word: the one from
   * adding the previous carry word (folded into len by addze) and the one
   * from adding the out word (left in CF by addc and consumed by the next
   * adde, or by the final addze).
   */
  223. asm unsigned
  224. lbnMulAdd1_32(register unsigned *out, register unsigned const *in,
  225. register unsigned len, register unsigned k)
  226. {
  227. lwz r7,0(in) /* r7 = in[0] */
  228. lwz r0,0(out) /* r0 = out[0] */
  229. mullw r8,r7,k /* r8 = low word of in[0]*k */
  230. mtctr len /* CTR = len; len is free to hold the carry word from here on */
  231. mulhwu len,r7,k /* len = high word of in[0]*k */
  232. addc r8,r8,r0 /* r8 = r8 + r0, CF = carry out */
  233. stw r8,0(out) /* out[0] = r8 */
  234. bdz- label /* Branch to label if --ctr == 0 (len was 1) */
  235. loop:
  236. lwzu r7,4(in) /* r7 = *++in */
  237. lwzu r0,4(out) /* r0 = *++out */
  238. mullw r8,r7,k /* r8 = low word of product */
  239. adde r8,r8,len /* r8 += carry word len + CF left by the previous addc */
  240. mulhwu len,r7,k /* len is high word of product, the next carry word */
  241. addze len,len /* len += CF from the adde above */
  242. addc r8,r8,r0 /* r8 = r8 + r0, CF = carry out (used on the next pass or at label) */
  243. stw r8,0(out) /* *out = r8 (out was already advanced by the lwzu) */
  244. bdnz+ loop /* Branch to loop if --ctr != 0 */
  245. label:
  246. addze r3,r5 /* r3 = carry word + CF; r5 is the register listed for len, r3 carries the return value */
  247. blr
  248. }
  249. /*
  250. * MulSub1 expects (*out, *in, len, k), count >= 1
  251. * r3 r4 r5 r6
  252. *
  253. * Multiply and subtract is rather a pain. If the subtract of the
  254. * low word of the product from out[i] generates a borrow, we want to
  255. * increment the carry word (initially in the range 0..0xfffffffe).
  256. * However, the PPC's carry bit CF is *clear* after a subtract that borrows, so
  257. * we want to add (1-CF) to the carry word. This is done using two
  258. * instructions:
  259. *
  260. * SUBFME, subtract from minus one extended. This computes
  261. * rD = ~rS + 0xffffffff + CF. Since rS is from 0 to 0xfffffffe,
  262. * ~rS is from 1 through 0xffffffff, and the sum with 0xffffffff+CF is
  263. * from 0 through 0xffffffff, setting the carry flag unconditionally, and
  264. * NOR, which is used as a bitwise invert NOT instruction.
  265. *
  266. * The SUBFME performs the computation rD = ~rS + 0xffffffff + CF,
  267. * = (-rS - 1) + (CF - 1) = -(rS - CF + 1) - 1 = ~(rS + 1-CF),
  268. * which is the bitwise complement of the value we want.
  269. * We want to add the complement of that result to the low word of the
  270. * product, which is just what a subtract would do, if only we could get
  271. * the carry flag clear. But it's always set, except for SUBFE, and the
  272. * operation we just performed unconditionally *sets* the carry flag. Ugh.
  273. * So find the complement in a separate instruction.
  274. */
  /*
   * lbnMulSub1_32 - subtract in[0..len-1] * k from out[0..len-1] in place
   * and return, in r3, the borrow word out of the top. Requires len >= 1.
   *
   * After mtctr, len no longer holds the count: it is reused as the
   * running carry (borrow) word. Between each subfme and the nor that
   * follows it, len holds the bitwise COMPLEMENT of the carry word. The
   * nor sits at the top of the loop body and again at label so that a
   * single subfme before each branch serves both the loop path and the
   * exit path.
   */
  275. asm unsigned
  276. lbnMulSub1_32(register unsigned *out, register unsigned const *in,
  277. register unsigned len, register unsigned k)
  278. {
  279. lwz r7,0(in) /* r7 = in[0] */
  280. lwz r0,0(out) /* r0 = out[0] */
  281. mtctr len /* CTR = len; len is free to hold the carry word from here on */
  282. mullw r8,r7,k /* r8 = low word of in[0]*k */
  283. mulhwu len,r7,k /* len = high word of in[0]*k */
  284. subfc r8,r8,r0 /* r8 = r0 - r8; CF is clear if this borrowed */
  285. stw r8,0(out) /* out[0] = r8 */
  286. subfme len,len /* len = ~(len + 1-CF): complement of the updated carry word */
  287. bdz- label /* Branch to label if --ctr == 0 (len was 1) */
  288. loop:
  289. lwzu r7,4(in) /* r7 = *++in */
  290. lwzu r0,4(out) /* r0 = *++out */
  291. nor len,len,len /* len = ~len: undo the complement, giving the carry word */
  292. mullw r8,r7,k /* r8 = low word of product */
  293. addc r8,r8,len /* r8 += carry word len, CF = carry out */
  294. mulhwu len,r7,k /* len is high word of product, the next carry word */
  295. addze len,len /* len += CF from the addc above */
  296. subfc r8,r8,r0 /* r8 = r0 - r8; CF is clear if this borrowed */
  297. stw r8,0(out) /* *out = r8 (out was already advanced by the lwzu) */
  298. subfme len,len /* len = ~(len + 1-CF), completed by the nor above or at label */
  299. bdnz+ loop /* Branch to loop if --ctr != 0 */
  300. label:
  301. nor r3,r5,r5 /* r3 = ~r5: final borrow word; r5 is the register listed for len, r3 carries the return value */
  302. blr
  303. }
  304. #endif /* __MWERKS >= 0x800 */
  305. /* 45678901234567890123456789012345678901234567890123456789012345678901234567 */