lbn16.c 115 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070
  1. /*
  2. * lbn16.c - Low-level bignum routines, 16-bit version.
  3. *
  4. * Copyright (c) 1995 Colin Plumb. All rights reserved.
  5. *
  6. * NOTE: the magic constants "16" and "32" appear in many places in this
  7. * file, including inside identifiers. Because it is not possible to
  8. * ask "#ifdef" of a macro expansion, it is not possible to use the
  9. * preprocessor to conditionalize these properly. Thus, this file is
  10. * intended to be edited with textual search and replace to produce
  11. * alternate word size versions. Any reference to the number of bits
  12. * in a word must be the string "16", and that string must not appear
  13. * otherwise. Any reference to twice this number must appear as "32",
  14. * which likewise must not appear otherwise. Is that clear?
  15. *
  16. * Remember, when doubling the bit size replace the larger number (32)
  17. * first, then the smaller (16). When halving the bit size, do the
  18. * opposite. Otherwise, things will get weird. Also, be sure to replace
  19. * every instance that appears. (:%s/foo/bar/g in vi)
  20. *
  21. * These routines work with a pointer to the least-significant end of
  22. * an array of WORD16s. The BIG(x), LITTLE(y) and BIGLITTLE(x,y) macros
  23. * defined in lbn.h (which expand to x on a big-endian machine and y on a
  24. * little-endian machine) are used to conditionalize the code to work
  25. * either way. If you have no assembly primitives, it doesn't matter.
  26. * Note that on a big-endian machine, the least-significant-end pointer
  27. * is ONE PAST THE END. The bytes are ptr[-1] through ptr[-len].
  28. * On little-endian, they are ptr[0] through ptr[len-1]. This makes
  29. * perfect sense if you consider pointers to point *between* bytes rather
  30. * than at them.
  31. *
  32. * Because the array index values are unsigned integers, ptr[-i]
  33. * may not work properly, since the index -i is evaluated as an unsigned,
  34. * and if pointers are wider, zero-extension will produce a positive
  35. * number rather than the needed negative. The expression used in this
  36. * code, *(ptr-i) will, however, work. (The array syntax is equivalent
  37. * to *(ptr+-i), which is a pretty subtle difference.)
  38. *
  39. * Many of these routines will get very unhappy if fed zero-length inputs.
  40. * They use assert() to enforce this. A higher layer of code must make
  41. * sure that these aren't called with zero-length inputs.
  42. *
  43. * Any of these routines can be replaced with more efficient versions
  44. * elsewhere, by just #defining their names. If one of the names
  45. * is #defined, the C code is not compiled in and no declaration is
  46. * made. Use the BNINCLUDE file to do that. Typically, you compile
  47. * asm subroutines with the same name and just, e.g.
  48. * #define lbnMulAdd1_16 lbnMulAdd1_16
  49. *
  50. * If you want to write asm routines, start with lbnMulAdd1_16().
  51. * This is the workhorse of modular exponentiation. lbnMulN1_16() is
  52. * also used a fair bit, although not as much and it's defined in terms
  53. * of lbnMulAdd1_16 if that has a custom version. lbnMulSub1_16 and
  54. * lbnDiv21_16 are used in the usual division and remainder finding.
  55. * (Not the Montgomery reduction used in modular exponentiation, though.)
  56. * Once you have lbnMulAdd1_16 defined, writing the other two should
  57. * be pretty easy. (Just make sure you get the sign of the subtraction
  58. * in lbnMulSub1_16 right - it's dest = dest - source * k.)
  59. *
  60. * The only definitions that absolutely need a double-word (BNWORD32)
  61. * type are lbnMulAdd1_16 and lbnMulSub1_16; if those are provided,
  62. * the rest follows. lbnDiv21_16, however, is a lot slower unless you
  63. * have them, and lbnModQ_16 takes after it. That one is used quite a
  64. * bit for prime sieving.
  65. */
  66. #ifndef HAVE_CONFIG_H
  67. #define HAVE_CONFIG_H 0
  68. #endif
  69. #if HAVE_CONFIG_H
  70. #include "bnconfig.h"
  71. #endif
  72. /*
  73. * Some compilers complain about #if FOO if FOO isn't defined,
  74. * so do the ANSI-mandated thing explicitly...
  75. */
  76. #ifndef NO_ASSERT_H
  77. #define NO_ASSERT_H 0
  78. #endif
  79. #ifndef NO_STRING_H
  80. #define NO_STRING_H 0
  81. #endif
  82. #ifndef HAVE_STRINGS_H
  83. #define HAVE_STRINGS_H 0
  84. #endif
  85. #if !NO_ASSERT_H
  86. #include <assert.h>
  87. #else
  88. #define assert(x) (void)0
  89. #endif
  90. #if !NO_STRING_H
  91. #include <string.h> /* For memcpy */
  92. #elif HAVE_STRINGS_H
  93. #include <strings.h>
  94. #endif
  95. #include "lbn.h"
  96. #include "lbn16.h"
  97. #include "lbnmem.h"
  98. #include "kludge.h"
  99. #ifndef BNWORD16
  100. #error 16-bit bignum library requires a 16-bit data type
  101. #endif
  102. /* If this is defined, include bnYield() calls */
  103. #if BNYIELD
  104. extern int (*bnYield)(void); /* From bn.c */
  105. #endif
  106. /*
  107. * Most of the multiply (and Montgomery reduce) routines use an outer
  108. * loop that iterates over one of the operands - a so-called operand
  109. * scanning approach. One big advantage of this is that the assembly
  110. * support routines are simpler. The loops can be rearranged to have
  111. * an outer loop that iterates over the product, a so-called product
  112. * scanning approach. This has the advantage of writing less data
  113. * and doing fewer adds to memory, so is supposedly faster. Some
  114. * code has been written using a product-scanning approach, but
  115. * it appears to be slower, so it is turned off by default. Some
  116. * experimentation would be appreciated.
  117. *
  118. * (The code is also annoying to get right and not very well commented,
  119. * one of my pet peeves about math libraries. I'm sorry.)
  120. */
  121. #ifndef PRODUCT_SCAN
  122. #define PRODUCT_SCAN 0
  123. #endif
  124. /*
  125. * Copy an array of words. <Marvin mode on> Thrilling, isn't it? </Marvin>
  126. * This is a good example of how the byte offsets and BIGLITTLE() macros work.
  127. * Another alternative would have been
  128. * memcpy(dest BIG(-len), src BIG(-len), len*sizeof(BNWORD16)), but I find that
  129. * putting operators into conditional macros is confusing.
  130. */
  131. #ifndef lbnCopy_16
  132. void
  133. lbnCopy_16(BNWORD16 *dest, BNWORD16 const *src, unsigned len)
  134. {
  135. memcpy(BIGLITTLE(dest-len,dest), BIGLITTLE(src-len,src),
  136. len * sizeof(*src));
  137. }
  138. #endif /* !lbnCopy_16 */
  139. /*
  140. * Fill n words with zero. This does it manually rather than calling
  141. * memset because it can assume alignment to make things faster while
  142. * memset can't. Note how big-endian numbers are naturally addressed
  143. * using predecrement, while little-endian is postincrement.
  144. */
  145. #ifndef lbnZero_16
  146. void
  147. lbnZero_16(BNWORD16 *num, unsigned len)
  148. {
  149. while (len--)
  150. BIGLITTLE(*--num,*num++) = 0;
  151. }
  152. #endif /* !lbnZero_16 */
  153. /*
  154. * Negate an array of words.
  155. * Negation is subtraction from zero. Negating low-order words
  156. * entails doing nothing until a non-zero word is hit. Once that
  157. * is negated, a borrow is generated and never dies until the end
  158. * of the number is hit. Negation with borrow, -x-1, is the same as ~x.
  159. * Repeat that until the end of the number.
  160. *
  161. * Doesn't return borrow out because that's pretty useless - it's
  162. * always set unless the input is 0, which is easy to notice in
  163. * normalized form.
  164. */
  165. #ifndef lbnNeg_16
  166. void
  167. lbnNeg_16(BNWORD16 *num, unsigned len)
  168. {
  169. assert(len);
  170. /* Skip low-order zero words */
  171. while (BIGLITTLE(*--num,*num) == 0) {
  172. if (!--len)
  173. return;
  174. LITTLE(num++;)
  175. }
  176. /* Negate the lowest-order non-zero word */
  177. *num = -*num;
  178. /* Complement all the higher-order words */
  179. while (--len) {
  180. BIGLITTLE(--num,++num);
  181. *num = ~*num;
  182. }
  183. }
  184. #endif /* !lbnNeg_16 */
  185. /*
  186. * lbnAdd1_16: add the single-word "carry" to the given number.
  187. * Used for minor increments and propagating the carry after
  188. * adding in a shorter bignum.
  189. *
  190. * Technique: If we have a double-width word, presumably the compiler
  191. * can add using its carry in inline code, so we just use a larger
  192. * accumulator to compute the carry from the first addition.
  193. * If not, it's more complex. After adding the first carry, which may
  194. * be > 1, compare the sum and the carry. If the sum wraps (causing a
  195. * carry out from the addition), the result will be less than each of the
  196. * inputs, since the wrap subtracts a number (2^16) which is larger than
  197. * the other input can possibly be. If the sum is >= the carry input,
  198. * return success immediately.
  199. * In either case, if there is a carry, enter a loop incrementing words
  200. * until one does not wrap. Since we are adding 1 each time, the wrap
  201. * will be to 0 and we can test for equality.
  202. */
  203. #ifndef lbnAdd1_16 /* If defined, it's provided as an asm subroutine */
  204. #ifdef BNWORD32
  205. BNWORD16
  206. lbnAdd1_16(BNWORD16 *num, unsigned len, BNWORD16 carry)
  207. {
  208. BNWORD32 t;
  209. assert(len > 0); /* Alternative: if (!len) return carry */
  210. t = (BNWORD32)BIGLITTLE(*--num,*num) + carry;
  211. BIGLITTLE(*num,*num++) = (BNWORD16)t;
  212. if ((t >> 16) == 0)
  213. return 0;
  214. while (--len) {
  215. if (++BIGLITTLE(*--num,*num++) != 0)
  216. return 0;
  217. }
  218. return 1;
  219. }
  220. #else /* no BNWORD32 */
  221. BNWORD16
  222. lbnAdd1_16(BNWORD16 *num, unsigned len, BNWORD16 carry)
  223. {
  224. assert(len > 0); /* Alternative: if (!len) return carry */
  225. if ((BIGLITTLE(*--num,*num++) += carry) >= carry)
  226. return 0;
  227. while (--len) {
  228. if (++BIGLITTLE(*--num,*num++) != 0)
  229. return 0;
  230. }
  231. return 1;
  232. }
  233. #endif
  234. #endif/* !lbnAdd1_16 */
  235. /*
  236. * lbnSub1_16: subtract the single-word "borrow" from the given number.
  237. * Used for minor decrements and propagating the borrow after
  238. * subtracting a shorter bignum.
  239. *
  240. * Technique: Similar to the add, above. If there is a double-length type,
  241. * use that to generate the first borrow.
  242. * If not, after subtracting the first borrow, which may be > 1, compare
  243. * the difference and the *negative* of the carry. If the subtract wraps
  244. * (causing a borrow out from the subtraction), the result will be at least
  245. * as large as -borrow. If the result < -borrow, then no borrow out has
  246. * appeared and we may return immediately, except when borrow == 0. To
  247. * deal with that case, use the identity that -x = ~x+1, and instead of
  248. * comparing < -borrow, compare for <= ~borrow.
  249. * Either way, if there is a borrow out, enter a loop decrementing words
  250. * until a non-zero word is reached.
  251. *
  252. * Note the cast of ~borrow to (BNWORD16). If the size of an int is larger
  253. * than BNWORD16, C rules say the number is expanded for the arithmetic, so
  254. * the inversion will be done on an int and the value won't be quite what
  255. * is expected.
  256. */
  257. #ifndef lbnSub1_16 /* If defined, it's provided as an asm subroutine */
  258. #ifdef BNWORD32
  259. BNWORD16
  260. lbnSub1_16(BNWORD16 *num, unsigned len, BNWORD16 borrow)
  261. {
  262. BNWORD32 t;
  263. assert(len > 0); /* Alternative: if (!len) return borrow */
  264. t = (BNWORD32)BIGLITTLE(*--num,*num) - borrow;
  265. BIGLITTLE(*num,*num++) = (BNWORD16)t;
  266. if ((t >> 16) == 0)
  267. return 0;
  268. while (--len) {
  269. if ((BIGLITTLE(*--num,*num++))-- != 0)
  270. return 0;
  271. }
  272. return 1;
  273. }
  274. #else /* no BNWORD32 */
  275. BNWORD16
  276. lbnSub1_16(BNWORD16 *num, unsigned len, BNWORD16 borrow)
  277. {
  278. assert(len > 0); /* Alternative: if (!len) return borrow */
  279. if ((BIGLITTLE(*--num,*num++) -= borrow) <= (BNWORD16)~borrow)
  280. return 0;
  281. while (--len) {
  282. if ((BIGLITTLE(*--num,*num++))-- != 0)
  283. return 0;
  284. }
  285. return 1;
  286. }
  287. #endif
  288. #endif /* !lbnSub1_16 */
  289. /*
  290. * lbnAddN_16: add two bignums of the same length, returning the carry (0 or 1).
  291. * One of the building blocks, along with lbnAdd1, of adding two bignums of
  292. * differing lengths.
  293. *
  294. * Technique: Maintain a word of carry. If there is no double-width type,
  295. * use the same technique as in lbnAdd1, above, to maintain the carry by
  296. * comparing the inputs. Adding the carry sources is used as an OR operator;
  297. * at most one of the two comparisons can possibly be true. The first can
  298. * only be true if carry == 1 and x, the result, is 0. In that case the
  299. * second can't possibly be true.
  300. */
  301. #ifndef lbnAddN_16
  302. #ifdef BNWORD32
  303. BNWORD16
  304. lbnAddN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len)
  305. {
  306. BNWORD32 t;
  307. assert(len > 0);
  308. t = (BNWORD32)BIGLITTLE(*--num1,*num1) + BIGLITTLE(*--num2,*num2++);
  309. BIGLITTLE(*num1,*num1++) = (BNWORD16)t;
  310. while (--len) {
  311. t = (BNWORD32)BIGLITTLE(*--num1,*num1) +
  312. (BNWORD32)BIGLITTLE(*--num2,*num2++) + (t >> 16);
  313. BIGLITTLE(*num1,*num1++) = (BNWORD16)t;
  314. }
  315. return (BNWORD16)(t>>16);
  316. }
  317. #else /* no BNWORD32 */
  318. BNWORD16
  319. lbnAddN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len)
  320. {
  321. BNWORD16 x, carry = 0;
  322. assert(len > 0); /* Alternative: change loop to test at start */
  323. do {
  324. x = BIGLITTLE(*--num2,*num2++);
  325. carry = (x += carry) < carry;
  326. carry += (BIGLITTLE(*--num1,*num1++) += x) < x;
  327. } while (--len);
  328. return carry;
  329. }
  330. #endif
  331. #endif /* !lbnAddN_16 */
  332. /*
  333. * lbnSubN_16: subtract two bignums of the same length, returning the borrow
  334. * (0 or 1). One of the building blocks, along with lbnSub1, of subtracting
  335. * two bignums of differing lengths.
  336. *
  337. * Technique: If no double-width type is available, maintain a word of borrow.
  338. * First, add the borrow to the subtrahend (did you have to learn all those
  339. * awful words in elementary school, too?), and if it overflows, set the
  340. * borrow again. Then subtract the modified subtrahend from the next word
  341. * of input, using the same technique as in lbnSub1, above.
  342. * Adding the borrows is used as an OR operator; at most one of the two
  343. * comparisons can possibly be true. The first can only be true if
  344. * borrow == 1 and x, the result, is 0. In that case the second can't
  345. * possibly be true.
  346. *
  347. * In the double-word case, (BNWORD16)-(t>>16) is subtracted, rather than
  348. * adding t>>16, because the shift would need to sign-extend and that's
  349. * not guaranteed to happen in ANSI C, even with signed types.
  350. */
  351. #ifndef lbnSubN_16
  352. #ifdef BNWORD32
  353. BNWORD16
  354. lbnSubN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len)
  355. {
  356. BNWORD32 t;
  357. assert(len > 0);
  358. t = (BNWORD32)BIGLITTLE(*--num1,*num1) - BIGLITTLE(*--num2,*num2++);
  359. BIGLITTLE(*num1,*num1++) = (BNWORD16)t;
  360. while (--len) {
  361. t = (BNWORD32)BIGLITTLE(*--num1,*num1) -
  362. (BNWORD32)BIGLITTLE(*--num2,*num2++) - (BNWORD16)-(t >> 16);
  363. BIGLITTLE(*num1,*num1++) = (BNWORD16)t;
  364. }
  365. return -(BNWORD16)(t>>16);
  366. }
  367. #else
  368. BNWORD16
  369. lbnSubN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len)
  370. {
  371. BNWORD16 x, borrow = 0;
  372. assert(len > 0); /* Alternative: change loop to test at start */
  373. do {
  374. x = BIGLITTLE(*--num2,*num2++);
  375. borrow = (x += borrow) < borrow;
  376. borrow += (BIGLITTLE(*--num1,*num1++) -= x) > (BNWORD16)~x;
  377. } while (--len);
  378. return borrow;
  379. }
  380. #endif
  381. #endif /* !lbnSubN_16 */
  382. #ifndef lbnCmp_16
  383. /*
  384. * lbnCmp_16: compare two bignums of equal length, returning the sign of
  385. * num1 - num2. (-1, 0 or +1).
  386. *
  387. * Technique: Change the little-endian pointers to big-endian pointers
  388. * and compare from the most-significant end until a difference is found.
  389. * When it is, figure out the sign of the difference and return it.
  390. */
  391. int
  392. lbnCmp_16(BNWORD16 const *num1, BNWORD16 const *num2, unsigned len)
  393. {
  394. BIGLITTLE(num1 -= len, num1 += len);
  395. BIGLITTLE(num2 -= len, num2 += len);
  396. while (len--) {
  397. if (BIGLITTLE(*num1++ != *num2++, *--num1 != *--num2)) {
  398. if (BIGLITTLE(num1[-1] < num2[-1], *num1 < *num2))
  399. return -1;
  400. else
  401. return 1;
  402. }
  403. }
  404. return 0;
  405. }
  406. #endif /* !lbnCmp_16 */
  /*
   * Optional multiply primitives.
   *
   *   mul16_ppmm(ph,pl,x,y)        (ph,pl) = x * y
   *   mul16_ppmma(ph,pl,x,y,a)     (ph,pl) = x * y + a
   *   mul16_ppmmaa(ph,pl,x,y,a,b)  (ph,pl) = x * y + a + b
   *
   * A port may supply any of them.  Each one that is missing is derived
   * here from whichever of the others is available.
   *
   * Note: the derived "add" forms below name their addend argument twice,
   * so that argument must not have side effects.
   */

  /* Fewer addends from more addends: pass zero for the unused ones. */
  #if defined(mul16_ppmmaa) && !defined(mul16_ppmma)
  #define mul16_ppmma(ph,pl,x,y,a) mul16_ppmmaa(ph,pl,x,y,a,0)
  #endif

  #if defined(mul16_ppmma) && !defined(mul16_ppmm)
  #define mul16_ppmm(ph,pl,x,y) mul16_ppmma(ph,pl,x,y,0)
  #endif

  /*
   * Debugging aid: a mul16_ppmm built on BNWORD32, for exercising the
   * mul16_ppmm-based code paths on machines that do not provide the
   * primitive.  Change the final "0" to a "1" to enable it.
   */
  #if !defined(mul16_ppmm) && defined(BNWORD32) && 0	/* Debugging */
  #define mul16_ppmm(ph,pl,x,y) \
  	({BNWORD32 _ = (BNWORD32)(x)*(y); (pl) = _; (ph) = _>>16;})
  #endif

  /* More addends from fewer: add into pl and carry any wrap into ph. */
  #if defined(mul16_ppmm) && !defined(mul16_ppmma)
  #define mul16_ppmma(ph,pl,x,y,a) \
  	(mul16_ppmm(ph,pl,x,y), (ph) += ((pl) += (a)) < (a))
  #endif

  #if defined(mul16_ppmma) && !defined(mul16_ppmmaa)
  #define mul16_ppmmaa(ph,pl,x,y,a,b) \
  	(mul16_ppmma(ph,pl,x,y,a), (ph) += ((pl) += (b)) < (b))
  #endif
  438. /*
  439. * lbnMulN1_16: Multiply an n-word input by a 1-word input and store the
  440. * n+1-word product. This uses either the mul16_ppmm and mul16_ppmma
  441. * macros, or C multiplication with the BNWORD32 type. This uses mul16_ppmma
  442. * if available, assuming you won't bother defining it unless you can do
  443. * better than the normal multiplication.
  444. */
  445. #ifndef lbnMulN1_16
  446. #ifdef lbnMulAdd1_16 /* If we have this asm primitive, use it. */
  447. void
  448. lbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
  449. {
  450. lbnZero_16(out, len);
  451. BIGLITTLE(*(out-len),*(out+len)) = lbnMulAdd1_16(out, in, len, k);
  452. }
  453. #elif defined(mul16_ppmm)
  454. void
  455. lbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
  456. {
  457. BNWORD16 carry, carryin;
  458. assert(len > 0);
  459. BIG(--out;--in;);
  460. mul16_ppmm(carry, *out, *in, k);
  461. LITTLE(out++;in++;)
  462. while (--len) {
  463. BIG(--out;--in;)
  464. carryin = carry;
  465. mul16_ppmma(carry, *out, *in, k, carryin);
  466. LITTLE(out++;in++;)
  467. }
  468. BIGLITTLE(*--out,*out) = carry;
  469. }
  470. #elif defined(BNWORD32)
  471. void
  472. lbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
  473. {
  474. BNWORD32 p;
  475. assert(len > 0);
  476. p = (BNWORD32)BIGLITTLE(*--in,*in++) * k;
  477. BIGLITTLE(*--out,*out++) = (BNWORD16)p;
  478. while (--len) {
  479. p = (BNWORD32)BIGLITTLE(*--in,*in++) * k + (BNWORD16)(p >> 16);
  480. BIGLITTLE(*--out,*out++) = (BNWORD16)p;
  481. }
  482. BIGLITTLE(*--out,*out) = (BNWORD16)(p >> 16);
  483. }
  484. #else
  485. #error No 16x16 -> 32 multiply available for 16-bit bignum package
  486. #endif
  487. #endif /* lbnMulN1_16 */
  488. /*
  489. * lbnMulAdd1_16: Multiply an n-word input by a 1-word input and add the
  490. * low n words of the product to the destination. *Returns the n+1st word
  491. * of the product.* (That turns out to be more convenient than adding
  492. * it into the destination and dealing with a possible unit carry out
  493. * of *that*.) This uses either the mul16_ppmma and mul16_ppmmaa macros,
  494. * or C multiplication with the BNWORD32 type.
  495. *
  496. * If you're going to write assembly primitives, this is the one to
  497. * start with. It is by far the most commonly called function.
  498. */
  499. #ifndef lbnMulAdd1_16
  500. #if defined(mul16_ppmm)
  501. BNWORD16
  502. lbnMulAdd1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
  503. {
  504. BNWORD16 prod, carry, carryin;
  505. assert(len > 0);
  506. BIG(--out;--in;);
  507. carryin = *out;
  508. mul16_ppmma(carry, *out, *in, k, carryin);
  509. LITTLE(out++;in++;)
  510. while (--len) {
  511. BIG(--out;--in;);
  512. carryin = carry;
  513. mul16_ppmmaa(carry, prod, *in, k, carryin, *out);
  514. *out = prod;
  515. LITTLE(out++;in++;)
  516. }
  517. return carry;
  518. }
  519. #elif defined(BNWORD32)
  520. BNWORD16
  521. lbnMulAdd1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
  522. {
  523. BNWORD32 p;
  524. assert(len > 0);
  525. p = (BNWORD32)BIGLITTLE(*--in,*in++) * k + BIGLITTLE(*--out,*out);
  526. BIGLITTLE(*out,*out++) = (BNWORD16)p;
  527. while (--len) {
  528. p = (BNWORD32)BIGLITTLE(*--in,*in++) * k +
  529. (BNWORD16)(p >> 16) + BIGLITTLE(*--out,*out);
  530. BIGLITTLE(*out,*out++) = (BNWORD16)p;
  531. }
  532. return (BNWORD16)(p >> 16);
  533. }
  534. #else
  535. #error No 16x16 -> 32 multiply available for 16-bit bignum package
  536. #endif
  537. #endif /* lbnMulAdd1_16 */
  538. /*
  539. * lbnMulSub1_16: Multiply an n-word input by a 1-word input and subtract the
  540. * n-word product from the destination. Returns the n+1st word of the product.
  541. * This uses either the mul16_ppmm and mul16_ppmma macros, or
  542. * C multiplication with the BNWORD32 type.
  543. *
  544. * This is rather uglier than adding, but fortunately it's only used in
  545. * division which is not used too heavily.
  546. */
  547. #ifndef lbnMulSub1_16
  548. #if defined(mul16_ppmm)
  549. BNWORD16
  550. lbnMulSub1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
  551. {
  552. BNWORD16 prod, carry, carryin;
  553. assert(len > 0);
  554. BIG(--in;)
  555. mul16_ppmm(carry, prod, *in, k);
  556. LITTLE(in++;)
  557. carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD16)~prod;
  558. while (--len) {
  559. BIG(--in;);
  560. carryin = carry;
  561. mul16_ppmma(carry, prod, *in, k, carryin);
  562. LITTLE(in++;)
  563. carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD16)~prod;
  564. }
  565. return carry;
  566. }
  567. #elif defined(BNWORD32)
  568. BNWORD16
  569. lbnMulSub1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
  570. {
  571. BNWORD32 p;
  572. BNWORD16 carry, t;
  573. assert(len > 0);
  574. p = (BNWORD32)BIGLITTLE(*--in,*in++) * k;
  575. t = BIGLITTLE(*--out,*out);
  576. carry = (BNWORD16)(p>>16) + ((BIGLITTLE(*out,*out++)=t-(BNWORD16)p) > t);
  577. while (--len) {
  578. p = (BNWORD32)BIGLITTLE(*--in,*in++) * k + carry;
  579. t = BIGLITTLE(*--out,*out);
  580. carry = (BNWORD16)(p>>16) +
  581. ( (BIGLITTLE(*out,*out++)=t-(BNWORD16)p) > t );
  582. }
  583. return carry;
  584. }
  585. #else
  586. #error No 16x16 -> 32 multiply available for 16-bit bignum package
  587. #endif
  588. #endif /* !lbnMulSub1_16 */
  589. /*
  590. * Shift n words left "shift" bits. 0 < shift < 16. Returns the
  591. * carry, any bits shifted off the left-hand side (0 <= carry < 2^shift).
  592. */
  593. #ifndef lbnLshift_16
  594. BNWORD16
  595. lbnLshift_16(BNWORD16 *num, unsigned len, unsigned shift)
  596. {
  597. BNWORD16 x, carry;
  598. assert(shift > 0);
  599. assert(shift < 16);
  600. carry = 0;
  601. while (len--) {
  602. BIG(--num;)
  603. x = *num;
  604. *num = (x<<shift) | carry;
  605. LITTLE(num++;)
  606. carry = x >> (16-shift);
  607. }
  608. return carry;
  609. }
  610. #endif /* !lbnLshift_16 */
  611. /*
  612. * An optimized version of the above, for shifts of 1.
  613. * Some machines can use add-with-carry tricks for this.
  614. */
  615. #ifndef lbnDouble_16
  616. BNWORD16
  617. lbnDouble_16(BNWORD16 *num, unsigned len)
  618. {
  619. BNWORD16 x, carry;
  620. carry = 0;
  621. while (len--) {
  622. BIG(--num;)
  623. x = *num;
  624. *num = (x<<1) | carry;
  625. LITTLE(num++;)
  626. carry = x >> (16-1);
  627. }
  628. return carry;
  629. }
  630. #endif /* !lbnDouble_16 */
  631. /*
  632. * Shift n words right "shift" bits. 0 < shift < 16. Returns the
  633. * carry, any bits shifted off the right-hand side (0 <= carry < 2^shift).
  634. */
  635. #ifndef lbnRshift_16
  636. BNWORD16
  637. lbnRshift_16(BNWORD16 *num, unsigned len, unsigned shift)
  638. {
  639. BNWORD16 x, carry = 0;
  640. assert(shift > 0);
  641. assert(shift < 16);
  642. BIGLITTLE(num -= len, num += len);
  643. while (len--) {
  644. LITTLE(--num;)
  645. x = *num;
  646. *num = (x>>shift) | carry;
  647. BIG(num++;)
  648. carry = x << (16-shift);
  649. }
  650. return carry >> (16-shift);
  651. }
  652. #endif /* !lbnRshift_16 */
  653. /*
  654. * Multiply two numbers of the given lengths. prod and num2 may overlap,
  655. * provided that the low len1 bits of prod are free. (This corresponds
  656. * nicely to the place the result is returned from lbnMontReduce_16.)
  657. *
  658. * TODO: Use Karatsuba multiply. The overlap constraints may have
  659. * to get rewhacked.
  660. */
  661. #ifndef lbnMul_16
  662. void
  663. lbnMul_16(BNWORD16 *prod, BNWORD16 const *num1, unsigned len1,
  664. BNWORD16 const *num2, unsigned len2)
  665. {
  666. /* Special case of zero */
  667. if (!len1 || !len2) {
  668. lbnZero_16(prod, len1+len2);
  669. return;
  670. }
  671. /* Multiply first word */
  672. lbnMulN1_16(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
  673. /*
  674. * Add in subsequent words, storing the most significant word,
  675. * which is new each time.
  676. */
  677. while (--len2) {
  678. BIGLITTLE(--prod,prod++);
  679. BIGLITTLE(*(prod-len1-1),*(prod+len1)) =
  680. lbnMulAdd1_16(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
  681. }
  682. }
  683. #endif /* !lbnMul_16 */
  /*
   * lbnMulX_16 is a "square" multiply: both inputs have the same length.
   * It is normally just a macro wrapper around the general multiply, but
   * might be implementable more efficiently in assembly (for example by
   * product scanning).
   */
  #ifndef lbnMulX_16
  #if defined(BNWORD32) && PRODUCT_SCAN
  /*
   * Test code to see whether product scanning is any faster.  It seems
   * to make the C code slower, so PRODUCT_SCAN is not defined.
   *
   * The product is generated one output word (column) at a time.  "acc"
   * holds the running column sum and "overflow" counts the times that
   * sum wrapped; together they become the carry into the next column.
   */
  static void
  lbnMulX_16(BNWORD16 *prod, BNWORD16 const *num1, BNWORD16 const *num2,
             unsigned len)
  {
      BNWORD32 acc, term;
      BNWORD16 const *pa, *pb;
      unsigned overflow;
      unsigned col, k;

      /* Special case of zero */
      if (!len)
          return;

      /* Column 0 consists of a single partial product. */
      acc = (BNWORD32)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
      BIGLITTLE(*--prod, *prod++) = (BNWORD16)acc;
      acc >>= 16;

      /* Columns 1 .. len-1: col+1 partial products each. */
      for (col = 1; col < len; col++) {
          overflow = 0;
          pa = num1;
          pb = BIGLITTLE(num2-col-1,num2+col+1);
          for (k = 0; k <= col; k++) {
              BIG(term = (BNWORD32)*--pa * *pb++;)
              LITTLE(term = (BNWORD32)*pa++ * *--pb;)
              acc += term;
              overflow += (acc < term);
          }
          BIGLITTLE(*--prod,*prod++) = (BNWORD16)acc;
          acc = (acc >> 16) | (BNWORD32)overflow << 16;
      }

      /* Columns len .. 2*len-2: len-col partial products each. */
      for (col = 1; col < len; col++) {
          overflow = 0;
          pa = BIGLITTLE(num1-col,num1+col);
          pb = BIGLITTLE(num2-len,num2+len);
          for (k = col; k < len; k++) {
              BIG(term = (BNWORD32)*--pa * *pb++;)
              LITTLE(term = (BNWORD32)*pa++ * *--pb;)
              acc += term;
              overflow += (acc < term);
          }
          BIGLITTLE(*--prod,*prod++) = (BNWORD16)acc;
          acc = (acc >> 16) | (BNWORD32)overflow << 16;
      }

      /* The top word is whatever carry is left over. */
      BIGLITTLE(*--prod,*prod) = (BNWORD16)acc;
  }
  #else /* !defined(BNWORD32) || !PRODUCT_SCAN */
  /* Default trivial macro definition */
  #define lbnMulX_16(prod, num1, num2, len) lbnMul_16(prod, num1, len, num2, len)
  #endif /* !defined(BNWORD32) || !PRODUCT_SCAN */
  #endif /* !lbnMulX_16 */
  #if !defined(lbnMontMul_16) && defined(BNWORD32) && PRODUCT_SCAN
  /*
   * Test code for product-scanning multiply.  This seems to slow the C
   * code down rather than speed it up.
   *
   * This does a multiply and Montgomery reduction together, using the
   * same loops.  The outer loop scans across the product, twice.
   *
   * The first pass computes the low half of the product and the
   * Montgomery multipliers.  These are stored in the product array,
   * which contains no data as of yet.  x and carry add up the columns
   * and propagate carries forward.
   *
   * The second pass multiplies the upper half, adding in the modulus
   * times the Montgomery multipliers.  The results of this multiply
   * are stored.
   *
   * Arguments: prod is the output buffer (the result is written to its
   * high len words), num1 and num2 are the len-word inputs, mod is the
   * len-word modulus, and inv is the word that each low column word is
   * multiplied by to obtain that column's Montgomery multiplier.
   */
  static void
  lbnMontMul_16(BNWORD16 *prod, BNWORD16 const *num1, BNWORD16 const *num2,
                BNWORD16 const *mod, unsigned len, BNWORD16 inv)
  {
      BNWORD32 x, y;
      BNWORD16 const *p1, *p2, *pm;
      BNWORD16 *pp;
      BNWORD16 t;
      unsigned carry;
      unsigned i, j;

      /* Special case of zero */
      if (!len)
          return;

      /*
       * This computes directly into the high half of prod, so just
       * shift the pointer and consider prod only "len" elements long
       * for the rest of the code.
       */
      BIGLITTLE(prod -= len, prod += len);

      /* Pass 1 - compute Montgomery multipliers */
      /* First iteration can have certain simplifications. */
      x = (BNWORD32)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
      /*
       * Multiply in BNWORD32 and truncate: a bare inv * (BNWORD16)x would
       * be evaluated in (signed) int after integer promotion and could
       * overflow it.
       */
      t = (BNWORD16)((BNWORD32)inv * (BNWORD16)x);
      BIGLITTLE(prod[-1], prod[0]) = t;
      y = (BNWORD32)t * BIGLITTLE(mod[-1],mod[0]);
      x += y;
      /* Note: GCC 2.6.3 has a bug if you try to eliminate "carry" */
      carry = (x < y);
      /* The multiplier is chosen to cancel the low word of the column. */
      assert((BNWORD16)x == 0);
      x = x >> 16 | (BNWORD32)carry << 16;

      for (i = 1; i < len; i++) {
          carry = 0;
          p1 = num1;
          p2 = BIGLITTLE(num2-i-1,num2+i+1);
          pp = prod;
          pm = BIGLITTLE(mod-i-1,mod+i+1);
          for (j = 0; j < i; j++) {
              /* Partial product of the two inputs ... */
              y = (BNWORD32)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
              x += y;
              carry += (x < y);
              /* ... and of an earlier multiplier with the modulus. */
              y = (BNWORD32)BIGLITTLE(*--pp * *pm++, *pp++ * *--pm);
              x += y;
              carry += (x < y);
          }
          /* One more input partial product than multiplier products. */
          y = (BNWORD32)BIGLITTLE(p1[-1] * p2[0], p1[0] * p2[-1]);
          x += y;
          carry += (x < y);
          assert(BIGLITTLE(pp == prod-i, pp == prod+i));
          /* This column's multiplier; widened as above to avoid overflow. */
          t = (BNWORD16)((BNWORD32)inv * (BNWORD16)x);
          BIGLITTLE(pp[-1], pp[0]) = t;
          assert(BIGLITTLE(pm == mod-1, pm == mod+1));
          y = (BNWORD32)t * BIGLITTLE(pm[0],pm[-1]);
          x += y;
          carry += (x < y);
          assert((BNWORD16)x == 0);
          x = x >> 16 | (BNWORD32)carry << 16;
      }

      /* Pass 2 - compute reduced product and store */
      for (i = 1; i < len; i++) {
          carry = 0;
          p1 = BIGLITTLE(num1-i,num1+i);
          p2 = BIGLITTLE(num2-len,num2+len);
          pm = BIGLITTLE(mod-i,mod+i);
          pp = BIGLITTLE(prod-len,prod+len);
          for (j = i; j < len; j++) {
              y = (BNWORD32)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
              x += y;
              carry += (x < y);
              y = (BNWORD32)BIGLITTLE(*--pm * *pp++, *pm++ * *--pp);
              x += y;
              carry += (x < y);
          }
          assert(BIGLITTLE(pm == mod-len, pm == mod+len));
          assert(BIGLITTLE(pp == prod-i, pp == prod+i));
          /* Overwrites a multiplier that is no longer needed. */
          BIGLITTLE(pp[0],pp[-1]) = (BNWORD16)x;
          x = (x >> 16) | (BNWORD32)carry << 16;
      }

      /* Last round of second half, simplified. */
      BIGLITTLE(*(prod-len),*(prod+len-1)) = (BNWORD16)x;
      carry = (x >> 16);

      /* Subtract the modulus until the overflow word is used up ... */
      while (carry)
          carry -= lbnSubN_16(prod, mod, len);
      /* ... and until the result is below the modulus. */
      while (lbnCmp_16(prod, mod, len) >= 0)
          (void)lbnSubN_16(prod, mod, len);
  }
  /* Suppress later definition */
  #define lbnMontMul_16 lbnMontMul_16
  #endif
  #if !defined(lbnSquare_16) && defined(BNWORD32) && PRODUCT_SCAN
  /*
   * Trial code for product-scanning squaring.  This seems to slow the C
   * code down rather than speed it up.
   *
   * Each output word (column) is the sum of the products num[j]*num[k]
   * with j+k equal to the column index.  Because the products pair up,
   * only half of them are computed ("cross"), the sum is doubled, and
   * the lone middle square is added when the column has one.  "acc"
   * carries the column sum forward and "overflow" counts wrap-arounds.
   */
  void
  lbnSquare_16(BNWORD16 *prod, BNWORD16 const *num, unsigned len)
  {
      BNWORD32 acc, cross, term;
      BNWORD16 const *p1, *p2;
      unsigned overflow;
      unsigned col, k;

      /* Special case of zero */
      if (!len)
          return;

      /* Word 0 of product */
      acc = (BNWORD32)BIGLITTLE(num[-1] * num[-1], num[0] * num[0]);
      BIGLITTLE(*--prod, *prod++) = (BNWORD16)acc;
      acc >>= 16;

      /* Words 1 through len-1 */
      for (col = 1; col < len; col++) {
          overflow = 0;
          cross = 0;
          p1 = num;
          p2 = BIGLITTLE(num-col-1,num+col+1);
          for (k = 0; k < (col+1)/2; k++) {
              BIG(term = (BNWORD32)*--p1 * *p2++;)
              LITTLE(term = (BNWORD32)*p1++ * *--p2;)
              cross += term;
              overflow += (cross < term);
          }
          /* Double the off-diagonal sum, wrap count included. */
          term = cross;
          cross += term;
          overflow += overflow + (cross < term);
          /* Even columns have a middle (diagonal) square. */
          if ((col & 1) == 0) {
              assert(BIGLITTLE(--p1 == p2, p1 == --p2));
              BIG(term = (BNWORD32)*p2 * *p2;)
              LITTLE(term = (BNWORD32)*p1 * *p1;)
              cross += term;
              overflow += (cross < term);
          }
          acc += cross;
          overflow += (acc < cross);
          BIGLITTLE(*--prod,*prod++) = (BNWORD16)acc;
          acc = (acc >> 16) | (BNWORD32)overflow << 16;
      }

      /* Words len through 2*len-2 */
      for (col = 1; col < len; col++) {
          overflow = 0;
          cross = 0;
          p1 = BIGLITTLE(num-col,num+col);
          p2 = BIGLITTLE(num-len,num+len);
          for (k = 0; k < (len-col)/2; k++) {
              BIG(term = (BNWORD32)*--p1 * *p2++;)
              LITTLE(term = (BNWORD32)*p1++ * *--p2;)
              cross += term;
              overflow += (cross < term);
          }
          /* Double the off-diagonal sum, wrap count included. */
          term = cross;
          cross += term;
          overflow += overflow + (cross < term);
          /* An odd number of products leaves a middle square. */
          if ((len-col) & 1) {
              assert(BIGLITTLE(--p1 == p2, p1 == --p2));
              BIG(term = (BNWORD32)*p2 * *p2;)
              LITTLE(term = (BNWORD32)*p1 * *p1;)
              cross += term;
              overflow += (cross < term);
          }
          acc += cross;
          overflow += (acc < cross);
          BIGLITTLE(*--prod,*prod++) = (BNWORD16)acc;
          acc = (acc >> 16) | (BNWORD32)overflow << 16;
      }

      /* Word 2*len-1 */
      BIGLITTLE(*--prod,*prod) = (BNWORD16)acc;
  }
  /* Suppress later definition */
  #define lbnSquare_16 lbnSquare_16
  #endif
  921. /*
  922. * Square a number, using optimized squaring to reduce the number of
  923. * primitive multiples that are executed. There may not be any
  924. * overlap of the input and output.
  925. *
  926. * Technique: Consider the partial products in the multiplication
  927. * of "abcde" by itself:
  928. *
  929. * a b c d e
  930. * * a b c d e
  931. * ==================
  932. * ae be ce de ee
  933. * ad bd cd dd de
  934. * ac bc cc cd ce
  935. * ab bb bc bd be
  936. * aa ab ac ad ae
  937. *
  938. * Note that everything above the main diagonal:
  939. * ae be ce de = (abcd) * e
  940. * ad bd cd = (abc) * d
  941. * ac bc = (ab) * c
  942. * ab = (a) * b
  943. *
  944. * is a copy of everything below the main diagonal:
  945. * de
  946. * cd ce
  947. * bc bd be
  948. * ab ac ad ae
  949. *
  950. * Thus, the sum is 2 * (off the diagonal) + diagonal.
  951. *
  952. * This is accumulated beginning with the diagonal (which
  953. * consist of the squares of the digits of the input), which is then
  954. * divided by two, the off-diagonal added, and multiplied by two
  955. * again. The low bit is simply a copy of the low bit of the
  956. * input, so it doesn't need special care.
  957. *
  958. * TODO: Merge the shift by 1 with the squaring loop.
  959. * TODO: Use Karatsuba. (a*W+b)^2 = a^2 * (W^2+W) + b^2 * (W+1) - (a-b)^2 * W.
  960. */
  961. #ifndef lbnSquare_16
  962. void
  963. lbnSquare_16(BNWORD16 *prod, BNWORD16 const *num, unsigned len)
  964. {
  965. BNWORD16 t;
  966. BNWORD16 *prodx = prod; /* Working copy of the argument */
  967. BNWORD16 const *numx = num; /* Working copy of the argument */
  968. unsigned lenx = len; /* Working copy of the argument */
  969. if (!len)
  970. return;
  971. /* First, store all the squares */
  972. while (lenx--) {
  973. #ifdef mul16_ppmm
  974. BNWORD16 ph, pl;
  975. t = BIGLITTLE(*--numx,*numx++);
  976. mul16_ppmm(ph,pl,t,t);
  977. BIGLITTLE(*--prodx,*prodx++) = pl;
  978. BIGLITTLE(*--prodx,*prodx++) = ph;
  979. #elif defined(BNWORD32) /* use BNWORD32 */
  980. BNWORD32 p;
  981. t = BIGLITTLE(*--numx,*numx++);
  982. p = (BNWORD32)t * t;
  983. BIGLITTLE(*--prodx,*prodx++) = (BNWORD16)p;
  984. BIGLITTLE(*--prodx,*prodx++) = (BNWORD16)(p>>16);
  985. #else /* Use lbnMulN1_16 */
  986. t = BIGLITTLE(numx[-1],*numx);
  987. lbnMulN1_16(prodx, numx, 1, t);
  988. BIGLITTLE(--numx,numx++);
  989. BIGLITTLE(prodx -= 2, prodx += 2);
  990. #endif
  991. }
  992. /* Then, shift right 1 bit */
  993. (void)lbnRshift_16(prod, 2*len, 1);
  994. /* Then, add in the off-diagonal sums */
  995. lenx = len;
  996. numx = num;
  997. prodx = prod;
  998. while (--lenx) {
  999. t = BIGLITTLE(*--numx,*numx++);
  1000. BIGLITTLE(--prodx,prodx++);
  1001. t = lbnMulAdd1_16(prodx, numx, lenx, t);
  1002. lbnAdd1_16(BIGLITTLE(prodx-lenx,prodx+lenx), lenx+1, t);
  1003. BIGLITTLE(--prodx,prodx++);
  1004. }
  1005. /* Shift it back up */
  1006. lbnDouble_16(prod, 2*len);
  1007. /* And set the low bit appropriately */
  1008. BIGLITTLE(prod[-1],prod[0]) |= BIGLITTLE(num[-1],num[0]) & 1;
  1009. }
  1010. #endif /* !lbnSquare_16 */
  1011. /*
  1012. * lbnNorm_16 - given a number, return a modified length such that the
  1013. * most significant digit is non-zero. Zero-length input is okay.
  1014. */
  1015. #ifndef lbnNorm_16
  1016. unsigned
  1017. lbnNorm_16(BNWORD16 const *num, unsigned len)
  1018. {
  1019. BIGLITTLE(num -= len,num += len);
  1020. while (len && BIGLITTLE(*num++,*--num) == 0)
  1021. --len;
  1022. return len;
  1023. }
  1024. #endif /* lbnNorm_16 */
  1025. /*
  1026. * lbnBits_16 - return the number of significant bits in the array.
  1027. * It starts by normalizing the array. Zero-length input is okay.
  1028. * Then assuming there's anything to it, it fetches the high word,
  1029. * generates a bit length by multiplying the word length by 16, and
  1030. * subtracts off 16/2, 16/4, 16/8, ... bits if the high bits are clear.
  1031. */
  1032. #ifndef lbnBits_16
  1033. unsigned
  1034. lbnBits_16(BNWORD16 const *num, unsigned len)
  1035. {
  1036. BNWORD16 t;
  1037. unsigned i;
  1038. len = lbnNorm_16(num, len);
  1039. if (len) {
  1040. t = BIGLITTLE(*(num-len),*(num+(len-1)));
  1041. assert(t);
  1042. len *= 16;
  1043. i = 16/2;
  1044. do {
  1045. if (t >> i)
  1046. t >>= i;
  1047. else
  1048. len -= i;
  1049. } while ((i /= 2) != 0);
  1050. }
  1051. return len;
  1052. }
  1053. #endif /* lbnBits_16 */
  /*
   * BN_SLOW_DIVIDE_32: when this evaluates to non-zero, use the
   * hand-rolled divide rather than the compiler's native one.  (It is
   * tested by value below, so defining it as 0 selects the native divide.)
   * If the machine doesn't do the division in line, the manual code is
   * probably faster, since it can assume normalization and the fact that
   * the quotient will fit into 16 bits, which a general 32-bit divide
   * in a compiler's run-time library can't do.
   */
  #if !defined(BN_SLOW_DIVIDE_32)
  /* Assume that divisors of more than thirty-two bits are slow */
  #define BN_SLOW_DIVIDE_32 (32 > 0x20)
  #endif
  1065. /*
  1066. * Return (nh<<16|nl) % d, and place the quotient digit into *q.
  1067. * It is guaranteed that nh < d, and that d is normalized (with its high
  1068. * bit set). If we have a double-width type, it's easy. If not, ooh,
  1069. * yuk!
  1070. */
  1071. #ifndef lbnDiv21_16
  1072. #if defined(BNWORD32) && !BN_SLOW_DIVIDE_32
  1073. BNWORD16
  1074. lbnDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d)
  1075. {
  1076. BNWORD32 n = (BNWORD32)nh << 16 | nl;
  1077. /* Divisor must be normalized */
  1078. assert(d >> (16-1) == 1);
  1079. *q = n / d;
  1080. return n % d;
  1081. }
  1082. #else
  1083. /*
  1084. * This is where it gets ugly.
  1085. *
  1086. * Do the division in two halves, using Algorithm D from section 4.3.1
  1087. * of Knuth. Note Theorem B from that section, that the quotient estimate
  1088. * is never more than the true quotient, and is never more than two
  1089. * too low.
  1090. *
  1091. * The mapping onto conventional long division is (everything a half word):
  1092. * _____________qh___ql_
  1093. * dh dl ) nh.h nh.l nl.h nl.l
  1094. * - (qh * d)
  1095. * -----------
  1096. * rrrr rrrr nl.l
  1097. * - (ql * d)
  1098. * -----------
  1099. * rrrr rrrr
  1100. *
  1101. * The implicit 3/2-digit d*qh and d*ql subtractors are computed this way:
  1102. * First, estimate a q digit so that nh/dh works. Subtracting qh*dh from
  1103. * the (nh.h nh.l) list leaves a 1/2-word remainder r. Then compute the
  1104. * low part of the subtractor, qh * dl. This also needs to be subtracted
  1105. * from (nh.h nh.l nl.h) to get the final remainder. So we take the
  1106. * remainder, which is (nh.h nh.l) - qh*dl, shift it and add in nl.h, and
  1107. * try to subtract qh * dl from that. Since the remainder is 1/2-word
  1108. * long, shifting and adding nl.h results in a single word r.
  1109. * It is possible that the remainder we're working with, r, is less than
  1110. * the product qh * dl, if we estimated qh too high. The estimation
  1111. * technique can produce a qh that is too large (never too small), leading
  1112. * to r which is too small. In that case, decrement the digit qh, add
  1113. * shifted dh to r (to correct for that error), and subtract dl from the
  1114. * product we're comparing r with. That's the "correct" way to do it, but
  1115. * just adding dl to r instead of subtracting it from the product is
  1116. * equivalent and a lot simpler. You just have to watch out for overflow.
  1117. *
  1118. * The process is repeated with (rrrr rrrr nl.l) for the low digit of the
  1119. * quotient ql.
  1120. *
  1121. * The various uses of 16/2 for shifts are because of the note about
  1122. * automatic editing of this file at the very top of the file.
  1123. */
  1124. #define highhalf(x) ( (x) >> 16/2 )
  1125. #define lowhalf(x) ( (x) & (((BNWORD16)1 << 16/2)-1) )
  1126. BNWORD16
  1127. lbnDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d)
  1128. {
  1129. BNWORD16 dh = highhalf(d), dl = lowhalf(d);
  1130. BNWORD16 qh, ql, prod, r;
  1131. /* Divisor must be normalized */
  1132. assert((d >> (16-1)) == 1);
  1133. /* Do first half-word of division */
  1134. qh = nh / dh;
  1135. r = nh % dh;
  1136. prod = qh * dl;
  1137. /*
  1138. * Add next half-word of numerator to remainder and correct.
  1139. * qh may be up to two too large.
  1140. */
  1141. r = (r << (16/2)) | highhalf(nl);
  1142. if (r < prod) {
  1143. --qh; r += d;
  1144. if (r >= d && r < prod) {
  1145. --qh; r += d;
  1146. }
  1147. }
  1148. r -= prod;
  1149. /* Do second half-word of division */
  1150. ql = r / dh;
  1151. r = r % dh;
  1152. prod = ql * dl;
  1153. r = (r << (16/2)) | lowhalf(nl);
  1154. if (r < prod) {
  1155. --ql; r += d;
  1156. if (r >= d && r < prod) {
  1157. --ql; r += d;
  1158. }
  1159. }
  1160. r -= prod;
  1161. *q = (qh << (16/2)) | ql;
  1162. return r;
  1163. }
  1164. #endif
  1165. #endif /* lbnDiv21_16 */
  1166. /*
  1167. * In the division functions, the dividend and divisor are referred to
  1168. * as "n" and "d", which stand for "numerator" and "denominator".
  1169. *
  1170. * The quotient is (nlen-dlen+1) digits long. It may be overlapped with
  1171. * the high (nlen-dlen) words of the dividend, but one extra word is needed
  1172. * on top to hold the top word.
  1173. */
  1174. /*
  1175. * Divide an n-word number by a 1-word number, storing the remainder
  1176. * and n-1 words of the n-word quotient. The high word is returned.
  1177. * It IS legal for rem to point to the same address as n, and for
  1178. * q to point one word higher.
  1179. *
  1180. * TODO: If BN_SLOW_DIVIDE_32, add a divnhalf_16 which uses 16-bit
  1181. * dividends if the divisor is half that long.
  1182. * TODO: Shift the dividend on the fly to avoid the last division and
  1183. * instead have a remainder that needs shifting.
  1184. * TODO: Use reciprocals rather than dividing.
  1185. */
  1186. #ifndef lbnDiv1_16
  1187. BNWORD16
  1188. lbnDiv1_16(BNWORD16 *q, BNWORD16 *rem, BNWORD16 const *n, unsigned len,
  1189. BNWORD16 d)
  1190. {
  1191. unsigned shift;
  1192. unsigned xlen;
  1193. BNWORD16 r;
  1194. BNWORD16 qhigh;
  1195. assert(len > 0);
  1196. assert(d);
  1197. if (len == 1) {
  1198. r = *n;
  1199. *rem = r%d;
  1200. return r/d;
  1201. }
  1202. shift = 0;
  1203. r = d;
  1204. xlen = 16/2;
  1205. do {
  1206. if (r >> xlen)
  1207. r >>= xlen;
  1208. else
  1209. shift += xlen;
  1210. } while ((xlen /= 2) != 0);
  1211. assert((d >> (16-1-shift)) == 1);
  1212. d <<= shift;
  1213. BIGLITTLE(q -= len-1,q += len-1);
  1214. BIGLITTLE(n -= len,n += len);
  1215. r = BIGLITTLE(*n++,*--n);
  1216. if (r < d) {
  1217. qhigh = 0;
  1218. } else {
  1219. qhigh = r/d;
  1220. r %= d;
  1221. }
  1222. xlen = len;
  1223. while (--xlen)
  1224. r = lbnDiv21_16(BIGLITTLE(q++,--q), r, BIGLITTLE(*n++,*--n), d);
  1225. /*
  1226. * Final correction for shift - shift the quotient up "shift"
  1227. * bits, and merge in the extra bits of quotient. Then reduce
  1228. * the final remainder mod the real d.
  1229. */
  1230. if (shift) {
  1231. d >>= shift;
  1232. qhigh = (qhigh << shift) | lbnLshift_16(q, len-1, shift);
  1233. BIGLITTLE(q[-1],*q) |= r/d;
  1234. r %= d;
  1235. }
  1236. *rem = r;
  1237. return qhigh;
  1238. }
  1239. #endif
  1240. /*
  1241. * This function performs a "quick" modulus of a number with a divisor
  1242. * d which is guaranteed to be at most sixteen bits, i.e. less than 65536.
  1243. * This applies regardless of the word size the library is compiled with.
  1244. *
  1245. * This function is important to prime generation, for sieving.
  1246. */
  1247. #ifndef lbnModQ_16
  1248. /* If there's a custom lbnMod21_16, no normalization needed */
  1249. #ifdef lbnMod21_16
  1250. unsigned
  1251. lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d)
  1252. {
  1253. unsigned i, shift;
  1254. BNWORD16 r;
  1255. assert(len > 0);
  1256. BIGLITTLE(n -= len,n += len);
  1257. /* Try using a compare to avoid the first divide */
  1258. r = BIGLITTLE(*n++,*--n);
  1259. if (r >= d)
  1260. r %= d;
  1261. while (--len)
  1262. r = lbnMod21_16(r, BIGLITTLE(*n++,*--n), d);
  1263. return r;
  1264. }
  1265. #elif defined(BNWORD32) && !BN_SLOW_DIVIDE_32
  1266. unsigned
  1267. lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d)
  1268. {
  1269. BNWORD16 r;
  1270. if (!--len)
  1271. return BIGLITTLE(n[-1],n[0]) % d;
  1272. BIGLITTLE(n -= len,n += len);
  1273. r = BIGLITTLE(n[-1],n[0]);
  1274. do {
  1275. r = (BNWORD16)((((BNWORD32)r<<16) | BIGLITTLE(*n++,*--n)) % d);
  1276. } while (--len);
  1277. return r;
  1278. }
  1279. #elif 16 >= 0x20
  1280. /*
  1281. * If the single word size can hold 65535*65536, then this function
  1282. * is avilable.
  1283. */
  1284. #ifndef highhalf
  1285. #define highhalf(x) ( (x) >> 16/2 )
  1286. #define lowhalf(x) ( (x) & ((1 << 16/2)-1) )
  1287. #endif
  1288. unsigned
  1289. lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d)
  1290. {
  1291. BNWORD16 r, x;
  1292. BIGLITTLE(n -= len,n += len);
  1293. r = BIGLITTLE(*n++,*--n);
  1294. while (--len) {
  1295. x = BIGLITTLE(*n++,*--n);
  1296. r = (r%d << 16/2) | highhalf(x);
  1297. r = (r%d << 16/2) | lowhalf(x);
  1298. }
  1299. return r%d;
  1300. }
  1301. #else
  1302. /* Default case - use lbnDiv21_16 */
  1303. unsigned
  1304. lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d)
  1305. {
  1306. unsigned i, shift;
  1307. BNWORD16 r;
  1308. BNWORD16 q;
  1309. assert(len > 0);
  1310. shift = 0;
  1311. r = d;
  1312. i = 16;
  1313. while (i /= 2) {
  1314. if (r >> i)
  1315. r >>= i;
  1316. else
  1317. shift += i;
  1318. }
  1319. assert(d >> (16-1-shift) == 1);
  1320. d <<= shift;
  1321. BIGLITTLE(n -= len,n += len);
  1322. r = BIGLITTLE(*n++,*--n);
  1323. if (r >= d)
  1324. r %= d;
  1325. while (--len)
  1326. r = lbnDiv21_16(&q, r, BIGLITTLE(*n++,*--n), d);
  1327. /*
  1328. * Final correction for shift - shift the quotient up "shift"
  1329. * bits, and merge in the extra bits of quotient. Then reduce
  1330. * the final remainder mod the real d.
  1331. */
  1332. if (shift)
  1333. r %= d >> shift;
  1334. return r;
  1335. }
  1336. #endif
  1337. #endif /* lbnModQ_16 */
  1338. /*
  1339. * Reduce n mod d and return the quotient. That is, find:
  1340. * q = n / d;
  1341. * n = n % d;
  1342. * d is altered during the execution of this subroutine by normalizing it.
  1343. * It must already have its most significant word non-zero; it is shifted
  1344. * so its most significant bit is non-zero.
  1345. *
  1346. * The quotient q is nlen-dlen+1 words long. To make it possible to
  1347. * overlap the quotient with the input (you can store it in the high dlen
  1348. * words), the high word of the quotient is *not* stored, but is returned.
  1349. * (If all you want is the remainder, you don't care about it, anyway.)
  1350. *
  1351. * This uses algorithm D from Knuth (4.3.1), except that we do binary
  1352. * (shift) normalization of the divisor. WARNING: This is hairy!
  1353. *
  1354. * This function is used for some modular reduction, but it is not used in
  1355. * the modular exponentiation loops; they use Montgomery form and the
  1356. * corresponding, more efficient, Montgomery reduction. This code
  1357. * is needed for the conversion to Montgomery form, however, so it
  1358. * has to be here and it might as well be reasonably efficient.
  1359. *
  1360. * The overall operation is as follows ("top" and "up" refer to the
  1361. * most significant end of the number; "bottom" and "down", the least):
  1362. *
  1363. * - Shift the divisor up until the most significant bit is set.
  1364. * - Shift the dividend up the same amount. This will produce the
  1365. * correct quotient, and the remainder can be recovered by shifting
  1366. * it back down the same number of bits. This may produce an overflow
  1367. * word, but the word is always strictly less than the most significant
  1368. * divisor word.
  1369. * - Estimate the first quotient digit qhat:
  1370. * - First take the top two words (one of which is the overflow) of the
  1371. * dividend and divide by the top word of the divisor:
  1372. * qhat = (nh,nm)/dh. This qhat is >= the correct quotient digit
  1373. * and, since dh is normalized, it is at most two over.
  1374. * - Second, correct by comparing the top three words. If
  1375. * (dh,dl) * qhat > (nh,nm,nl), decrease qhat and try again.
  1376. * The second iteration can be simpler because there can't be a third.
  1377. * The computation can be simplified by subtracting dh*qhat from
  1378. * both sides, suitably shifted. This reduces the left side to
  1379. * dl*qhat. On the right, (nh,nm)-dh*qhat is simply the
  1380. * remainder r from (nh,nm)%dh, so the right is (r,nl).
  1381. * This produces qhat that is almost always correct and at
  1382. * most (prob ~ 2/2^16) one too high.
  1383. * - Subtract qhat times the divisor (suitably shifted) from the dividend.
  1384. * If there is a borrow, qhat was wrong, so decrement it
  1385. * and add the divisor back in (once).
  1386. * - Store the final quotient digit qhat in the quotient array q.
  1387. *
  1388. * Repeat the quotient digit computation for successive digits of the
  1389. * quotient until the whole quotient has been computed. Then shift the
  1390. * divisor and the remainder down to correct for the normalization.
  1391. *
  1392. * TODO: Special case 2-word divisors.
  1393. * TODO: Use reciprocals rather than dividing.
  1394. */
  1395. #ifndef divn_16
  1396. BNWORD16
  1397. lbnDiv_16(BNWORD16 *q, BNWORD16 *n, unsigned nlen, BNWORD16 *d, unsigned dlen)
  1398. {
  1399. BNWORD16 nh,nm,nl; /* Top three words of the dividend */
  1400. BNWORD16 dh,dl; /* Top two words of the divisor */
  1401. BNWORD16 qhat; /* Extimate of quotient word */
  1402. BNWORD16 r; /* Remainder from quotient estimate division */
  1403. BNWORD16 qhigh; /* High word of quotient */
  1404. unsigned i; /* Temp */
  1405. unsigned shift; /* Bits shifted by normalization */
  1406. unsigned qlen = nlen-dlen; /* Size of quotient (less 1) */
  1407. #ifdef mul16_ppmm
  1408. BNWORD16 t16;
  1409. #elif defined(BNWORD32)
  1410. BNWORD32 t32;
  1411. #else /* use lbnMulN1_16 */
  1412. BNWORD16 t2[2];
  1413. #define t2high BIGLITTLE(t2[0],t2[1])
  1414. #define t2low BIGLITTLE(t2[1],t2[0])
  1415. #endif
  1416. assert(dlen);
  1417. assert(nlen >= dlen);
  1418. /*
  1419. * Special cases for short divisors. The general case uses the
  1420. * top top 2 digits of the divisor (d) to estimate a quotient digit,
  1421. * so it breaks if there are fewer digits available. Thus, we need
  1422. * special cases for a divisor of length 1. A divisor of length
  1423. * 2 can have a *lot* of administrivia overhead removed removed,
  1424. * so it's probably worth special-casing that case, too.
  1425. */
  1426. if (dlen == 1)
  1427. return lbnDiv1_16(q, BIGLITTLE(n-1,n), n, nlen,
  1428. BIGLITTLE(d[-1],d[0]));
  1429. #if 0
  1430. /*
  1431. * @@@ This is not yet written... The general loop will do,
  1432. * albeit less efficiently
  1433. */
  1434. if (dlen == 2) {
  1435. /*
  1436. * divisor two digits long:
  1437. * use the 3/2 technique from Knuth, but we know
  1438. * it's exact.
  1439. */
  1440. dh = BIGLITTLE(d[-1],d[0]);
  1441. dl = BIGLITTLE(d[-2],d[1]);
  1442. shift = 0;
  1443. if ((sh & ((BNWORD16)1 << 16-1-shift)) == 0) {
  1444. do {
  1445. shift++;
  1446. } while (dh & (BNWORD16)1<<16-1-shift) == 0);
  1447. dh = dh << shift | dl >> (16-shift);
  1448. dl <<= shift;
  1449. }
  1450. for (shift = 0; (dh & (BNWORD16)1 << 16-1-shift)) == 0; shift++)
  1451. ;
  1452. if (shift) {
  1453. }
  1454. dh = dh << shift | dl >> (16-shift);
  1455. shift = 0;
  1456. while (dh
  1457. }
  1458. #endif
  1459. dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
  1460. assert(dh);
  1461. /* Normalize the divisor */
  1462. shift = 0;
  1463. r = dh;
  1464. i = 16/2;
  1465. do {
  1466. if (r >> i)
  1467. r >>= i;
  1468. else
  1469. shift += i;
  1470. } while ((i /= 2) != 0);
  1471. nh = 0;
  1472. if (shift) {
  1473. lbnLshift_16(d, dlen, shift);
  1474. dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
  1475. nh = lbnLshift_16(n, nlen, shift);
  1476. }
  1477. /* Assert that dh is now normalized */
  1478. assert(dh >> (16-1));
  1479. /* Also get the second-most significant word of the divisor */
  1480. dl = BIGLITTLE(*(d-(dlen-1)),*(d+(dlen-2)));
  1481. /*
  1482. * Adjust pointers: n to point to least significant end of first
  1483. * first subtract, and q to one the most-significant end of the
  1484. * quotient array.
  1485. */
  1486. BIGLITTLE(n -= qlen,n += qlen);
  1487. BIGLITTLE(q -= qlen,q += qlen);
  1488. /* Fetch the most significant stored word of the dividend */
  1489. nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
  1490. /*
  1491. * Compute the first digit of the quotient, based on the
  1492. * first two words of the dividend (the most significant of which
  1493. * is the overflow word h).
  1494. */
  1495. if (nh) {
  1496. assert(nh < dh);
  1497. r = lbnDiv21_16(&qhat, nh, nm, dh);
  1498. } else if (nm >= dh) {
  1499. qhat = nm/dh;
  1500. r = nm % dh;
  1501. } else { /* Quotient is zero */
  1502. qhigh = 0;
  1503. goto divloop;
  1504. }
  1505. /* Now get the third most significant word of the dividend */
  1506. nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
  1507. /*
  1508. * Correct qhat, the estimate of quotient digit.
  1509. * qhat can only be high, and at most two words high,
  1510. * so the loop can be unrolled and abbreviated.
  1511. */
  1512. #ifdef mul16_ppmm
  1513. mul16_ppmm(nm, t16, qhat, dl);
  1514. if (nm > r || (nm == r && t16 > nl)) {
  1515. /* Decrement qhat and adjust comparison parameters */
  1516. qhat--;
  1517. if ((r += dh) >= dh) {
  1518. nm -= (t16 < dl);
  1519. t16 -= dl;
  1520. if (nm > r || (nm == r && t16 > nl))
  1521. qhat--;
  1522. }
  1523. }
  1524. #elif defined(BNWORD32)
  1525. t32 = (BNWORD32)qhat * dl;
  1526. if (t32 > ((BNWORD32)r << 16) + nl) {
  1527. /* Decrement qhat and adjust comparison parameters */
  1528. qhat--;
  1529. if ((r += dh) > dh) {
  1530. t32 -= dl;
  1531. if (t32 > ((BNWORD32)r << 16) + nl)
  1532. qhat--;
  1533. }
  1534. }
  1535. #else /* Use lbnMulN1_16 */
  1536. lbnMulN1_16(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
  1537. if (t2high > r || (t2high == r && t2low > nl)) {
  1538. /* Decrement qhat and adjust comparison parameters */
  1539. qhat--;
  1540. if ((r += dh) >= dh) {
  1541. t2high -= (t2low < dl);
  1542. t2low -= dl;
  1543. if (t2high > r || (t2high == r && t2low > nl))
  1544. qhat--;
  1545. }
  1546. }
  1547. #endif
  1548. /* Do the multiply and subtract */
  1549. r = lbnMulSub1_16(n, d, dlen, qhat);
  1550. /* If there was a borrow, add back once. */
  1551. if (r > nh) { /* Borrow? */
  1552. (void)lbnAddN_16(n, d, dlen);
  1553. qhat--;
  1554. }
  1555. /* Remember the first quotient digit. */
  1556. qhigh = qhat;
  1557. /* Now, the main division loop: */
  1558. divloop:
  1559. while (qlen--) {
  1560. /* Advance n */
  1561. nh = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
  1562. BIGLITTLE(++n,--n);
  1563. nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
  1564. if (nh == dh) {
  1565. qhat = ~(BNWORD16)0;
  1566. /* Optimized computation of r = (nh,nm) - qhat * dh */
  1567. r = nh + nm;
  1568. if (r < nh)
  1569. goto subtract;
  1570. } else {
  1571. assert(nh < dh);
  1572. r = lbnDiv21_16(&qhat, nh, nm, dh);
  1573. }
  1574. nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
  1575. #ifdef mul16_ppmm
  1576. mul16_ppmm(nm, t16, qhat, dl);
  1577. if (nm > r || (nm == r && t16 > nl)) {
  1578. /* Decrement qhat and adjust comparison parameters */
  1579. qhat--;
  1580. if ((r += dh) >= dh) {
  1581. nm -= (t16 < dl);
  1582. t16 -= dl;
  1583. if (nm > r || (nm == r && t16 > nl))
  1584. qhat--;
  1585. }
  1586. }
  1587. #elif defined(BNWORD32)
  1588. t32 = (BNWORD32)qhat * dl;
  1589. if (t32 > ((BNWORD32)r<<16) + nl) {
  1590. /* Decrement qhat and adjust comparison parameters */
  1591. qhat--;
  1592. if ((r += dh) >= dh) {
  1593. t32 -= dl;
  1594. if (t32 > ((BNWORD32)r << 16) + nl)
  1595. qhat--;
  1596. }
  1597. }
  1598. #else /* Use lbnMulN1_16 */
  1599. lbnMulN1_16(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
  1600. if (t2high > r || (t2high == r && t2low > nl)) {
  1601. /* Decrement qhat and adjust comparison parameters */
  1602. qhat--;
  1603. if ((r += dh) >= dh) {
  1604. t2high -= (t2low < dl);
  1605. t2low -= dl;
  1606. if (t2high > r || (t2high == r && t2low > nl))
  1607. qhat--;
  1608. }
  1609. }
  1610. #endif
  1611. /*
  1612. * As a point of interest, note that it is not worth checking
  1613. * for qhat of 0 or 1 and installing special-case code. These
  1614. * occur with probability 2^-16, so spending 1 cycle to check
  1615. * for them is only worth it if we save more than 2^15 cycles,
  1616. * and a multiply-and-subtract for numbers in the 1024-bit
  1617. * range just doesn't take that long.
  1618. */
  1619. subtract:
  1620. /*
  1621. * n points to the least significant end of the substring
  1622. * of n to be subtracted from. qhat is either exact or
  1623. * one too large. If the subtract gets a borrow, it was
  1624. * one too large and the divisor is added back in. It's
  1625. * a dlen+1 word add which is guaranteed to produce a
  1626. * carry out, so it can be done very simply.
  1627. */
  1628. r = lbnMulSub1_16(n, d, dlen, qhat);
  1629. if (r > nh) { /* Borrow? */
  1630. (void)lbnAddN_16(n, d, dlen);
  1631. qhat--;
  1632. }
  1633. /* Store the quotient digit */
  1634. BIGLITTLE(*q++,*--q) = qhat;
  1635. }
  1636. /* Tah dah! */
  1637. if (shift) {
  1638. lbnRshift_16(d, dlen, shift);
  1639. lbnRshift_16(n, dlen, shift);
  1640. }
  1641. return qhigh;
  1642. }
  1643. #endif
  1644. /*
  1645. * Find the negative multiplicative inverse of x (x must be odd!) modulo 2^16.
  1646. *
  1647. * This just performs Newton's iteration until it gets the
  1648. * inverse. The initial estimate is always correct to 3 bits, and
  1649. * sometimes 4. The number of valid bits doubles each iteration.
  1650. * (To prove it, assume x * y == 1 (mod 2^n), and introduce a variable
  1651. * for the error mod 2^2n. x * y == 1 + k*2^n (mod 2^2n) and follow
  1652. * the iteration through.)
  1653. */
  1654. #ifndef lbnMontInv1_16
  1655. BNWORD16
  1656. lbnMontInv1_16(BNWORD16 const x)
  1657. {
  1658. BNWORD16 y = x, z;
  1659. assert(x & 1);
  1660. while ((z = x*y) != 1)
  1661. y *= 2 - z;
  1662. return -y;
  1663. }
  1664. #endif /* !lbnMontInv1_16 */
  1665. #if defined(BNWORD32) && PRODUCT_SCAN
  1666. /*
  1667. * Test code for product-scanning Montgomery reduction.
  1668. * This seems to slow the C code down rather than speed it up.
  1669. *
  1670. * The first loop computes the Montgomery multipliers, storing them over
  1671. * the low half of the number n.
  1672. *
  1673. * The second half multiplies the upper half, adding in the modulus
  1674. * times the Montgomery multipliers. The results of this multiply
  1675. * are stored.
  1676. */
  1677. void
  1678. lbnMontReduce_16(BNWORD16 *n, BNWORD16 const *mod, unsigned mlen, BNWORD16 inv)
  1679. {
  1680. BNWORD32 x, y;
  1681. BNWORD16 const *pm;
  1682. BNWORD16 *pn;
  1683. BNWORD16 t;
  1684. unsigned carry;
  1685. unsigned i, j;
  1686. /* Special case of zero */
  1687. if (!mlen)
  1688. return;
  1689. /* Pass 1 - compute Montgomery multipliers */
  1690. /* First iteration can have certain simplifications. */
  1691. t = BIGLITTLE(n[-1],n[0]);
  1692. x = t;
  1693. t *= inv;
  1694. BIGLITTLE(n[-1], n[0]) = t;
  1695. x += (BNWORD32)t * BIGLITTLE(mod[-1],mod[0]); /* Can't overflow */
  1696. assert((BNWORD16)x == 0);
  1697. x = x >> 16;
  1698. for (i = 1; i < mlen; i++) {
  1699. carry = 0;
  1700. pn = n;
  1701. pm = BIGLITTLE(mod-i-1,mod+i+1);
  1702. for (j = 0; j < i; j++) {
  1703. y = (BNWORD32)BIGLITTLE(*--pn * *pm++, *pn++ * *--pm);
  1704. x += y;
  1705. carry += (x < y);
  1706. }
  1707. assert(BIGLITTLE(pn == n-i, pn == n+i));
  1708. y = t = BIGLITTLE(pn[-1], pn[0]);
  1709. x += y;
  1710. carry += (x < y);
  1711. BIGLITTLE(pn[-1], pn[0]) = t = inv * (BNWORD16)x;
  1712. assert(BIGLITTLE(pm == mod-1, pm == mod+1));
  1713. y = (BNWORD32)t * BIGLITTLE(pm[0],pm[-1]);
  1714. x += y;
  1715. carry += (x < y);
  1716. assert((BNWORD16)x == 0);
  1717. x = x >> 16 | (BNWORD32)carry << 16;
  1718. }
  1719. BIGLITTLE(n -= mlen, n += mlen);
  1720. /* Pass 2 - compute upper words and add to n */
  1721. for (i = 1; i < mlen; i++) {
  1722. carry = 0;
  1723. pm = BIGLITTLE(mod-i,mod+i);
  1724. pn = n;
  1725. for (j = i; j < mlen; j++) {
  1726. y = (BNWORD32)BIGLITTLE(*--pm * *pn++, *pm++ * *--pn);
  1727. x += y;
  1728. carry += (x < y);
  1729. }
  1730. assert(BIGLITTLE(pm == mod-mlen, pm == mod+mlen));
  1731. assert(BIGLITTLE(pn == n+mlen-i, pn == n-mlen+i));
  1732. y = t = BIGLITTLE(*(n-i),*(n+i-1));
  1733. x += y;
  1734. carry += (x < y);
  1735. BIGLITTLE(*(n-i),*(n+i-1)) = (BNWORD16)x;
  1736. x = (x >> 16) | (BNWORD32)carry << 16;
  1737. }
  1738. /* Last round of second half, simplified. */
  1739. t = BIGLITTLE(*(n-mlen),*(n+mlen-1));
  1740. x += t;
  1741. BIGLITTLE(*(n-mlen),*(n+mlen-1)) = (BNWORD16)x;
  1742. carry = (unsigned)(x >> 16);
  1743. while (carry)
  1744. carry -= lbnSubN_16(n, mod, mlen);
  1745. while (lbnCmp_16(n, mod, mlen) >= 0)
  1746. (void)lbnSubN_16(n, mod, mlen);
  1747. }
  1748. #define lbnMontReduce_16 lbnMontReduce_16
  1749. #endif
  1750. /*
  1751. * Montgomery reduce n, modulo mod. This reduces modulo mod and divides by
  1752. * 2^(16*mlen). Returns the result in the *top* mlen words of the argument n.
  1753. * This is ready for another multiplication using lbnMul_16.
  1754. *
  1755. * Montgomery representation is a very useful way to encode numbers when
  1756. * you're doing lots of modular reduction. What you do is pick a multiplier
  1757. * R which is relatively prime to the modulus and very easy to divide by.
  1758. * Since the modulus is odd, R is chosen as a power of 2, so the division
  1759. * is a shift. In fact, it's a shift of an integral number of words,
  1760. * so the shift can be implicit - just drop the low-order words.
  1761. *
  1762. * Now, choose R *larger* than the modulus m, 2^(16*mlen). Then convert
  1763. * all numbers a, b, etc. to Montgomery form M(a), M(b), etc using the
  1764. * relationship M(a) = a*R mod m, M(b) = b*R mod m, etc. Note that:
  1765. * - The Montgomery form of a number depends on the modulus m.
  1766. * A fixed modulus m is assumed throughout this discussion.
  1767. * - Since R is relatively prime to m, multiplication by R is invertible;
  1768. * no information about the numbers is lost, they're just scrambled.
  1769. * - Adding (and subtracting) numbers in this form works just as usual.
  1770. * M(a+b) = (a+b)*R mod m = (a*R + b*R) mod m = (M(a) + M(b)) mod m
  1771. * - Multiplying numbers in this form produces a*b*R*R. The problem
  1772. * is to divide out the excess factor of R, modulo m as well as to
  1773. * reduce to the given length mlen. It turns out that this can be
  1774. * done *faster* than a normal divide, which is where the speedup
  1775. * in Montgomery division comes from.
  1776. *
  1777. * Normal reduction chooses a most-significant quotient digit q and then
  1778. * subtracts q*m from the number to be reduced. Choosing q is tricky
  1779. * and involved (just look at lbnDiv_16 to see!) and is usually
  1780. * imperfect, requiring a check for correction after the subtraction.
  1781. *
  1782. * Montgomery reduction *adds* a multiple of m to the *low-order* part
  1783. * of the number to be reduced. This multiple is chosen to make the
  1784. * low-order part of the number come out to zero. This can be done
  1785. * with no trickery or error using a precomputed inverse of the modulus.
  1786. * In this code, the "part" is one word, but any width can be used.
  1787. *
  1788. * Repeating this step sufficiently often results in a value which
  1789. * is a multiple of R (a power of two, remember) but is still (since
  1790. * the additions were to the low-order part and thus did not increase
  1791. * the value of the number being reduced very much) not much
  1792. * larger than m*R. Then implicitly divide by R and subtract off
  1793. * m until the result is in the correct range.
  1794. *
  1795. * Since the low-order part being cancelled is less than R, the
  1796. * multiple of m added must have a multiplier which is at most R-1.
  1797. * Assuming that the input is at most m*R-1, the final number is
  1798. * at most m*(2*R-1)-1 = 2*m*R - m - 1, so subtracting m once from
  1799. * the high-order part, equivalent to subtracting m*R from the
  1800. * whole number, produces a result which is at most m*R - m - 1,
  1801. * which divided by R is at most m-1.
  1802. *
  1803. * To convert *to* Montgomery form, you need a regular remainder
  1804. * routine, although you can just compute R*R (mod m) and do the
  1805. * conversion using Montgomery multiplication. To convert *from*
  1806. * Montgomery form, just Montgomery reduce the number to
  1807. * remove the extra factor of R.
  1808. *
  1809. * TODO: Change to a full inverse and use Karatsuba's multiplication
  1810. * rather than this word-at-a-time.
  1811. */
  1812. #ifndef lbnMontReduce_16
  1813. void
  1814. lbnMontReduce_16(BNWORD16 *n, BNWORD16 const *mod, unsigned const mlen,
  1815. BNWORD16 inv)
  1816. {
  1817. BNWORD16 t;
  1818. BNWORD16 c = 0;
  1819. unsigned len = mlen;
  1820. /* inv must be the negative inverse of mod's least significant word */
  1821. assert((BNWORD16)(inv * BIGLITTLE(mod[-1],mod[0])) == (BNWORD16)-1);
  1822. assert(len);
  1823. do {
  1824. t = lbnMulAdd1_16(n, mod, mlen, inv * BIGLITTLE(n[-1],n[0]));
  1825. c += lbnAdd1_16(BIGLITTLE(n-mlen,n+mlen), len, t);
  1826. BIGLITTLE(--n,++n);
  1827. } while (--len);
  1828. /*
  1829. * All that adding can cause an overflow past the modulus size,
  1830. * but it's unusual, and never by much, so a subtraction loop
  1831. * is the right way to deal with it.
  1832. * This subtraction happens infrequently - I've only ever seen it
  1833. * invoked once per reduction, and then just under 22.5% of the time.
  1834. */
  1835. while (c)
  1836. c -= lbnSubN_16(n, mod, mlen);
  1837. while (lbnCmp_16(n, mod, mlen) >= 0)
  1838. (void)lbnSubN_16(n, mod, mlen);
  1839. }
  1840. #endif /* !lbnMontReduce_16 */
  1841. /*
  1842. * A couple of helpers that you might want to implement atomically
  1843. * in asm sometime.
  1844. */
  1845. #ifndef lbnMontMul_16
  1846. /*
  1847. * Multiply "num1" by "num2", modulo "mod", all of length "len", and
  1848. * place the result in the high half of "prod". "inv" is the inverse
  1849. * of the least-significant word of the modulus, modulo 2^16.
  1850. * This uses numbers in Montgomery form. Reduce using "len" and "inv".
  1851. *
  1852. * This is implemented as a macro to win on compilers that don't do
  1853. * inlining, since it's so trivial.
  1854. */
  1855. #define lbnMontMul_16(prod, n1, n2, mod, len, inv) \
  1856. (lbnMulX_16(prod, n1, n2, len), lbnMontReduce_16(prod, mod, len, inv))
  1857. #endif /* !lbnMontMul_16 */
  1858. #ifndef lbnMontSquare_16
  1859. /*
  1860. * Square "n", modulo "mod", both of length "len", and place the result
  1861. * in the high half of "prod". "inv" is the inverse of the least-significant
  1862. * word of the modulus, modulo 2^16.
  1863. * This uses numbers in Montgomery form. Reduce using "len" and "inv".
  1864. *
  1865. * This is implemented as a macro to win on compilers that don't do
  1866. * inlining, since it's so trivial.
  1867. */
  1868. #define lbnMontSquare_16(prod, n, mod, len, inv) \
  1869. (lbnSquare_16(prod, n, len), lbnMontReduce_16(prod, mod, len, inv))
  1870. #endif /* !lbnMontSquare_16 */
  1871. /*
  1872. * Convert a number to Montgomery form - requires mlen + nlen words
  1873. * of memory in "n".
  1874. */
  1875. void
  1876. lbnToMont_16(BNWORD16 *n, unsigned nlen, BNWORD16 *mod, unsigned mlen)
  1877. {
  1878. /* Move n up "mlen" words */
  1879. lbnCopy_16(BIGLITTLE(n-mlen,n+mlen), n, nlen);
  1880. lbnZero_16(n, mlen);
  1881. /* Do the division - dump the quotient in the high-order words */
  1882. (void)lbnDiv_16(BIGLITTLE(n-mlen,n+mlen), n, mlen+nlen, mod, mlen);
  1883. }
  1884. /*
  1885. * Convert from Montgomery form. Montgomery reduction is all that is
  1886. * needed.
  1887. */
  1888. void
  1889. lbnFromMont_16(BNWORD16 *n, BNWORD16 *mod, unsigned len)
  1890. {
  1891. /* Zero the high words of n */
  1892. lbnZero_16(BIGLITTLE(n-len,n+len), len);
  1893. lbnMontReduce_16(n, mod, len, lbnMontInv1_16(mod[BIGLITTLE(-1,0)]));
  1894. /* Move n down len words */
  1895. lbnCopy_16(n, BIGLITTLE(n-len,n+len), len);
  1896. }
  1897. /*
  1898. * The windowed exponentiation algorithm, precomputes a table of odd
  1899. * powers of n up to 2^k. See the comment in bnExpMod_16 below for
  1900. * an explanation of how it actually works.
  1901. *
  1902. * It takes 2^(k-1)-1 multiplies to compute the table, and (e-1)/(k+1)
  1903. * multiplies (on average) to perform the exponentiation. To minimize
  1904. * the sum, k must vary with e. The optimal window sizes vary with the
  1905. * exponent length. Here are some selected values and the boundary cases.
  1906. * (An underscore _ has been inserted into some of the numbers to ensure
  1907. * that magic strings like 16 do not appear in this table. It should be
  1908. * ignored.)
  1909. *
  1910. * At e = 1 bits, k=1 (0.000000) is best
  1911. * At e = 2 bits, k=1 (0.500000) is best
  1912. * At e = 4 bits, k=1 (1.500000) is best
  1913. * At e = 8 bits, k=2 (3.333333) < k=1 (3.500000)
  1914. * At e = 1_6 bits, k=2 (6.000000) is best
  1915. * At e = 26 bits, k=3 (9.250000) < k=2 (9.333333)
  1916. * At e = 3_2 bits, k=3 (10.750000) is best
  1917. * At e = 6_4 bits, k=3 (18.750000) is best
  1918. * At e = 82 bits, k=4 (23.200000) < k=3 (23.250000)
  1919. * At e = 128 bits, k=4 (3_2.400000) is best
  1920. * At e = 242 bits, k=5 (55.1_66667) < k=4 (55.200000)
  1921. * At e = 256 bits, k=5 (57.500000) is best
  1922. * At e = 512 bits, k=5 (100.1_66667) is best
  1923. * At e = 674 bits, k=6 (127.142857) < k=5 (127.1_66667)
  1924. * At e = 1024 bits, k=6 (177.142857) is best
  1925. * At e = 1794 bits, k=7 (287.125000) < k=6 (287.142857)
  1926. * At e = 2048 bits, k=7 (318.875000) is best
  1927. * At e = 4096 bits, k=7 (574.875000) is best
  1928. *
  1929. * The numbers in parentheses are the expected number of multiplications
  1930. * needed to do the computation. The normal russian-peasant modular
  1931. * exponentiation technique always uses (e-1)/2. For exponents as
  1932. * small as 192 bits (below the range of current factoring algorithms),
  1933. * half of the multiplies are eliminated, 45.2 as opposed to the naive
  1934. * 95.5. Counting the 191 squarings as 3/4 a multiply each (squaring
  1935. * proper is just over half of multiplying, but the Montgomery
  1936. * reduction in each case is also a multiply), that's 143.25
  1937. * multiplies, for totals of 188.45 vs. 238.75 - a 21% savings.
  1938. * For larger exponents (like 512 bits), it's 483.92 vs. 639.25, a
  1939. * 24.3% savings. It asymptotically approaches 25%.
  1940. *
  1941. * Um, actually there's a slightly more accurate way to count, which
  1942. * really is the average number of multiplies required, averaged
  1943. * uniformly over all 2^(e-1) e-bit numbers, from 2^(e-1) to (2^e)-1.
  1944. * It's based on the recurrence that for the last b bits, b <= k, at
  1945. * most one multiply is needed (and none at all 1/2^b of the time),
  1946. * while when b > k, the odds are 1/2 each way that the bit will be
  1947. * 0 (meaning no multiplies to reduce it to the b-1-bit case) and
  1948. * 1/2 that the bit will be 1, starting a k-bit window and requiring
  1949. * 1 multiply beyond the b-k-bit case. Since the most significant
  1950. * bit is always 1, a k-bit window always starts there, and that
  1951. * multiply is by 1, so it isn't a multiply at all. Thus, the
  1952. * number of multiplies is simply that needed for the last e-k bits.
  1953. * This recurrence produces:
  1954. *
  1955. * At e = 1 bits, k=1 (0.000000) is best
  1956. * At e = 2 bits, k=1 (0.500000) is best
  1957. * At e = 4 bits, k=1 (1.500000) is best
  1958. * At e = 6 bits, k=2 (2.437500) < k=1 (2.500000)
  1959. * At e = 8 bits, k=2 (3.109375) is best
  1960. * At e = 1_6 bits, k=2 (5.777771) is best
  1961. * At e = 24 bits, k=3 (8.437629) < k=2 (8.444444)
  1962. * At e = 3_2 bits, k=3 (10.437492) is best
  1963. * At e = 6_4 bits, k=3 (18.437500) is best
  1964. * At e = 81 bits, k=4 (22.6_40000) < k=3 (22.687500)
  1965. * At e = 128 bits, k=4 (3_2.040000) is best
  1966. * At e = 241 bits, k=5 (54.611111) < k=4 (54.6_40000)
  1967. * At e = 256 bits, k=5 (57.111111) is best
  1968. * At e = 512 bits, k=5 (99.777778) is best
  1969. * At e = 673 bits, k=6 (126.591837) < k=5 (126.611111)
  1970. * At e = 1024 bits, k=6 (176.734694) is best
  1971. * At e = 1793 bits, k=7 (286.578125) < k=6 (286.591837)
  1972. * At e = 2048 bits, k=7 (318.453125) is best
  1973. * At e = 4096 bits, k=7 (574.453125) is best
  1974. *
  1975. * This has the rollover points at 6, 24, 81, 241, 673 and 1793 instead
  1976. * of 8, 26, 82, 242, 674, and 1794. Not a very big difference.
  1977. * (The numbers past that are k=8 at 4609 and k=9 at 11521,
  1978. * vs. one more in each case for the approximation.)
  1979. *
  1980. * Given that exponents for which k>7 are useful are uncommon,
  1981. * a fixed size table for k <= 7 is used for simplicity.
  1982. *
  1983. * The basic number of squarings needed is e-1, although a k-bit
  1984. * window (for k > 1) can save, on average, k-2 of those, too.
  1985. * That savings currently isn't counted here. It would drive the
  1986. * crossover points slightly lower.
  1987. * (Actually, this win is also reduced in the DoubleExpMod case,
  1988. * meaning we'd have to split the tables. Except for that, the
  1989. * multiplies by powers of the two bases are independent, so
  1990. * the same logic applies to each as the single case.)
  1991. *
  1992. * Table entry i is the largest number of bits in an exponent to
  1993. * process with a window size of i+1. Entry 6 is the largest
  1994. * possible unsigned number, so the window will never be more
  1995. * than 7 bits, requiring 2^6 = 0x40 slots.
  1996. */
  1997. #define BNEXPMOD_MAX_WINDOW 7
  1998. static unsigned const bnExpModThreshTable[BNEXPMOD_MAX_WINDOW] = {
  1999. 5, 23, 80, 240, 672, 1792, (unsigned)-1
  2000. /* 7, 25, 81, 241, 673, 1793, (unsigned)-1 ### The old approximations */
  2001. };
  2002. /*
  2003. * Perform modular exponentiation, as fast as possible! This uses
  2004. * Montgomery reduction, optimized squaring, and windowed exponentiation.
  2005. * The modulus "mod" MUST be odd!
  2006. *
  2007. * This returns 0 on success, -1 on out of memory.
  2008. *
  2009. * The window algorithm:
  2010. * The idea is to keep a running product of b1 = n^(high-order bits of exp),
  2011. * and then keep appending exponent bits to it. The following patterns
  2012. * apply to a 3-bit window (k = 3):
  2013. * To append 0: square
  2014. * To append 1: square, multiply by n^1
  2015. * To append 10: square, multiply by n^1, square
  2016. * To append 11: square, square, multiply by n^3
  2017. * To append 100: square, multiply by n^1, square, square
  2018. * To append 101: square, square, square, multiply by n^5
  2019. * To append 110: square, square, multiply by n^3, square
  2020. * To append 111: square, square, square, multiply by n^7
  2021. *
  2022. * Since each pattern involves only one multiply, the longer the pattern
  2023. * the better, except that a 0 (no multiplies) can be appended directly.
  2024. * We precompute a table of odd powers of n, up to 2^k, and can then
  2025. * multiply k bits of exponent at a time. Actually, assuming random
  2026. * exponents, there is on average one zero bit between needs to
  2027. * multiply (1/2 of the time there's none, 1/4 of the time there's 1,
  2028. * 1/8 of the time, there's 2, 1/16 of the time, there's 3, etc.), so
  2029. * you have to do one multiply per k+1 bits of exponent.
  2030. *
  2031. * The loop walks down the exponent, squaring the result buffer as
  2032. * it goes. There is a wbits+1 bit lookahead buffer, buf, that is
  2033. * filled with the upcoming exponent bits. (What is read after the
  2034. * end of the exponent is unimportant, but it is filled with zero here.)
  2035. * When the most-significant bit of this buffer becomes set, i.e.
  2036. * (buf & tblmask) != 0, we have to decide what pattern to multiply
  2037. * by, and when to do it. We decide, remember to do it in future
  2038. * after a suitable number of squarings have passed (e.g. a pattern
  2039. * of "100" in the buffer requires that we multiply by n^1 immediately;
  2040. * a pattern of "110" calls for multiplying by n^3 after one more
  2041. * squaring), clear the buffer, and continue.
  2042. *
  2043. * When we start, there is one more optimization: the result buffer
  2044. * is implicitly one, so squaring it or multiplying by it can be
  2045. * optimized away. Further, if we start with a pattern like "100"
  2046. * in the lookahead window, rather than placing n into the buffer
  2047. * and then starting to square it, we have already computed n^2
  2048. * to compute the odd-powers table, so we can place that into
  2049. * the buffer and save a squaring.
  2050. *
  2051. * This means that if you have a k-bit window, to compute n^z,
  2052. * where z is the high k bits of the exponent, 1/2 of the time
  2053. * it requires no squarings. 1/4 of the time, it requires 1
  2054. * squaring, ... 1/2^(k-1) of the time, it requires k-2 squarings.
  2055. * And the remaining 1/2^(k-1) of the time, the top k bits are a
  2056. * 1 followed by k-1 0 bits, so it again only requires k-2
  2057. * squarings, not k-1. The average of these is 1. Add that
  2058. * to the one squaring we have to do to compute the table,
  2059. * and you'll see that a k-bit window saves k-2 squarings
  2060. * as well as reducing the multiplies. (It actually doesn't
  2061. * hurt in the case k = 1, either.)
  2062. *
  2063. * n must have mlen words allocated. Although fewer may be in use
  2064. * when n is passed in, all are in use on exit.
  2065. */
  2066. int
  2067. lbnExpMod_16(BNWORD16 *result, BNWORD16 const *n, unsigned nlen,
  2068. BNWORD16 const *e, unsigned elen, BNWORD16 *mod, unsigned mlen)
  2069. {
  2070. BNWORD16 *table[1 << (BNEXPMOD_MAX_WINDOW-1)];
  2071. /* Table of odd powers of n */
  2072. unsigned ebits; /* Exponent bits */
  2073. unsigned wbits; /* Window size */
  2074. unsigned tblmask; /* Mask of exponentiation window */
  2075. BNWORD16 bitpos; /* Mask of current look-ahead bit */
  2076. unsigned buf; /* Buffer of exponent bits */
  2077. unsigned multpos; /* Where to do pending multiply */
  2078. BNWORD16 const *mult; /* What to multiply by */
  2079. unsigned i; /* Loop counter */
  2080. int isone; /* Flag: accum. is implicitly one */
  2081. BNWORD16 *a, *b; /* Working buffers/accumulators */
  2082. BNWORD16 *t; /* Pointer into the working buffers */
  2083. BNWORD16 inv; /* mod^-1 modulo 2^16 */
  2084. int y; /* bnYield() result */
  2085. assert(mlen);
  2086. assert(nlen <= mlen);
  2087. /* First, a couple of trivial cases. */
  2088. elen = lbnNorm_16(e, elen);
  2089. if (!elen) {
  2090. /* x ^ 0 == 1 */
  2091. lbnZero_16(result, mlen);
  2092. BIGLITTLE(result[-1],result[0]) = 1;
  2093. return 0;
  2094. }
  2095. ebits = lbnBits_16(e, elen);
  2096. if (ebits == 1) {
  2097. /* x ^ 1 == x */
  2098. if (n != result)
  2099. lbnCopy_16(result, n, nlen);
  2100. if (mlen > nlen)
  2101. lbnZero_16(BIGLITTLE(result-nlen,result+nlen),
  2102. mlen-nlen);
  2103. return 0;
  2104. }
  2105. /* Okay, now move the exponent pointer to the most-significant word */
  2106. e = BIGLITTLE(e-elen, e+elen-1);
  2107. /* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
  2108. wbits = 0;
  2109. while (ebits > bnExpModThreshTable[wbits])
  2110. wbits++;
  2111. /* Allocate working storage: two product buffers and the tables. */
  2112. LBNALLOC(a, BNWORD16, 2*mlen);
  2113. if (!a)
  2114. return -1;
  2115. LBNALLOC(b, BNWORD16, 2*mlen);
  2116. if (!b) {
  2117. LBNFREE(a, 2*mlen);
  2118. return -1;
  2119. }
  2120. /* Convert to the appropriate table size: tblmask = 1<<(k-1) */
  2121. tblmask = 1u << wbits;
  2122. /* We have the result buffer available, so use it. */
  2123. table[0] = result;
  2124. /*
  2125. * Okay, we now have a minimal-sized table - expand it.
  2126. * This is allowed to fail! If so, scale back the table size
  2127. * and proceed.
  2128. */
  2129. for (i = 1; i < tblmask; i++) {
  2130. LBNALLOC(t, BNWORD16, mlen);
  2131. if (!t) /* Out of memory! Quit the loop. */
  2132. break;
  2133. table[i] = t;
  2134. }
  2135. /* If we stopped, with i < tblmask, shrink the tables appropriately */
  2136. while (tblmask > i) {
  2137. wbits--;
  2138. tblmask >>= 1;
  2139. }
  2140. /* Free up our overallocations */
  2141. while (--i > tblmask)
  2142. LBNFREE(table[i], mlen);
  2143. /* Okay, fill in the table */
  2144. /* Compute the necessary modular inverse */
  2145. inv = lbnMontInv1_16(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
  2146. /* Convert n to Montgomery form */
  2147. /* Move n up "mlen" words into a */
  2148. t = BIGLITTLE(a-mlen, a+mlen);
  2149. lbnCopy_16(t, n, nlen);
  2150. lbnZero_16(a, mlen);
  2151. /* Do the division - lose the quotient into the high-order words */
  2152. (void)lbnDiv_16(t, a, mlen+nlen, mod, mlen);
  2153. /* Copy into first table entry */
  2154. lbnCopy_16(table[0], a, mlen);
  2155. /* Square a into b */
  2156. lbnMontSquare_16(b, a, mod, mlen, inv);
  2157. /* Use high half of b to initialize the table */
  2158. t = BIGLITTLE(b-mlen, b+mlen);
  2159. for (i = 1; i < tblmask; i++) {
  2160. lbnMontMul_16(a, t, table[i-1], mod, mlen, inv);
  2161. lbnCopy_16(table[i], BIGLITTLE(a-mlen, a+mlen), mlen);
  2162. #if BNYIELD
  2163. if (bnYield && (y = bnYield()) < 0)
  2164. goto yield;
  2165. #endif
  2166. }
  2167. /* We might use b = n^2 later... */
  2168. /* Initialze the fetch pointer */
  2169. bitpos = (BNWORD16)1 << ((ebits-1) & (16-1)); /* Initialize mask */
  2170. /* This should point to the msbit of e */
  2171. assert((*e & bitpos) != 0);
  2172. /*
  2173. * Pre-load the window. Becuase the window size is
  2174. * never larger than the exponent size, there is no need to
  2175. * detect running off the end of e in here.
  2176. *
  2177. * The read-ahead is controlled by elen and the bitpos mask.
  2178. * Note that this is *ahead* of ebits, which tracks the
  2179. * most significant end of the window. The purpose of this
  2180. * initialization is to get the two wbits+1 bits apart,
  2181. * like they should be.
  2182. *
  2183. * Note that bitpos and e1len together keep track of the
  2184. * lookahead read pointer in the exponent that is used here.
  2185. */
  2186. buf = 0;
  2187. for (i = 0; i <= wbits; i++) {
  2188. buf = (buf << 1) | ((*e & bitpos) != 0);
  2189. bitpos >>= 1;
  2190. if (!bitpos) {
  2191. BIGLITTLE(e++,e--);
  2192. bitpos = (BNWORD16)1 << (16-1);
  2193. elen--;
  2194. }
  2195. }
  2196. assert(buf & tblmask);
  2197. /*
  2198. * Set the pending multiply positions to a location that will
  2199. * never be encountered, thus ensuring that nothing will happen
  2200. * until the need for a multiply appears and one is scheduled.
  2201. */
  2202. multpos = ebits; /* A NULL value */
  2203. mult = 0; /* Force a crash if we use these */
  2204. /*
  2205. * Okay, now begins the real work. The first step is
  2206. * slightly magic, so it's done outside the main loop,
  2207. * but it's very similar to what's inside.
  2208. */
  2209. ebits--; /* Start processing the first bit... */
  2210. isone = 1;
  2211. /*
  2212. * This is just like the multiply in the loop, except that
  2213. * - We know the msbit of buf is set, and
  2214. * - We have the extra value n^2 floating around.
  2215. * So, do the usual computation, and if the result is that
  2216. * the buffer should be multiplied by n^1 immediately
  2217. * (which we'd normally then square), we multiply it
  2218. * (which reduces to a copy, which reduces to setting a flag)
  2219. * by n^2 and skip the squaring. Thus, we do the
  2220. * multiply and the squaring in one step.
  2221. */
  2222. assert(buf & tblmask);
  2223. multpos = ebits - wbits;
  2224. while ((buf & 1) == 0) {
  2225. buf >>= 1;
  2226. multpos++;
  2227. }
  2228. /* Intermediates can wrap, but final must NOT */
  2229. assert(multpos <= ebits);
  2230. mult = table[buf>>1];
  2231. buf = 0;
  2232. /* Special case: use already-computed value sitting in buffer */
  2233. if (multpos == ebits)
  2234. isone = 0;
  2235. /*
  2236. * At this point, the buffer (which is the high half of b) holds
  2237. * either 1 (implicitly, as the "isone" flag is set), or n^2.
  2238. */
  2239. /*
  2240. * The main loop. The procedure is:
  2241. * - Advance the window
  2242. * - If the most-significant bit of the window is set,
  2243. * schedule a multiply for the appropriate time in the
  2244. * future (may be immediately)
  2245. * - Perform any pending multiples
  2246. * - Check for termination
  2247. * - Square the buffer
  2248. *
  2249. * At any given time, the acumulated product is held in
  2250. * the high half of b.
  2251. */
  2252. for (;;) {
  2253. ebits--;
  2254. /* Advance the window */
  2255. assert(buf < tblmask);
  2256. buf <<= 1;
  2257. /*
  2258. * This reads ahead of the current exponent position
  2259. * (controlled by ebits), so we have to be able to read
  2260. * past the lsb of the exponents without error.
  2261. */
  2262. if (elen) {
  2263. buf |= ((*e & bitpos) != 0);
  2264. bitpos >>= 1;
  2265. if (!bitpos) {
  2266. BIGLITTLE(e++,e--);
  2267. bitpos = (BNWORD16)1 << (16-1);
  2268. elen--;
  2269. }
  2270. }
  2271. /* Examine the window for pending multiplies */
  2272. if (buf & tblmask) {
  2273. multpos = ebits - wbits;
  2274. while ((buf & 1) == 0) {
  2275. buf >>= 1;
  2276. multpos++;
  2277. }
  2278. /* Intermediates can wrap, but final must NOT */
  2279. assert(multpos <= ebits);
  2280. mult = table[buf>>1];
  2281. buf = 0;
  2282. }
  2283. /* If we have a pending multiply, do it */
  2284. if (ebits == multpos) {
  2285. /* Multiply by the table entry remembered previously */
  2286. t = BIGLITTLE(b-mlen, b+mlen);
  2287. if (isone) {
  2288. /* Multiply by 1 is a trivial case */
  2289. lbnCopy_16(t, mult, mlen);
  2290. isone = 0;
  2291. } else {
  2292. lbnMontMul_16(a, t, mult, mod, mlen, inv);
  2293. /* Swap a and b */
  2294. t = a; a = b; b = t;
  2295. }
  2296. }
  2297. /* Are we done? */
  2298. if (!ebits)
  2299. break;
  2300. /* Square the input */
  2301. if (!isone) {
  2302. t = BIGLITTLE(b-mlen, b+mlen);
  2303. lbnMontSquare_16(a, t, mod, mlen, inv);
  2304. /* Swap a and b */
  2305. t = a; a = b; b = t;
  2306. }
  2307. #if BNYIELD
  2308. if (bnYield && (y = bnYield()) < 0)
  2309. goto yield;
  2310. #endif
  2311. } /* for (;;) */
  2312. assert(!isone);
  2313. assert(!buf);
  2314. /* DONE! */
  2315. /* Convert result out of Montgomery form */
  2316. t = BIGLITTLE(b-mlen, b+mlen);
  2317. lbnCopy_16(b, t, mlen);
  2318. lbnZero_16(t, mlen);
  2319. lbnMontReduce_16(b, mod, mlen, inv);
  2320. lbnCopy_16(result, t, mlen);
  2321. /*
  2322. * Clean up - free intermediate storage.
  2323. * Do NOT free table[0], which is the result
  2324. * buffer.
  2325. */
  2326. y = 0;
  2327. #if BNYIELD
  2328. yield:
  2329. #endif
  2330. while (--tblmask)
  2331. LBNFREE(table[tblmask], mlen);
  2332. LBNFREE(b, 2*mlen);
  2333. LBNFREE(a, 2*mlen);
  2334. return y; /* Success */
  2335. }
  #if 0
  /*
   * Compute and return n1^e1 * n2^e2 mod "mod".
   * result may be either input buffer, or something separate.
   * It must be "mlen" words long.
   *
   * Returns 0 on success, -1 if working storage cannot be allocated,
   * or the negative bnYield() result if the computation was interrupted
   * (result does not hold the answer in that case).
   *
   * There is a current position in the exponents, which is kept in e1bits.
   * (The exponents are swapped if necessary so e1 is the longer of the two.)
   * At any given time, the value in the accumulator is
   * n1^(e1>>e1bits) * n2^(e2>>e1bits) mod "mod".
   * As e1bits is counted down, this is updated, by squaring it and doing
   * any necessary multiplies.
   * To decide on the necessary multiplies, two windows, each w1bits+1 bits
   * wide, are maintained in buf1 and buf2, which read *ahead* of the
   * e1bits position (with appropriate handling of the case when e1bits
   * drops below w1bits+1).  When the most-significant bit of either window
   * becomes set, indicating that something needs to be multiplied by
   * the accumulator or it will get out of sync, the window is examined
   * to see which power of n1 or n2 to multiply by, and when (possibly
   * later, if the power is greater than 1) the multiply should take
   * place.  Then the multiply and its location are remembered and the
   * window is cleared.
   *
   * If we had every power of n1 in the table, the multiply would always
   * be w1bits steps in the future.  But we only keep the odd powers,
   * so instead of waiting w1bits squarings and then multiplying
   * by n1^k, we wait w1bits-k squarings and multiply by n1.
   *
   * Actually, w2bits can be less than w1bits, but the window is the same
   * size, to make it easier to keep track of where we're reading.  The
   * appropriate number of low-order bits of the window are just ignored.
   */
  int
  lbnDoubleExpMod_16(BNWORD16 *result,
                     BNWORD16 const *n1, unsigned n1len,
                     BNWORD16 const *e1, unsigned e1len,
                     BNWORD16 const *n2, unsigned n2len,
                     BNWORD16 const *e2, unsigned e2len,
                     BNWORD16 *mod, unsigned mlen)
  {
      BNWORD16 *table1[1 << (BNEXPMOD_MAX_WINDOW-1)];
                                      /* Table of odd powers of n1 */
      BNWORD16 *table2[1 << (BNEXPMOD_MAX_WINDOW-1)];
                                      /* Table of odd powers of n2 */
      unsigned e1bits, e2bits;        /* Exponent bits */
      unsigned w1bits, w2bits;        /* Window sizes */
      unsigned tblmask;               /* Mask of exponentiation window */
      BNWORD16 bitpos;                /* Mask of current look-ahead bit */
      unsigned buf1, buf2;            /* Buffer of exponent bits */
      unsigned mult1pos, mult2pos;    /* Where to do pending multiply */
      BNWORD16 const *mult1, *mult2;  /* What to multiply by */
      unsigned i;                     /* Loop counter */
      int isone;                      /* Flag: accum. is implicitly one */
      BNWORD16 *a, *b;                /* Working buffers/accumulators */
      BNWORD16 *t;                    /* Pointer into the working buffers */
      BNWORD16 inv;                   /* mod^-1 modulo 2^16 */
      int y;                          /* bnYield() result */

      assert(mlen);
      assert(n1len <= mlen);
      assert(n2len <= mlen);

      /* First, a couple of trivial cases. */
      e1len = lbnNorm_16(e1, e1len);
      e2len = lbnNorm_16(e2, e2len);

      /* Ensure that the first exponent is the longer */
      e1bits = lbnBits_16(e1, e1len);
      e2bits = lbnBits_16(e2, e2len);

      if (e1bits < e2bits) {
          BNWORD16 const *ct;     /* const-correct swap temporary */

          i = e1len; e1len = e2len; e2len = i;
          i = e1bits; e1bits = e2bits; e2bits = i;
          /* The base lengths must follow the base pointers */
          i = n1len; n1len = n2len; n2len = i;
          ct = n1; n1 = n2; n2 = ct;
          ct = e1; e1 = e2; e2 = ct;
      }
      assert(e1bits >= e2bits);

      /* Handle a trivial case */
      if (!e2len)
          return lbnExpMod_16(result, n1, n1len, e1, e1len, mod, mlen);
      assert(e2bits);

      /* The code below breaks if the exponents aren't at least 2 bits */
      if (e1bits == 1) {
          unsigned plen = n1len + n2len;  /* Words in the raw product */

          assert(e2bits == 1);

          LBNALLOC(a, BNWORD16, plen);
          if (!a)
              return -1;

          lbnMul_16(a, n1, n1len, n2, n2len);
          if (plen >= mlen) {
              /*
               * Do a direct modular reduction.  The quotient goes
               * to the words above the low mlen; the remainder is
               * left in the low mlen words of a.
               */
              (void)lbnDiv_16(BIGLITTLE(a-mlen,a+mlen), a, plen,
                              mod, mlen);
              lbnCopy_16(result, a, mlen);
          } else {
              /*
               * The product has fewer words than the modulus, so
               * only plen words exist to copy; zero-extend the rest.
               * As before, no reduction is done in this case.
               */
              lbnCopy_16(result, a, plen);
              lbnZero_16(BIGLITTLE(result-plen,result+plen),
                         mlen-plen);
          }
          LBNFREE(a, plen);
          return 0;
      }

      /* Okay, now move the exponent pointers to the most-significant word */
      e1 = BIGLITTLE(e1-e1len, e1+e1len-1);
      e2 = BIGLITTLE(e2-e2len, e2+e2len-1);

      /* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
      w1bits = 0;
      while (e1bits > bnExpModThreshTable[w1bits])
          w1bits++;
      w2bits = 0;
      while (e2bits > bnExpModThreshTable[w2bits])
          w2bits++;

      assert(w1bits >= w2bits);

      /* Allocate working storage: two product buffers and the tables. */
      LBNALLOC(a, BNWORD16, 2*mlen);
      if (!a)
          return -1;
      LBNALLOC(b, BNWORD16, 2*mlen);
      if (!b) {
          LBNFREE(a, 2*mlen);
          return -1;
      }

      /* Convert to the appropriate table size: tblmask = 1<<(k-1) */
      tblmask = 1u << w1bits;
      /* Use buf2 for its size, temporarily */
      buf2 = 1u << w2bits;

      LBNALLOC(t, BNWORD16, mlen);
      if (!t) {
          LBNFREE(b, 2*mlen);
          LBNFREE(a, 2*mlen);
          return -1;
      }
      table1[0] = t;
      table2[0] = result;

      /*
       * Okay, we now have some minimal-sized tables - expand them.
       * This is allowed to fail!  If so, scale back the table sizes
       * and proceed.  We allocate both tables at the same time
       * so if it fails partway through, they'll both be a reasonable
       * size rather than one huge and one tiny.
       * When i passes buf2 (the number of entries in the e2 window,
       * which may be less than the number of entries in the e1 window),
       * stop allocating e2 space.
       */
      for (i = 1; i < tblmask; i++) {
          LBNALLOC(t, BNWORD16, mlen);
          if (!t) /* Out of memory!  Quit the loop. */
              break;
          table1[i] = t;
          if (i < buf2) {
              LBNALLOC(t, BNWORD16, mlen);
              if (!t) {
                  LBNFREE(table1[i], mlen);
                  break;
              }
              table2[i] = t;
          }
      }

      /* If we stopped, with i < tblmask, shrink the tables appropriately */
      while (tblmask > i) {
          w1bits--;
          tblmask >>= 1;
      }

      /*
       * Free up our overallocations: every entry in tblmask..i-1,
       * including index tblmask itself, no longer fits the table.
       */
      while (i > tblmask) {
          --i;
          if (i < buf2)
              LBNFREE(table2[i], mlen);
          LBNFREE(table1[i], mlen);
      }

      /* And shrink the second window too, if needed */
      if (w2bits > w1bits) {
          w2bits = w1bits;
          buf2 = tblmask;
      }

      /*
       * From now on, use the w2bits variable for the difference
       * between w1bits and w2bits.
       */
      w2bits = w1bits-w2bits;

      /* Okay, fill in the tables */

      /* Compute the necessary modular inverse */
      inv = lbnMontInv1_16(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */

      /* Convert n1 to Montgomery form */

      /* Move n1 up "mlen" words into a */
      t = BIGLITTLE(a-mlen, a+mlen);
      lbnCopy_16(t, n1, n1len);
      lbnZero_16(a, mlen);
      /* Do the division - lose the quotient into the high-order words */
      (void)lbnDiv_16(t, a, mlen+n1len, mod, mlen);
      /* Copy into first table entry */
      lbnCopy_16(table1[0], a, mlen);

      /* Square a into b */
      lbnMontSquare_16(b, a, mod, mlen, inv);

      /* Use high half of b to initialize the first table */
      t = BIGLITTLE(b-mlen, b+mlen);
      for (i = 1; i < tblmask; i++) {
          lbnMontMul_16(a, t, table1[i-1], mod, mlen, inv);
          lbnCopy_16(table1[i], BIGLITTLE(a-mlen, a+mlen), mlen);
  #if BNYIELD
          if (bnYield && (y = bnYield()) < 0)
              goto yield;
  #endif
      }

      /* Convert n2 to Montgomery form */

      t = BIGLITTLE(a-mlen, a+mlen);
      /* Move n2 up "mlen" words into a */
      lbnCopy_16(t, n2, n2len);
      lbnZero_16(a, mlen);
      /* Do the division - lose the quotient into the high-order words */
      (void)lbnDiv_16(t, a, mlen+n2len, mod, mlen);
      /* Copy into first table entry */
      lbnCopy_16(table2[0], a, mlen);

      /* Square it into a */
      lbnMontSquare_16(a, table2[0], mod, mlen, inv);
      /* Copy to b, low half */
      lbnCopy_16(b, t, mlen);

      /* Use b to initialize the second table */
      for (i = 1; i < buf2; i++) {
          lbnMontMul_16(a, b, table2[i-1], mod, mlen, inv);
          lbnCopy_16(table2[i], t, mlen);
  #if BNYIELD
          if (bnYield && (y = bnYield()) < 0)
              goto yield;
  #endif
      }

      /*
       * Okay, a recap: at this point, the low part of b holds
       * n2^2, the high part holds n1^2, and the tables are
       * initialized with the odd powers of n1 and n2 from 1
       * through 2*tblmask-1 and 2*buf2-1.
       *
       * We might use those squares in b later, or we might not.
       */

      /* Initialize the fetch pointer */
      bitpos = (BNWORD16)1 << ((e1bits-1) & (16-1)); /* Initialize mask */

      /* This should point to the msbit of e1 */
      assert((*e1 & bitpos) != 0);

      /*
       * Pre-load the windows.  Because the window size is
       * never larger than the exponent size, there is no need to
       * detect running off the end of e1 in here.
       *
       * The read-ahead is controlled by e1len and the bitpos mask.
       * Note that this is *ahead* of e1bits, which tracks the
       * most significant end of the window.  The purpose of this
       * initialization is to get the two w1bits+1 bits apart,
       * like they should be.
       *
       * Note that bitpos and e1len together keep track of the
       * lookahead read pointer in the exponent that is used here.
       * e2len is not decremented, it is only ever compared with
       * e1len as *that* is decremented.
       */
      buf1 = buf2 = 0;
      for (i = 0; i <= w1bits; i++) {
          buf1 = (buf1 << 1) | ((*e1 & bitpos) != 0);
          if (e1len <= e2len)
              buf2 = (buf2 << 1) | ((*e2 & bitpos) != 0);
          bitpos >>= 1;
          if (!bitpos) {
              BIGLITTLE(e1++,e1--);
              if (e1len <= e2len)
                  BIGLITTLE(e2++,e2--);
              bitpos = (BNWORD16)1 << (16-1);
              e1len--;
          }
      }
      assert(buf1 & tblmask);

      /*
       * Set the pending multiply positions to a location that will
       * never be encountered, thus ensuring that nothing will happen
       * until the need for a multiply appears and one is scheduled.
       */
      mult1pos = mult2pos = e1bits;   /* A NULL value */
      mult1 = mult2 = 0;              /* Force a crash if we use these */

      /*
       * Okay, now begins the real work.  The first step is
       * slightly magic, so it's done outside the main loop,
       * but it's very similar to what's inside.
       */
      isone = 1;  /* Buffer is implicitly 1, so replace * by copy */
      e1bits--;   /* Start processing the first bit... */

      /*
       * This is just like the multiply in the loop, except that
       * - We know the msbit of buf1 is set, and
       * - We have the extra value n1^2 floating around.
       * So, do the usual computation, and if the result is that
       * the buffer should be multiplied by n1^1 immediately
       * (which we'd normally then square), we multiply it
       * (which reduces to a copy, which reduces to setting a flag)
       * by n1^2 and skip the squaring.  Thus, we do the
       * multiply and the squaring in one step.
       */
      assert(buf1 & tblmask);
      mult1pos = e1bits - w1bits;
      while ((buf1 & 1) == 0) {
          buf1 >>= 1;
          mult1pos++;
      }
      /* Intermediates can wrap, but final must NOT */
      assert(mult1pos <= e1bits);
      mult1 = table1[buf1>>1];
      buf1 = 0;

      /* Special case: use already-computed value sitting in buffer */
      if (mult1pos == e1bits)
          isone = 0;

      /*
       * The first multiply by a power of n2.  Similar, but
       * we might not even want to schedule a multiply if e2 is
       * shorter than e1, and the window might be shorter so
       * we have to leave the low w2bits bits alone.
       */
      if (buf2 & tblmask) {
          /* Remember low-order bits for later */
          i = buf2 & ((1u << w2bits) - 1);
          buf2 >>= w2bits;
          mult2pos = e1bits - w1bits + w2bits;
          while ((buf2 & 1) == 0) {
              buf2 >>= 1;
              mult2pos++;
          }
          assert(mult2pos <= e1bits);
          mult2 = table2[buf2>>1];
          buf2 = i;

          if (mult2pos == e1bits) {
              t = BIGLITTLE(b-mlen, b+mlen);
              if (isone) {
                  lbnCopy_16(t, b, mlen); /* Copy low to high */
                  isone = 0;
              } else {
                  lbnMontMul_16(a, t, b, mod, mlen, inv);
                  t = a; a = b; b = t;
              }
          }
      }

      /*
       * At this point, the buffer (which is the high half of b)
       * holds either 1 (implicitly, as the "isone" flag is set),
       * n1^2, n2^2 or n1^2 * n2^2.
       */

      /*
       * The main loop.  The procedure is:
       * - Advance the windows
       * - If the most-significant bit of a window is set,
       *   schedule a multiply for the appropriate time in the
       *   future (may be immediately)
       * - Perform any pending multiplies
       * - Check for termination
       * - Square the buffers
       *
       * At any given time, the accumulated product is held in
       * the high half of b.
       */
      for (;;) {
          e1bits--;

          /* Advance the windows */
          assert(buf1 < tblmask);
          buf1 <<= 1;
          assert(buf2 < tblmask);
          buf2 <<= 1;
          /*
           * This reads ahead of the current exponent position
           * (controlled by e1bits), so we have to be able to read
           * past the lsb of the exponents without error.
           */
          if (e1len) {
              buf1 |= ((*e1 & bitpos) != 0);
              if (e1len <= e2len)
                  buf2 |= ((*e2 & bitpos) != 0);
              bitpos >>= 1;
              if (!bitpos) {
                  BIGLITTLE(e1++,e1--);
                  if (e1len <= e2len)
                      BIGLITTLE(e2++,e2--);
                  bitpos = (BNWORD16)1 << (16-1);
                  e1len--;
              }
          }

          /* Examine the first window for pending multiplies */
          if (buf1 & tblmask) {
              mult1pos = e1bits - w1bits;
              while ((buf1 & 1) == 0) {
                  buf1 >>= 1;
                  mult1pos++;
              }
              /* Intermediates can wrap, but final must NOT */
              assert(mult1pos <= e1bits);
              mult1 = table1[buf1>>1];
              buf1 = 0;
          }

          /*
           * Examine the second window for pending multiplies.
           * Window 2 can be smaller than window 1, but we
           * keep the same number of bits in buf2, so we need
           * to ignore any low-order bits in the buffer when
           * computing what to multiply by, and recompute them
           * later.
           */
          if (buf2 & tblmask) {
              /* Remember low-order bits for later */
              i = buf2 & ((1u << w2bits) - 1);
              buf2 >>= w2bits;
              mult2pos = e1bits - w1bits + w2bits;
              while ((buf2 & 1) == 0) {
                  buf2 >>= 1;
                  mult2pos++;
              }
              assert(mult2pos <= e1bits);
              mult2 = table2[buf2>>1];
              buf2 = i;
          }

          /* If we have a pending multiply for e1, do it */
          if (e1bits == mult1pos) {
              /* Multiply by the table entry remembered previously */
              t = BIGLITTLE(b-mlen, b+mlen);
              if (isone) {
                  /* Multiply by 1 is a trivial case */
                  lbnCopy_16(t, mult1, mlen);
                  isone = 0;
              } else {
                  lbnMontMul_16(a, t, mult1, mod, mlen, inv);
                  /* Swap a and b */
                  t = a; a = b; b = t;
              }
          }

          /* If we have a pending multiply for e2, do it */
          if (e1bits == mult2pos) {
              /* Multiply by the table entry remembered previously */
              t = BIGLITTLE(b-mlen, b+mlen);
              if (isone) {
                  /* Multiply by 1 is a trivial case */
                  lbnCopy_16(t, mult2, mlen);
                  isone = 0;
              } else {
                  lbnMontMul_16(a, t, mult2, mod, mlen, inv);
                  /* Swap a and b */
                  t = a; a = b; b = t;
              }
          }

          /* Are we done? */
          if (!e1bits)
              break;

          /* Square the buffer */
          if (!isone) {
              t = BIGLITTLE(b-mlen, b+mlen);
              lbnMontSquare_16(a, t, mod, mlen, inv);
              /* Swap a and b */
              t = a; a = b; b = t;
          }
  #if BNYIELD
          if (bnYield && (y = bnYield()) < 0)
              goto yield;
  #endif
      } /* for (;;) */

      assert(!isone);
      assert(!buf1);
      assert(!buf2);

      /* DONE! */

      /* Convert result out of Montgomery form */
      t = BIGLITTLE(b-mlen, b+mlen);
      lbnCopy_16(b, t, mlen);
      lbnZero_16(t, mlen);
      lbnMontReduce_16(b, mod, mlen, inv);
      lbnCopy_16(result, t, mlen);

      /* Clean up - free intermediate storage */
      y = 0;
  #if BNYIELD
  yield:
  #endif
      /* w2bits is the window-size difference here, so this is table2's size */
      buf2 = tblmask >> w2bits;
      while (--tblmask) {
          if (tblmask < buf2)
              LBNFREE(table2[tblmask], mlen);
          LBNFREE(table1[tblmask], mlen);
      }
      /* table1[0] was allocated here; table2[0] is the caller's result */
      t = table1[0];
      LBNFREE(t, mlen);
      LBNFREE(b, 2*mlen);
      LBNFREE(a, 2*mlen);

      return y;   /* 0 on success, negative bnYield() result otherwise */
  }
  #endif
  2807. /*
  2808. * 2^exp (mod mod). This is an optimized version for use in Fermat
  2809. * tests. The input value of n is ignored; it is returned with
  2810. * "mlen" words valid.
  2811. */
  2812. int
  2813. lbnTwoExpMod_16(BNWORD16 *n, BNWORD16 const *exp, unsigned elen,
  2814. BNWORD16 *mod, unsigned mlen)
  2815. {
  2816. unsigned e; /* Copy of high words of the exponent */
  2817. unsigned bits; /* Assorted counter of bits */
  2818. BNWORD16 const *bitptr;
  2819. BNWORD16 bitword, bitpos;
  2820. BNWORD16 *a, *b, *a1;
  2821. BNWORD16 inv;
  2822. int y; /* Result of bnYield() */
  2823. assert(mlen);
  2824. bitptr = BIGLITTLE(exp-elen, exp+elen-1);
  2825. bitword = *bitptr;
  2826. assert(bitword);
  2827. /* Clear n for future use. */
  2828. lbnZero_16(n, mlen);
  2829. bits = lbnBits_16(exp, elen);
  2830. /* First, a couple of trivial cases. */
  2831. if (bits <= 1) {
  2832. /* 2 ^ 0 == 1, 2 ^ 1 == 2 */
  2833. BIGLITTLE(n[-1],n[0]) = (BNWORD16)1<<elen;
  2834. return 0;
  2835. }
  2836. /* Set bitpos to the most significant bit */
  2837. bitpos = (BNWORD16)1 << ((bits-1) & (16-1));
  2838. /* Now, count the bits in the modulus. */
  2839. bits = lbnBits_16(mod, mlen);
  2840. assert(bits > 1); /* a 1-bit modulus is just stupid... */
  2841. /*
  2842. * We start with 1<<e, where "e" is as many high bits of the
  2843. * exponent as we can manage without going over the modulus.
  2844. * This first loop finds "e".
  2845. */
  2846. e = 1;
  2847. while (elen) {
  2848. /* Consume the first bit */
  2849. bitpos >>= 1;
  2850. if (!bitpos) {
  2851. if (!--elen)
  2852. break;
  2853. bitword = BIGLITTLE(*++bitptr,*--bitptr);
  2854. bitpos = (BNWORD16)1<<(16-1);
  2855. }
  2856. e = (e << 1) | ((bitpos & bitword) != 0);
  2857. if (e >= bits) { /* Overflow! Back out. */
  2858. e >>= 1;
  2859. break;
  2860. }
  2861. }
  2862. /*
  2863. * The bit in "bitpos" being examined by the bit buffer has NOT
  2864. * been consumed yet. This may be past the end of the exponent,
  2865. * in which case elen == 1.
  2866. */
  2867. /* Okay, now, set bit "e" in n. n is already zero. */
  2868. inv = (BNWORD16)1 << (e & (16-1));
  2869. e /= 16;
  2870. BIGLITTLE(n[-e-1],n[e]) = inv;
  2871. /*
  2872. * The effective length of n in words is now "e+1".
  2873. * This is used a little bit later.
  2874. */
  2875. if (!elen)
  2876. return 0; /* That was easy! */
  2877. /*
  2878. * We have now processed the first few bits. The next step
  2879. * is to convert this to Montgomery form for further squaring.
  2880. */
  2881. /* Allocate working storage: two product buffers */
  2882. LBNALLOC(a, BNWORD16, 2*mlen);
  2883. if (!a)
  2884. return -1;
  2885. LBNALLOC(b, BNWORD16, 2*mlen);
  2886. if (!b) {
  2887. LBNFREE(a, 2*mlen);
  2888. return -1;
  2889. }
  2890. /* Convert n to Montgomery form */
  2891. inv = BIGLITTLE(mod[-1],mod[0]); /* LSW of modulus */
  2892. assert(inv & 1); /* Modulus must be odd */
  2893. inv = lbnMontInv1_16(inv);
  2894. /* Move n (length e+1, remember?) up "mlen" words into b */
  2895. /* Note that we lie about a1 for a bit - it's pointing to b */
  2896. a1 = BIGLITTLE(b-mlen,b+mlen);
  2897. lbnCopy_16(a1, n, e+1);
  2898. lbnZero_16(b, mlen);
  2899. /* Do the division - dump the quotient into the high-order words */
  2900. (void)lbnDiv_16(a1, b, mlen+e+1, mod, mlen);
  2901. /*
  2902. * Now do the first squaring and modular reduction to put
  2903. * the number up in a1 where it belongs.
  2904. */
  2905. lbnMontSquare_16(a, b, mod, mlen, inv);
  2906. /* Fix up a1 to point to where it should go. */
  2907. a1 = BIGLITTLE(a-mlen,a+mlen);
  2908. /*
  2909. * Okay, now, a1 holds the number being accumulated, and
  2910. * b is a scratch register. Start working:
  2911. */
  2912. for (;;) {
  2913. /*
  2914. * Is the bit set? If so, double a1 as well.
  2915. * A modular doubling like this is very cheap.
  2916. */
  2917. if (bitpos & bitword) {
  2918. /*
  2919. * Double the number. If there was a carry out OR
  2920. * the result is greater than the modulus, subtract
  2921. * the modulus.
  2922. */
  2923. if (lbnDouble_16(a1, mlen) ||
  2924. lbnCmp_16(a1, mod, mlen) > 0)
  2925. (void)lbnSubN_16(a1, mod, mlen);
  2926. }
  2927. /* Advance to the next exponent bit */
  2928. bitpos >>= 1;
  2929. if (!bitpos) {
  2930. if (!--elen)
  2931. break; /* Done! */
  2932. bitword = BIGLITTLE(*++bitptr,*--bitptr);
  2933. bitpos = (BNWORD16)1<<(16-1);
  2934. }
  2935. /*
  2936. * The elen/bitword/bitpos bit buffer is known to be
  2937. * non-empty, i.e. there is at least one more unconsumed bit.
  2938. * Thus, it's safe to square the number.
  2939. */
  2940. lbnMontSquare_16(b, a1, mod, mlen, inv);
  2941. /* Rename result (in b) back to a (a1, really). */
  2942. a1 = b; b = a; a = a1;
  2943. a1 = BIGLITTLE(a-mlen,a+mlen);
  2944. #if BNYIELD
  2945. if (bnYield && (y = bnYield()) < 0)
  2946. goto yield;
  2947. #endif
  2948. }
  2949. /* DONE! Just a little bit of cleanup... */
  2950. /*
  2951. * Convert result out of Montgomery form... this is
  2952. * just a Montgomery reduction.
  2953. */
  2954. lbnCopy_16(a, a1, mlen);
  2955. lbnZero_16(a1, mlen);
  2956. lbnMontReduce_16(a, mod, mlen, inv);
  2957. lbnCopy_16(n, a1, mlen);
  2958. /* Clean up - free intermediate storage */
  2959. y = 0;
  2960. #if BNYIELD
  2961. yield:
  2962. #endif
  2963. LBNFREE(b, 2*mlen);
  2964. LBNFREE(a, 2*mlen);
  2965. return y; /* Success */
  2966. }
  2967. /*
  2968. * Returns a substring of the big-endian array of bytes representation
  2969. * of the bignum array based on two parameters, the least significant
  2970. * byte number (0 to start with the least significant byte) and the
  2971. * length. I.e. the number returned is a representation of
  2972. * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen).
  2973. *
  2974. * It is an error if the bignum is not at least buflen + lsbyte bytes
  2975. * long.
  2976. *
  2977. * This code assumes that the compiler has the minimal intelligence
  2978. * neded to optimize divides and modulo operations on an unsigned data
  2979. * type with a power of two.
  2980. */
  2981. void
  2982. lbnExtractBigBytes_16(BNWORD16 const *n, unsigned char *buf,
  2983. unsigned lsbyte, unsigned buflen)
  2984. {
  2985. BNWORD16 t = 0; /* Needed to shut up uninitialized var warnings */
  2986. unsigned shift;
  2987. lsbyte += buflen;
  2988. shift = (8 * lsbyte) % 16;
  2989. lsbyte /= (16/8); /* Convert to word offset */
  2990. BIGLITTLE(n -= lsbyte, n += lsbyte);
  2991. if (shift)
  2992. t = BIGLITTLE(n[-1],n[0]);
  2993. while (buflen--) {
  2994. if (!shift) {
  2995. t = BIGLITTLE(*n++,*--n);
  2996. shift = 16;
  2997. }
  2998. shift -= 8;
  2999. *buf++ = (unsigned char)(t>>shift);
  3000. }
  3001. }
  3002. /*
  3003. * Merge a big-endian array of bytes into a bignum array.
  3004. * The array had better be big enough. This is
  3005. * equivalent to extracting the entire bignum into a
  3006. * large byte array, copying the input buffer into the
  3007. * middle of it, and converting back to a bignum.
  3008. *
  3009. * The buf is "len" bytes long, and its *last* byte is at
  3010. * position "lsbyte" from the end of the bignum.
  3011. *
  3012. * Note that this is a pain to get right. Fortunately, it's hardly
  3013. * critical for efficiency.
  3014. */
  3015. void
  3016. lbnInsertBigBytes_16(BNWORD16 *n, unsigned char const *buf,
  3017. unsigned lsbyte, unsigned buflen)
  3018. {
  3019. BNWORD16 t = 0; /* Shut up uninitialized varibale warnings */
  3020. lsbyte += buflen;
  3021. BIGLITTLE(n -= lsbyte/(16/8), n += lsbyte/(16/8));
  3022. /* Load up leading odd bytes */
  3023. if (lsbyte % (16/8)) {
  3024. t = BIGLITTLE(*--n,*n++);
  3025. t >>= (lsbyte * 8) % 16;
  3026. }
  3027. /* The main loop - merge into t, storing at each word boundary. */
  3028. while (buflen--) {
  3029. t = (t << 8) | *buf++;
  3030. if ((--lsbyte % (16/8)) == 0)
  3031. BIGLITTLE(*n++,*--n) = t;
  3032. }
  3033. /* Merge odd bytes in t into last word */
  3034. lsbyte = (lsbyte * 8) % 16;
  3035. if (lsbyte) {
  3036. t <<= lsbyte;
  3037. t |= (((BNWORD16)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
  3038. BIGLITTLE(n[0],n[-1]) = t;
  3039. }
  3040. return;
  3041. }
  3042. /*
  3043. * Returns a substring of the little-endian array of bytes representation
  3044. * of the bignum array based on two parameters, the least significant
  3045. * byte number (0 to start with the least significant byte) and the
  3046. * length. I.e. the number returned is a representation of
  3047. * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen).
  3048. *
  3049. * It is an error if the bignum is not at least buflen + lsbyte bytes
  3050. * long.
  3051. *
  3052. * This code assumes that the compiler has the minimal intelligence
  3053. * neded to optimize divides and modulo operations on an unsigned data
  3054. * type with a power of two.
  3055. */
  3056. void
  3057. lbnExtractLittleBytes_16(BNWORD16 const *n, unsigned char *buf,
  3058. unsigned lsbyte, unsigned buflen)
  3059. {
  3060. BNWORD16 t = 0; /* Needed to shut up uninitialized var warnings */
  3061. BIGLITTLE(n -= lsbyte/(16/8), n += lsbyte/(16/8));
  3062. if (lsbyte % (16/8)) {
  3063. t = BIGLITTLE(*--n,*n++);
  3064. t >>= (lsbyte % (16/8)) * 8 ;
  3065. }
  3066. while (buflen--) {
  3067. if ((lsbyte++ % (16/8)) == 0)
  3068. t = BIGLITTLE(*--n,*n++);
  3069. *buf++ = (unsigned char)t;
  3070. t >>= 8;
  3071. }
  3072. }
  3073. /*
  3074. * Merge a little-endian array of bytes into a bignum array.
  3075. * The array had better be big enough. This is
  3076. * equivalent to extracting the entire bignum into a
  3077. * large byte array, copying the input buffer into the
  3078. * middle of it, and converting back to a bignum.
  3079. *
  3080. * The buf is "len" bytes long, and its first byte is at
  3081. * position "lsbyte" from the end of the bignum.
  3082. *
  3083. * Note that this is a pain to get right. Fortunately, it's hardly
  3084. * critical for efficiency.
  3085. */
  3086. void
  3087. lbnInsertLittleBytes_16(BNWORD16 *n, unsigned char const *buf,
  3088. unsigned lsbyte, unsigned buflen)
  3089. {
  3090. BNWORD16 t = 0; /* Shut up uninitialized varibale warnings */
  3091. /* Move to most-significant end */
  3092. lsbyte += buflen;
  3093. buf += buflen;
  3094. BIGLITTLE(n -= lsbyte/(16/8), n += lsbyte/(16/8));
  3095. /* Load up leading odd bytes */
  3096. if (lsbyte % (16/8)) {
  3097. t = BIGLITTLE(*--n,*n++);
  3098. t >>= (lsbyte * 8) % 16;
  3099. }
  3100. /* The main loop - merge into t, storing at each word boundary. */
  3101. while (buflen--) {
  3102. t = (t << 8) | *--buf;
  3103. if ((--lsbyte % (16/8)) == 0)
  3104. BIGLITTLE(*n++,*--n) = t;
  3105. }
  3106. /* Merge odd bytes in t into last word */
  3107. lsbyte = (lsbyte * 8) % 16;
  3108. if (lsbyte) {
  3109. t <<= lsbyte;
  3110. t |= (((BNWORD16)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
  3111. BIGLITTLE(n[0],n[-1]) = t;
  3112. }
  3113. return;
  3114. }
  3115. #ifdef DEADCODE /* This was a precursor to the more flexible lbnExtractBytes */
  3116. /*
  3117. * Convert a big-endian array of bytes to a bignum.
  3118. * Returns the number of words in the bignum.
  3119. * Note the expression "16/8" for the number of bytes per word.
  3120. * This is so the word-size adjustment will work.
  3121. */
  3122. unsigned
  3123. lbnFromBytes_16(BNWORD16 *a, unsigned char const *b, unsigned blen)
  3124. {
  3125. BNWORD16 t;
  3126. unsigned alen = (blen + (16/8-1))/(16/8);
  3127. BIGLITTLE(a -= alen, a += alen);
  3128. while (blen) {
  3129. t = 0;
  3130. do {
  3131. t = t << 8 | *b++;
  3132. } while (--blen & (16/8-1));
  3133. BIGLITTLE(*a++,*--a) = t;
  3134. }
  3135. return alen;
  3136. }
  3137. #endif
  3138. #if 0
  3139. /*
  3140. * Computes the GCD of a and b. Modifies both arguments; when it returns,
  3141. * one of them is the GCD and the other is trash. The return value
  3142. * indicates which: 0 for a, and 1 for b. The length of the retult is
  3143. * returned in rlen. Both inputs must have one extra word of precision.
  3144. * alen must be >= blen.
  3145. *
  3146. * TODO: use the binary algorithm (Knuth section 4.5.2, algorithm B).
  3147. * This is based on taking out common powers of 2, then repeatedly:
  3148. * gcd(2*u,v) = gcd(u,2*v) = gcd(u,v) - isolated powers of 2 can be deleted.
  3149. * gcd(u,v) = gcd(u-v,v) - the numbers can be easily reduced.
  3150. * It gets less reduction per step, but the steps are much faster than
  3151. * the division case.
  3152. */
  3153. int
  3154. lbnGcd_16(BNWORD16 *a, unsigned alen, BNWORD16 *b, unsigned blen,
  3155. unsigned *rlen)
  3156. {
  3157. #if BNYIELD
  3158. int y;
  3159. #endif
  3160. assert(alen >= blen);
  3161. while (blen != 0) {
  3162. (void)lbnDiv_16(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
  3163. alen = lbnNorm_16(a, blen);
  3164. if (alen == 0) {
  3165. *rlen = blen;
  3166. return 1;
  3167. }
  3168. (void)lbnDiv_16(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
  3169. blen = lbnNorm_16(b, alen);
  3170. #if BNYIELD
  3171. if (bnYield && (y = bnYield()) < 0)
  3172. return y;
  3173. #endif
  3174. }
  3175. *rlen = alen;
  3176. return 0;
  3177. }
  3178. /*
  3179. * Invert "a" modulo "mod" using the extended Euclidean algorithm.
  3180. * Note that this only computes one of the cosequences, and uses the
  3181. * theorem that the signs flip every step and the absolute value of
  3182. * the cosequence values are always bounded by the modulus to avoid
  3183. * having to work with negative numbers.
  3184. * gcd(a,mod) had better equal 1. Returns 1 if the GCD is NOT 1.
  3185. * a must be one word longer than "mod". It is overwritten with the
  3186. * result.
  3187. * TODO: Use Richard Schroeppel's *much* faster algorithm.
  3188. */
  3189. int
  3190. lbnInv_16(BNWORD16 *a, unsigned alen, BNWORD16 const *mod, unsigned mlen)
  3191. {
  3192. BNWORD16 *b; /* Hold a copy of mod during GCD reduction */
  3193. BNWORD16 *p; /* Temporary for products added to t0 and t1 */
  3194. BNWORD16 *t0, *t1; /* Inverse accumulators */
  3195. BNWORD16 cy;
  3196. unsigned blen, t0len, t1len, plen;
  3197. int y;
  3198. alen = lbnNorm_16(a, alen);
  3199. if (!alen)
  3200. return 1; /* No inverse */
  3201. mlen = lbnNorm_16(mod, mlen);
  3202. assert (alen <= mlen);
  3203. /* Inverse of 1 is 1 */
  3204. if (alen == 1 && BIGLITTLE(a[-1],a[0]) == 1) {
  3205. lbnZero_16(BIGLITTLE(a-alen,a+alen), mlen-alen);
  3206. return 0;
  3207. }
  3208. /* Allocate a pile of space */
  3209. LBNALLOC(b, BNWORD16, mlen+1);
  3210. if (b) {
  3211. /*
  3212. * Although products are guaranteed to always be less than the
  3213. * modulus, it can involve multiplying two 3-word numbers to
  3214. * get a 5-word result, requiring a 6th word to store a 0
  3215. * temporarily. Thus, mlen + 1.
  3216. */
  3217. LBNALLOC(p, BNWORD16, mlen+1);
  3218. if (p) {
  3219. LBNALLOC(t0, BNWORD16, mlen);
  3220. if (t0) {
  3221. LBNALLOC(t1, BNWORD16, mlen);
  3222. if (t1)
  3223. goto allocated;
  3224. LBNFREE(t0, mlen);
  3225. }
  3226. LBNFREE(p, mlen+1);
  3227. }
  3228. LBNFREE(b, mlen+1);
  3229. }
  3230. return -1;
  3231. allocated:
  3232. /* Set t0 to 1 */
  3233. t0len = 1;
  3234. BIGLITTLE(t0[-1],t0[0]) = 1;
  3235. /* b = mod */
  3236. lbnCopy_16(b, mod, mlen);
  3237. /* blen = mlen (implicitly) */
  3238. /* t1 = b / a; b = b % a */
  3239. cy = lbnDiv_16(t1, b, mlen, a, alen);
  3240. *(BIGLITTLE(t1-(mlen-alen)-1,t1+(mlen-alen))) = cy;
  3241. t1len = lbnNorm_16(t1, mlen-alen+1);
  3242. blen = lbnNorm_16(b, alen);
  3243. /* while (b > 1) */
  3244. while (blen > 1 || BIGLITTLE(b[-1],b[0]) != (BNWORD16)1) {
  3245. /* q = a / b; a = a % b; */
  3246. if (alen < blen || (alen == blen && lbnCmp_16(a, a, alen) < 0))
  3247. assert(0);
  3248. cy = lbnDiv_16(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
  3249. *(BIGLITTLE(a-alen-1,a+alen)) = cy;
  3250. plen = lbnNorm_16(BIGLITTLE(a-blen,a+blen), alen-blen+1);
  3251. assert(plen);
  3252. alen = lbnNorm_16(a, blen);
  3253. if (!alen)
  3254. goto failure; /* GCD not 1 */
  3255. /* t0 += q * t1; */
  3256. assert(plen+t1len <= mlen+1);
  3257. lbnMul_16(p, BIGLITTLE(a-blen,a+blen), plen, t1, t1len);
  3258. plen = lbnNorm_16(p, plen + t1len);
  3259. assert(plen <= mlen);
  3260. if (plen > t0len) {
  3261. lbnZero_16(BIGLITTLE(t0-t0len,t0+t0len), plen-t0len);
  3262. t0len = plen;
  3263. }
  3264. cy = lbnAddN_16(t0, p, plen);
  3265. if (cy) {
  3266. if (t0len > plen) {
  3267. cy = lbnAdd1_16(BIGLITTLE(t0-plen,t0+plen),
  3268. t0len-plen, cy);
  3269. }
  3270. if (cy) {
  3271. BIGLITTLE(t0[-t0len-1],t0[t0len]) = cy;
  3272. t0len++;
  3273. }
  3274. }
  3275. /* if (a <= 1) return a ? t0 : FAIL; */
  3276. if (alen <= 1 && BIGLITTLE(a[-1],a[0]) == (BNWORD16)1) {
  3277. if (alen == 0)
  3278. goto failure; /* FAIL */
  3279. assert(t0len <= mlen);
  3280. lbnCopy_16(a, t0, t0len);
  3281. lbnZero_16(BIGLITTLE(a-t0len, a+t0len), mlen-t0len);
  3282. goto success;
  3283. }
  3284. /* q = b / a; b = b % a; */
  3285. if (blen < alen || (blen == alen && lbnCmp_16(b, a, alen) < 0))
  3286. assert(0);
  3287. cy = lbnDiv_16(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
  3288. *(BIGLITTLE(b-blen-1,b+blen)) = cy;
  3289. plen = lbnNorm_16(BIGLITTLE(b-alen,b+alen), blen-alen+1);
  3290. assert(plen);
  3291. blen = lbnNorm_16(b, alen);
  3292. if (!blen)
  3293. goto failure; /* GCD not 1 */
  3294. /* t1 += q * t0; */
  3295. assert(plen+t0len <= mlen+1);
  3296. lbnMul_16(p, BIGLITTLE(b-alen,b+alen), plen, t0, t0len);
  3297. plen = lbnNorm_16(p, plen + t0len);
  3298. assert(plen <= mlen);
  3299. if (plen > t1len) {
  3300. lbnZero_16(BIGLITTLE(t1-t1len,t1+t1len), plen-t1len);
  3301. t1len = plen;
  3302. }
  3303. cy = lbnAddN_16(t1, p, plen);
  3304. if (cy) {
  3305. if (t1len > plen) {
  3306. cy = lbnAdd1_16(BIGLITTLE(t1-plen,t0+plen),
  3307. t1len-plen, cy);
  3308. }
  3309. if (cy) {
  3310. BIGLITTLE(t1[-t1len-1],t1[t1len]) = cy;
  3311. t1len++;
  3312. }
  3313. }
  3314. #if BNYIELD
  3315. if (bnYield && (y = bnYield() < 0))
  3316. goto yield;
  3317. #endif
  3318. }
  3319. if (!blen)
  3320. goto failure; /* gcd(a, mod) != 1 -- FAIL */
  3321. /* return mod-t1 */
  3322. lbnCopy_16(a, mod, mlen);
  3323. assert(t1len <= mlen);
  3324. cy = lbnSubN_16(a, t1, t1len);
  3325. if (cy) {
  3326. assert(mlen > t1len);
  3327. cy = lbnSub1_16(BIGLITTLE(a-t1len, a+t1len), mlen-t1len, cy);
  3328. assert(!cy);
  3329. }
  3330. success:
  3331. LBNFREE(t1, mlen);
  3332. LBNFREE(t0, mlen);
  3333. LBNFREE(p, mlen+1);
  3334. LBNFREE(b, mlen+1);
  3335. return 0;
  3336. failure: /* GCD is not 1 - no inverse exists! */
  3337. y = 1;
  3338. #if BNYIELD
  3339. yield:
  3340. #endif
  3341. LBNFREE(t1, mlen);
  3342. LBNFREE(t0, mlen);
  3343. LBNFREE(p, mlen+1);
  3344. LBNFREE(b, mlen+1);
  3345. return y;
  3346. }
  3347. /*
  3348. * Precompute powers of "a" mod "mod". Compute them every "bits"
  3349. * for "n" steps. This is sufficient to compute powers of g with
  3350. * exponents up to n*bits bits long, i.e. less than 2^(n*bits).
  3351. *
  3352. * This assumes that the caller has already initialized "array" to point
  3353. * to "n" buffers of size "mlen".
  3354. */
  3355. int
  3356. lbnBasePrecompBegin_16(BNWORD16 **array, unsigned n, unsigned bits,
  3357. BNWORD16 const *g, unsigned glen, BNWORD16 *mod, unsigned mlen)
  3358. {
  3359. BNWORD16 *a, *b; /* Temporary double-width accumulators */
  3360. BNWORD16 *a1; /* Pointer to high half of a*/
  3361. BNWORD16 inv; /* Montgomery inverse of LSW of mod */
  3362. BNWORD16 *t;
  3363. unsigned i;
  3364. glen = lbnNorm_16(g, glen);
  3365. assert(glen);
  3366. assert (mlen == lbnNorm_16(mod, mlen));
  3367. assert (glen <= mlen);
  3368. /* Allocate two temporary buffers, and the array slots */
  3369. LBNALLOC(a, BNWORD16, mlen*2);
  3370. if (!a)
  3371. return -1;
  3372. LBNALLOC(b, BNWORD16, mlen*2);
  3373. if (!b) {
  3374. LBNFREE(a, 2*mlen);
  3375. return -1;
  3376. }
  3377. /* Okay, all ready */
  3378. /* Convert n to Montgomery form */
  3379. inv = BIGLITTLE(mod[-1],mod[0]); /* LSW of modulus */
  3380. assert(inv & 1); /* Modulus must be odd */
  3381. inv = lbnMontInv1_16(inv);
  3382. /* Move g up "mlen" words into a (clearing the low mlen words) */
  3383. a1 = BIGLITTLE(a-mlen,a+mlen);
  3384. lbnCopy_16(a1, g, glen);
  3385. lbnZero_16(a, mlen);
  3386. /* Do the division - dump the quotient into the high-order words */
  3387. (void)lbnDiv_16(a1, a, mlen+glen, mod, mlen);
  3388. /* Copy the first value into the array */
  3389. t = *array;
  3390. lbnCopy_16(t, a, mlen);
  3391. a1 = a; /* This first value is *not* shifted up */
  3392. /* Now compute the remaining n-1 array entries */
  3393. assert(bits);
  3394. assert(n);
  3395. while (--n) {
  3396. i = bits;
  3397. do {
  3398. /* Square a1 into b1 */
  3399. lbnMontSquare_16(b, a1, mod, mlen, inv);
  3400. t = b; b = a; a = t;
  3401. a1 = BIGLITTLE(a-mlen, a+mlen);
  3402. } while (--i);
  3403. t = *++array;
  3404. lbnCopy_16(t, a1, mlen);
  3405. }
  3406. /* Hooray, we're done. */
  3407. LBNFREE(b, 2*mlen);
  3408. LBNFREE(a, 2*mlen);
  3409. return 0;
  3410. }
  3411. /*
  3412. * result = base^exp (mod mod). "array" is a an array of pointers
  3413. * to procomputed powers of base, each 2^bits apart. (I.e. array[i]
  3414. * is base^(2^(i*bits))).
  3415. *
  3416. * The algorithm consists of:
  3417. * a = b = (powers of g to be raised to the power 2^bits-1)
  3418. * a *= b *= (powers of g to be raised to the power 2^bits-2)
  3419. * ...
  3420. * a *= b *= (powers of g to be raised to the power 1)
  3421. *
  3422. * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
  3423. */
  3424. int
  3425. lbnBasePrecompExp_16(BNWORD16 *result, BNWORD16 const * const *array,
  3426. unsigned bits, BNWORD16 const *exp, unsigned elen,
  3427. BNWORD16 const *mod, unsigned mlen)
  3428. {
  3429. BNWORD16 *a, *b, *c, *t;
  3430. BNWORD16 *a1, *b1;
  3431. int anull, bnull; /* Null flags: values are implicitly 1 */
  3432. unsigned i, j; /* Loop counters */
  3433. unsigned mask; /* Exponent bits to examime */
  3434. BNWORD16 const *eptr; /* Pointer into exp */
  3435. BNWORD16 buf, curbits, nextword; /* Bit-buffer varaibles */
  3436. BNWORD16 inv; /* Inverse of LSW of modulus */
  3437. unsigned ewords; /* Words of exponent left */
  3438. int bufbits; /* Number of valid bits */
  3439. int y = 0;
  3440. mlen = lbnNorm_16(mod, mlen);
  3441. assert (mlen);
  3442. elen = lbnNorm_16(exp, elen);
  3443. if (!elen) {
  3444. lbnZero_16(result, mlen);
  3445. BIGLITTLE(result[-1],result[0]) = 1;
  3446. return 0;
  3447. }
  3448. /*
  3449. * This could be precomputed, but it's so cheap, and it would require
  3450. * making the precomputation structure word-size dependent.
  3451. */
  3452. inv = lbnMontInv1_16(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
  3453. assert(elen);
  3454. /*
  3455. * Allocate three temporary buffers. The current numbers generally
  3456. * live in the upper halves of these buffers.
  3457. */
  3458. LBNALLOC(a, BNWORD16, mlen*2);
  3459. if (a) {
  3460. LBNALLOC(b, BNWORD16, mlen*2);
  3461. if (b) {
  3462. LBNALLOC(c, BNWORD16, mlen*2);
  3463. if (c)
  3464. goto allocated;
  3465. LBNFREE(b, 2*mlen);
  3466. }
  3467. LBNFREE(a, 2*mlen);
  3468. }
  3469. return -1;
  3470. allocated:
  3471. anull = bnull = 1;
  3472. mask = (1u<<bits) - 1;
  3473. for (i = mask; i; --i) {
  3474. /* Set up bit buffer for walking the exponent */
  3475. eptr = exp;
  3476. buf = BIGLITTLE(*--eptr, *eptr++);
  3477. ewords = elen-1;
  3478. bufbits = 16;
  3479. for (j = 0; ewords || buf; j++) {
  3480. /* Shift down current buffer */
  3481. curbits = buf;
  3482. buf >>= bits;
  3483. /* If necessary, add next word */
  3484. bufbits -= bits;
  3485. if (bufbits < 0 && ewords > 0) {
  3486. nextword = BIGLITTLE(*--eptr, *eptr++);
  3487. ewords--;
  3488. curbits |= nextword << (bufbits+bits);
  3489. buf = nextword >> -bufbits;
  3490. bufbits += 16;
  3491. }
  3492. /* If appropriate, multiply b *= array[j] */
  3493. if ((curbits & mask) == i) {
  3494. BNWORD16 const *d = array[j];
  3495. b1 = BIGLITTLE(b-mlen-1,b+mlen);
  3496. if (bnull) {
  3497. lbnCopy_16(b1, d, mlen);
  3498. bnull = 0;
  3499. } else {
  3500. lbnMontMul_16(c, b1, d, mod, mlen, inv);
  3501. t = c; c = b; b = t;
  3502. }
  3503. #if BNYIELD
  3504. if (bnYield && (y = bnYield() < 0))
  3505. goto yield;
  3506. #endif
  3507. }
  3508. }
  3509. /* Multiply a *= b */
  3510. if (!bnull) {
  3511. a1 = BIGLITTLE(a-mlen-1,a+mlen);
  3512. b1 = BIGLITTLE(b-mlen-1,b+mlen);
  3513. if (anull) {
  3514. lbnCopy_16(a1, b1, mlen);
  3515. anull = 0;
  3516. } else {
  3517. lbnMontMul_16(c, a1, b1, mod, mlen, inv);
  3518. t = c; c = a; a = t;
  3519. }
  3520. }
  3521. }
  3522. assert(!anull); /* If it were, elen would have been 0 */
  3523. /* Convert out of Montgomery form and return */
  3524. a1 = BIGLITTLE(a-mlen-1,a+mlen);
  3525. lbnCopy_16(a, a1, mlen);
  3526. lbnZero_16(a1, mlen);
  3527. lbnMontReduce_16(a, mod, mlen, inv);
  3528. lbnCopy_16(result, a1, mlen);
  3529. #if BNYIELD
  3530. yield:
  3531. #endif
  3532. LBNFREE(c, 2*mlen);
  3533. LBNFREE(b, 2*mlen);
  3534. LBNFREE(a, 2*mlen);
  3535. return y;
  3536. }
  3537. /*
  3538. * result = base1^exp1 *base2^exp2 (mod mod). "array1" and "array2" are
  3539. * arrays of pointers to procomputed powers of the corresponding bases,
  3540. * each 2^bits apart. (I.e. array1[i] is base1^(2^(i*bits))).
  3541. *
  3542. * Bits must be the same in both. (It could be made adjustable, but it's
  3543. * a bit of a pain. Just make them both equal to the larger one.)
  3544. *
  3545. * The algorithm consists of:
  3546. * a = b = (powers of base1 and base2 to be raised to the power 2^bits-1)
  3547. * a *= b *= (powers of base1 and base2 to be raised to the power 2^bits-2)
  3548. * ...
  3549. * a *= b *= (powers of base1 and base2 to be raised to the power 1)
  3550. *
  3551. * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
  3552. */
  3553. int
  3554. lbnDoubleBasePrecompExp_16(BNWORD16 *result, unsigned bits,
  3555. BNWORD16 const * const *array1, BNWORD16 const *exp1, unsigned elen1,
  3556. BNWORD16 const * const *array2, BNWORD16 const *exp2,
  3557. unsigned elen2, BNWORD16 const *mod, unsigned mlen)
  3558. {
  3559. BNWORD16 *a, *b, *c, *t;
  3560. BNWORD16 *a1, *b1;
  3561. int anull, bnull; /* Null flags: values are implicitly 1 */
  3562. unsigned i, j, k; /* Loop counters */
  3563. unsigned mask; /* Exponent bits to examime */
  3564. BNWORD16 const *eptr; /* Pointer into exp */
  3565. BNWORD16 buf, curbits, nextword; /* Bit-buffer varaibles */
  3566. BNWORD16 inv; /* Inverse of LSW of modulus */
  3567. unsigned ewords; /* Words of exponent left */
  3568. int bufbits; /* Number of valid bits */
  3569. int y = 0;
  3570. BNWORD16 const * const *array;
  3571. mlen = lbnNorm_16(mod, mlen);
  3572. assert (mlen);
  3573. elen1 = lbnNorm_16(exp1, elen1);
  3574. if (!elen1) {
  3575. return lbnBasePrecompExp_16(result, array2, bits, exp2, elen2,
  3576. mod, mlen);
  3577. }
  3578. elen2 = lbnNorm_16(exp2, elen2);
  3579. if (!elen2) {
  3580. return lbnBasePrecompExp_16(result, array1, bits, exp1, elen1,
  3581. mod, mlen);
  3582. }
  3583. /*
  3584. * This could be precomputed, but it's so cheap, and it would require
  3585. * making the precomputation structure word-size dependent.
  3586. */
  3587. inv = lbnMontInv1_16(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
  3588. assert(elen1);
  3589. assert(elen2);
  3590. /*
  3591. * Allocate three temporary buffers. The current numbers generally
  3592. * live in the upper halves of these buffers.
  3593. */
  3594. LBNALLOC(a, BNWORD16, mlen*2);
  3595. if (a) {
  3596. LBNALLOC(b, BNWORD16, mlen*2);
  3597. if (b) {
  3598. LBNALLOC(c, BNWORD16, mlen*2);
  3599. if (c)
  3600. goto allocated;
  3601. LBNFREE(b, 2*mlen);
  3602. }
  3603. LBNFREE(a, 2*mlen);
  3604. }
  3605. return -1;
  3606. allocated:
  3607. anull = bnull = 1;
  3608. mask = (1u<<bits) - 1;
  3609. for (i = mask; i; --i) {
  3610. /* Walk each exponent in turn */
  3611. for (k = 0; k < 2; k++) {
  3612. /* Set up the exponent for walking */
  3613. array = k ? array2 : array1;
  3614. eptr = k ? exp2 : exp1;
  3615. ewords = (k ? elen2 : elen1) - 1;
  3616. /* Set up bit buffer for walking the exponent */
  3617. buf = BIGLITTLE(*--eptr, *eptr++);
  3618. bufbits = 16;
  3619. for (j = 0; ewords || buf; j++) {
  3620. /* Shift down current buffer */
  3621. curbits = buf;
  3622. buf >>= bits;
  3623. /* If necessary, add next word */
  3624. bufbits -= bits;
  3625. if (bufbits < 0 && ewords > 0) {
  3626. nextword = BIGLITTLE(*--eptr, *eptr++);
  3627. ewords--;
  3628. curbits |= nextword << (bufbits+bits);
  3629. buf = nextword >> -bufbits;
  3630. bufbits += 16;
  3631. }
  3632. /* If appropriate, multiply b *= array[j] */
  3633. if ((curbits & mask) == i) {
  3634. BNWORD16 const *d = array[j];
  3635. b1 = BIGLITTLE(b-mlen-1,b+mlen);
  3636. if (bnull) {
  3637. lbnCopy_16(b1, d, mlen);
  3638. bnull = 0;
  3639. } else {
  3640. lbnMontMul_16(c, b1, d, mod, mlen, inv);
  3641. t = c; c = b; b = t;
  3642. }
  3643. #if BNYIELD
  3644. if (bnYield && (y = bnYield() < 0))
  3645. goto yield;
  3646. #endif
  3647. }
  3648. }
  3649. }
  3650. /* Multiply a *= b */
  3651. if (!bnull) {
  3652. a1 = BIGLITTLE(a-mlen-1,a+mlen);
  3653. b1 = BIGLITTLE(b-mlen-1,b+mlen);
  3654. if (anull) {
  3655. lbnCopy_16(a1, b1, mlen);
  3656. anull = 0;
  3657. } else {
  3658. lbnMontMul_16(c, a1, b1, mod, mlen, inv);
  3659. t = c; c = a; a = t;
  3660. }
  3661. }
  3662. }
  3663. assert(!anull); /* If it were, elen would have been 0 */
  3664. /* Convert out of Montgomery form and return */
  3665. a1 = BIGLITTLE(a-mlen-1,a+mlen);
  3666. lbnCopy_16(a, a1, mlen);
  3667. lbnZero_16(a1, mlen);
  3668. lbnMontReduce_16(a, mod, mlen, inv);
  3669. lbnCopy_16(result, a1, mlen);
  3670. #if BNYIELD
  3671. yield:
  3672. #endif
  3673. LBNFREE(c, 2*mlen);
  3674. LBNFREE(b, 2*mlen);
  3675. LBNFREE(a, 2*mlen);
  3676. return y;
  3677. }
  3678. #endif