#! /usr/bin/env perl
# Copyright 1995-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
  # Driver: load the shared x86 perlasm helpers, redirect STDOUT to the
  # requested output file and emit every bignum primitive in turn.

  # Make the script's own directory (and ../../perlasm relative to it)
  # searchable so that "x86asm.pl" can be found from any working directory.
  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  push(@INC,"${dir}","${dir}../../perlasm");
  require "x86asm.pl";

  # The last command-line argument names the output file.  Everything the
  # generator produces is printed to STDOUT, so fail loudly if the file
  # cannot be opened instead of silently writing to the inherited STDOUT.
  $output = pop;
  open STDOUT,">",$output or die "can't open $output: $!";

  # The first remaining argument is handed to asm_init
  # (presumably the assembler flavour -- confirm against x86asm.pl).
  &asm_init($ARGV[0]);

  # SSE2 code paths are generated only when -DOPENSSL_IA32_SSE2 is present
  # among the remaining arguments.
  $sse2=0;
  for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

  &external_label("OPENSSL_ia32cap_P") if ($sse2);

  &bn_mul_add_words("bn_mul_add_words");
  &bn_mul_words("bn_mul_words");
  &bn_sqr_words("bn_sqr_words");
  &bn_div_words("bn_div_words");
  &bn_add_words("bn_add_words");
  &bn_sub_words("bn_sub_words");
  &bn_sub_part_words("bn_sub_part_words");

  &asm_finish();

  # Buffered write errors only surface when the handle is closed.
  close STDOUT or die "error closing STDOUT: $!";
  # Emit the routine $name, which multiplies each word of a[] by w, adds
  # the product and the running carry into r[], and returns the final
  # carry in eax.
  #
  # Arguments of the generated routine, as read through wparam():
  #   wparam(0) = r, wparam(1) = a, wparam(2) = num (word count),
  #   wparam(3) = w
  #
  # When $sse2 is set an MMX/SSE2 path is emitted first; it is taken at run
  # time when bit 26 of the first dword of OPENSSL_ia32cap_P is set, and
  # otherwise control jumps to the integer path at maw_non_sse2.
  sub bn_mul_add_words
  {
      local($name)=@_;

      &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

      $r="eax";
      $a="edx";
      $c="ecx";

      if ($sse2) {
          &picmeup("eax","OPENSSL_ia32cap_P");
          &bt(&DWP(0,"eax"),26);
          &jnc(&label("maw_non_sse2"));

          &mov($r,&wparam(0));
          &mov($a,&wparam(1));
          &mov($c,&wparam(2));
          &movd("mm0",&wparam(3));            # mm0 = w
          &pxor("mm1","mm1");                 # mm1 = carry_in
          &jmp(&label("maw_sse2_entry"));

          # Eight words per iteration; the carry lives in mm1 throughout.
          &set_label("maw_sse2_unrolled",16);
          &movd("mm3",&DWP(0,$r,"",0));       # mm3 = r[0]
          &paddq("mm1","mm3");                # mm1 = carry_in + r[0]
          &movd("mm2",&DWP(0,$a,"",0));       # mm2 = a[0]
          &pmuludq("mm2","mm0");              # mm2 = w*a[0]
          &movd("mm4",&DWP(4,$a,"",0));       # mm4 = a[1]
          &pmuludq("mm4","mm0");              # mm4 = w*a[1]
          &movd("mm6",&DWP(8,$a,"",0));       # mm6 = a[2]
          &pmuludq("mm6","mm0");              # mm6 = w*a[2]
          &movd("mm7",&DWP(12,$a,"",0));      # mm7 = a[3]
          &pmuludq("mm7","mm0");              # mm7 = w*a[3]
          &paddq("mm1","mm2");                # mm1 = carry_in + r[0] + w*a[0]
          &movd("mm3",&DWP(4,$r,"",0));       # mm3 = r[1]
          &paddq("mm3","mm4");                # mm3 = r[1] + w*a[1]
          &movd("mm5",&DWP(8,$r,"",0));       # mm5 = r[2]
          &paddq("mm5","mm6");                # mm5 = r[2] + w*a[2]
          &movd("mm4",&DWP(12,$r,"",0));      # mm4 = r[3]
          &paddq("mm7","mm4");                # mm7 = r[3] + w*a[3]
          &movd(&DWP(0,$r,"",0),"mm1");
          &movd("mm2",&DWP(16,$a,"",0));      # mm2 = a[4]
          &pmuludq("mm2","mm0");              # mm2 = w*a[4]
          &psrlq("mm1",32);                   # mm1 = carry0
          &movd("mm4",&DWP(20,$a,"",0));      # mm4 = a[5]
          &pmuludq("mm4","mm0");              # mm4 = w*a[5]
          &paddq("mm1","mm3");                # mm1 = carry0 + r[1] + w*a[1]
          &movd("mm6",&DWP(24,$a,"",0));      # mm6 = a[6]
          &pmuludq("mm6","mm0");              # mm6 = w*a[6]
          &movd(&DWP(4,$r,"",0),"mm1");
          &psrlq("mm1",32);                   # mm1 = carry1
          &movd("mm3",&DWP(28,$a,"",0));      # mm3 = a[7]
          &add($a,32);
          &pmuludq("mm3","mm0");              # mm3 = w*a[7]
          &paddq("mm1","mm5");                # mm1 = carry1 + r[2] + w*a[2]
          &movd("mm5",&DWP(16,$r,"",0));      # mm5 = r[4]
          &paddq("mm2","mm5");                # mm2 = r[4] + w*a[4]
          &movd(&DWP(8,$r,"",0),"mm1");
          &psrlq("mm1",32);                   # mm1 = carry2
          &paddq("mm1","mm7");                # mm1 = carry2 + r[3] + w*a[3]
          &movd("mm5",&DWP(20,$r,"",0));      # mm5 = r[5]
          &paddq("mm4","mm5");                # mm4 = r[5] + w*a[5]
          &movd(&DWP(12,$r,"",0),"mm1");
          &psrlq("mm1",32);                   # mm1 = carry3
          &paddq("mm1","mm2");                # mm1 = carry3 + r[4] + w*a[4]
          &movd("mm5",&DWP(24,$r,"",0));      # mm5 = r[6]
          &paddq("mm6","mm5");                # mm6 = r[6] + w*a[6]
          &movd(&DWP(16,$r,"",0),"mm1");
          &psrlq("mm1",32);                   # mm1 = carry4
          &paddq("mm1","mm4");                # mm1 = carry4 + r[5] + w*a[5]
          &movd("mm5",&DWP(28,$r,"",0));      # mm5 = r[7]
          &paddq("mm3","mm5");                # mm3 = r[7] + w*a[7]
          &movd(&DWP(20,$r,"",0),"mm1");
          &psrlq("mm1",32);                   # mm1 = carry5
          &paddq("mm1","mm6");                # mm1 = carry5 + r[6] + w*a[6]
          &movd(&DWP(24,$r,"",0),"mm1");
          &psrlq("mm1",32);                   # mm1 = carry6
          &paddq("mm1","mm3");                # mm1 = carry6 + r[7] + w*a[7]
          &movd(&DWP(28,$r,"",0),"mm1");
          &lea($r,&DWP(32,$r));
          &psrlq("mm1",32);                   # mm1 = carry_out

          &sub($c,8);
          &jz(&label("maw_sse2_exit"));

          # Entry point: use the unrolled body while at least 8 words remain,
          # otherwise fall through to the one-word-at-a-time loop.
          # NOTE(review): the single-word loop tests the count only after
          # its body has run, so num == 0 is not handled here -- confirm
          # callers never pass 0.
          &set_label("maw_sse2_entry");
          &test($c,0xfffffff8);
          &jnz(&label("maw_sse2_unrolled"));

          &set_label("maw_sse2_loop",4);
          &movd("mm2",&DWP(0,$a));            # mm2 = a[i]
          &movd("mm3",&DWP(0,$r));            # mm3 = r[i]
          &pmuludq("mm2","mm0");              # a[i] *= w
          &lea($a,&DWP(4,$a));
          &paddq("mm1","mm3");                # carry += r[i]
          &paddq("mm1","mm2");                # carry += a[i]*w
          &movd(&DWP(0,$r),"mm1");            # r[i] = carry_low
          &sub($c,1);
          &psrlq("mm1",32);                   # carry = carry_high
          &lea($r,&DWP(4,$r));
          &jnz(&label("maw_sse2_loop"));

          &set_label("maw_sse2_exit");
          &movd("eax","mm1");                 # c = carry_out
          &emms();
          &ret();

          &set_label("maw_non_sse2",16);
      }

      # function_begin prologue
      &push("ebp");
      &push("ebx");
      &push("esi");
      &push("edi");

      &comment("");
      $Low="eax";
      $High="edx";
      $a="ebx";
      $w="ebp";
      $r="edi";
      $c="esi";

      &xor($c,$c);                            # clear carry
      &mov($r,&wparam(0));                    #

      &mov("ecx",&wparam(2));                 #
      &mov($a,&wparam(1));                    #

      &and("ecx",0xfffffff8);                 # num / 8
      &mov($w,&wparam(3));                    #

      &push("ecx");                           # Up the stack for a tmp variable

      # The zero flag tested here still comes from the "and" above: the
      # intervening mov and push do not modify flags.
      &jz(&label("maw_finish"));

      &set_label("maw_loop",16);

      for ($i=0; $i<32; $i+=4)
      {
          &comment("Round $i");

          &mov("eax",&DWP($i,$a));            # *a
          &mul($w);                           # *a * w
          &add("eax",$c);                     # L(t)+= c
          &adc("edx",0);                      # H(t)+=carry
          &add("eax",&DWP($i,$r));            # L(t)+= *r
          &adc("edx",0);                      # H(t)+=carry
          &mov(&DWP($i,$r),"eax");            # *r= L(t);
          &mov($c,"edx");                     # c= H(t);
      }

      &comment("");
      &sub("ecx",8);
      &lea($a,&DWP(32,$a));
      &lea($r,&DWP(32,$r));
      &jnz(&label("maw_loop"));               # lea leaves the flags from sub intact

      &set_label("maw_finish",0);
      &mov("ecx",&wparam(2));                 # get num
      &and("ecx",7);
      &jnz(&label("maw_finish2"));            # helps branch prediction
      &jmp(&label("maw_end"));

      # Up to seven leftover words; the last round needs no counter test.
      &set_label("maw_finish2",1);
      for ($i=0; $i<7; $i++)
      {
          &comment("Tail Round $i");
          &mov("eax",&DWP($i*4,$a));          # *a
          &mul($w);                           # *a * w
          &add("eax",$c);                     # L(t)+=c
          &adc("edx",0);                      # H(t)+=carry
          &add("eax",&DWP($i*4,$r));          # L(t)+= *r
          &adc("edx",0);                      # H(t)+=carry
          &dec("ecx") if ($i != 7-1);
          &mov(&DWP($i*4,$r),"eax");          # *r= L(t);
          &mov($c,"edx");                     # c= H(t);
          &jz(&label("maw_end")) if ($i != 7-1);
      }
      &set_label("maw_end",0);
      &mov("eax",$c);

      &pop("ecx");                            # clear variable from

      &function_end($name);
  }
  # Emit the routine $name, which stores the low word of a[i]*w plus the
  # running carry into r[i] for each word and returns the final carry in
  # eax.
  #
  # Arguments of the generated routine, as read through wparam():
  #   wparam(0) = r, wparam(1) = a, wparam(2) = num (word count),
  #   wparam(3) = w
  #
  # When $sse2 is set an MMX/SSE2 loop is emitted first; it is taken at run
  # time when bit 26 of the first dword of OPENSSL_ia32cap_P is set.
  sub bn_mul_words
  {
      local($name)=@_;

      &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

      $r="eax";
      $a="edx";
      $c="ecx";

      if ($sse2) {
          &picmeup("eax","OPENSSL_ia32cap_P");
          &bt(&DWP(0,"eax"),26);
          &jnc(&label("mw_non_sse2"));

          &mov($r,&wparam(0));
          &mov($a,&wparam(1));
          &mov($c,&wparam(2));
          &movd("mm0",&wparam(3));            # mm0 = w
          &pxor("mm1","mm1");                 # mm1 = carry = 0

          # NOTE(review): the count is tested only after the body has run,
          # so num == 0 is not handled here -- confirm callers never pass 0.
          &set_label("mw_sse2_loop",16);
          &movd("mm2",&DWP(0,$a));            # mm2 = a[i]
          &pmuludq("mm2","mm0");              # a[i] *= w
          &lea($a,&DWP(4,$a));
          &paddq("mm1","mm2");                # carry += a[i]*w
          &movd(&DWP(0,$r),"mm1");            # r[i] = carry_low
          &sub($c,1);
          &psrlq("mm1",32);                   # carry = carry_high
          &lea($r,&DWP(4,$r));
          &jnz(&label("mw_sse2_loop"));

          &movd("eax","mm1");                 # return carry
          &emms();
          &ret();

          &set_label("mw_non_sse2",16);
      }

      # function_begin prologue
      &push("ebp");
      &push("ebx");
      &push("esi");
      &push("edi");

      &comment("");
      $Low="eax";
      $High="edx";
      $a="ebx";
      $w="ecx";
      $r="edi";
      $c="esi";
      $num="ebp";

      &xor($c,$c);                            # clear carry
      &mov($r,&wparam(0));                    #
      &mov($a,&wparam(1));                    #
      &mov($num,&wparam(2));                  #
      &mov($w,&wparam(3));                    #

      &and($num,0xfffffff8);                  # num / 8
      &jz(&label("mw_finish"));

      # Eight words per iteration.
      &set_label("mw_loop",0);
      for ($i=0; $i<32; $i+=4)
      {
          &comment("Round $i");

          &mov("eax",&DWP($i,$a,"",0));       # *a
          &mul($w);                           # *a * w
          &add("eax",$c);                     # L(t)+=c
          # XXX
          &adc("edx",0);                      # H(t)+=carry
          &mov(&DWP($i,$r,"",0),"eax");       # *r= L(t);
          &mov($c,"edx");                     # c= H(t);
      }

      &comment("");
      &add($a,32);
      &add($r,32);
      &sub($num,8);
      &jz(&label("mw_finish"));
      &jmp(&label("mw_loop"));

      &set_label("mw_finish",0);
      &mov($num,&wparam(2));                  # get num
      &and($num,7);
      &jnz(&label("mw_finish2"));
      &jmp(&label("mw_end"));

      # Up to seven leftover words; the last round needs no counter test.
      &set_label("mw_finish2",1);
      for ($i=0; $i<7; $i++)
      {
          &comment("Tail Round $i");
          &mov("eax",&DWP($i*4,$a,"",0));     # *a
          &mul($w);                           # *a * w
          &add("eax",$c);                     # L(t)+=c
          # XXX
          &adc("edx",0);                      # H(t)+=carry
          &mov(&DWP($i*4,$r,"",0),"eax");     # *r= L(t);
          &mov($c,"edx");                     # c= H(t);
          &dec($num) if ($i != 7-1);
          &jz(&label("mw_end")) if ($i != 7-1);
      }
      &set_label("mw_end",0);
      &mov("eax",$c);

      &function_end($name);
  }
  # Emit the routine $name, which squares each word of a[] and stores the
  # two-word result in r[2*i] (low) and r[2*i+1] (high).
  #
  # Arguments of the generated routine, as read through wparam():
  #   wparam(0) = r, wparam(1) = a, wparam(2) = num (word count)
  #
  # When $sse2 is set an MMX/SSE2 loop is emitted first; it is taken at run
  # time when bit 26 of the first dword of OPENSSL_ia32cap_P is set.
  sub bn_sqr_words
  {
      local($name)=@_;

      &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

      $r="eax";
      $a="edx";
      $c="ecx";

      if ($sse2) {
          &picmeup("eax","OPENSSL_ia32cap_P");
          &bt(&DWP(0,"eax"),26);
          &jnc(&label("sqr_non_sse2"));

          &mov($r,&wparam(0));
          &mov($a,&wparam(1));
          &mov($c,&wparam(2));

          # NOTE(review): the count is tested only after the body has run,
          # so num == 0 is not handled here -- confirm callers never pass 0.
          &set_label("sqr_sse2_loop",16);
          &movd("mm0",&DWP(0,$a));            # mm0 = a[i]
          &pmuludq("mm0","mm0");              # a[i] *= a[i]
          &lea($a,&DWP(4,$a));                # a++
          &movq(&QWP(0,$r),"mm0");            # r[i] = a[i]*a[i]
          &sub($c,1);
          &lea($r,&DWP(8,$r));                # r += 2
          &jnz(&label("sqr_sse2_loop"));

          &emms();
          &ret();

          &set_label("sqr_non_sse2",16);
      }

      # function_begin prologue
      &push("ebp");
      &push("ebx");
      &push("esi");
      &push("edi");

      &comment("");
      $r="esi";
      $a="edi";
      $num="ebx";

      &mov($r,&wparam(0));                    #
      &mov($a,&wparam(1));                    #
      &mov($num,&wparam(2));                  #

      &and($num,0xfffffff8);                  # num / 8
      &jz(&label("sw_finish"));

      # Eight input words (sixteen output words) per iteration.
      &set_label("sw_loop",0);
      for ($i=0; $i<32; $i+=4)
      {
          &comment("Round $i");
          &mov("eax",&DWP($i,$a,"",0));       # *a
          # XXX
          &mul("eax");                        # *a * *a
          &mov(&DWP($i*2,$r,"",0),"eax");     #
          &mov(&DWP($i*2+4,$r,"",0),"edx");   #
      }

      &comment("");
      &add($a,32);
      &add($r,64);
      &sub($num,8);
      &jnz(&label("sw_loop"));

      &set_label("sw_finish",0);
      &mov($num,&wparam(2));                  # get num
      &and($num,7);
      &jz(&label("sw_end"));

      # Up to seven leftover words; the last round needs no counter test.
      for ($i=0; $i<7; $i++)
      {
          &comment("Tail Round $i");
          &mov("eax",&DWP($i*4,$a,"",0));     # *a
          # XXX
          &mul("eax");                        # *a * *a
          &mov(&DWP($i*8,$r,"",0),"eax");     #
          &dec($num) if ($i != 7-1);
          &mov(&DWP($i*8+4,$r,"",0),"edx");
          &jz(&label("sw_end")) if ($i != 7-1);
      }
      &set_label("sw_end",0);

      &function_end($name);
  }
  # Emit the routine $name, which loads wparam(0) into edx, wparam(1) into
  # eax and wparam(2) into ecx, executes "div ecx" and returns.
  #
  # Per the x86 DIV instruction this divides edx:eax by ecx and leaves the
  # quotient in eax, which is what the routine returns.  No registers are
  # saved, hence the bare function_begin_B/function_end_B pair with an
  # explicit ret.
  sub bn_div_words
  {
      local($name)=@_;

      &function_begin_B($name,"");
      &mov("edx",&wparam(0));                 #
      &mov("eax",&wparam(1));                 #
      &mov("ecx",&wparam(2));                 #
      &div("ecx");
      &ret();
      &function_end_B($name);
  }
  # Emit the routine $name, which computes r[i] = a[i] + b[i] + carry for
  # each word and returns the final carry in eax.
  #
  # Arguments of the generated routine, as read through wparam():
  #   wparam(0) = r, wparam(1) = a, wparam(2) = b, wparam(3) = num
  #
  # Each round adds the incoming carry and b[i] separately and gathers the
  # two possible carry-outs in $c.  "mov $c,0" is used to reset $c because
  # mov leaves the carry flag produced by the preceding add untouched.
  sub bn_add_words
  {
      local($name)=@_;

      &function_begin($name,"");

      &comment("");
      $a="esi";
      $b="edi";
      $c="eax";
      $r="ebx";
      $tmp1="ecx";
      $tmp2="edx";
      $num="ebp";

      &mov($r,&wparam(0));                    # get r
      &mov($a,&wparam(1));                    # get a
      &mov($b,&wparam(2));                    # get b
      &mov($num,&wparam(3));                  # get num
      &xor($c,$c);                            # clear carry
      &and($num,0xfffffff8);                  # num / 8

      &jz(&label("aw_finish"));

      # Eight words per iteration.
      &set_label("aw_loop",0);
      for ($i=0; $i<8; $i++)
      {
          &comment("Round $i");

          &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
          &mov($tmp2,&DWP($i*4,$b,"",0));     # *b
          &add($tmp1,$c);
          &mov($c,0);
          &adc($c,$c);
          &add($tmp1,$tmp2);
          &adc($c,0);
          &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
      }

      &comment("");
      &add($a,32);
      &add($b,32);
      &add($r,32);
      &sub($num,8);
      &jnz(&label("aw_loop"));

      &set_label("aw_finish",0);
      &mov($num,&wparam(3));                  # get num
      &and($num,7);
      &jz(&label("aw_end"));

      # Up to seven leftover words; the last round needs no counter test.
      for ($i=0; $i<7; $i++)
      {
          &comment("Tail Round $i");
          &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
          &mov($tmp2,&DWP($i*4,$b,"",0));     # *b
          &add($tmp1,$c);
          &mov($c,0);
          &adc($c,$c);
          &add($tmp1,$tmp2);
          &adc($c,0);
          &dec($num) if ($i != 6);
          &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
          &jz(&label("aw_end")) if ($i != 6);
      }
      &set_label("aw_end",0);

      # &mov("eax",$c);                       # $c is "eax"

      &function_end($name);
  }
  # Emit the routine $name, which computes r[i] = a[i] - b[i] - borrow for
  # each word and returns the final borrow in eax.
  #
  # Arguments of the generated routine, as read through wparam():
  #   wparam(0) = r, wparam(1) = a, wparam(2) = b, wparam(3) = num
  #
  # Each round subtracts the incoming borrow and b[i] separately and gathers
  # the two possible borrow-outs in $c.  "mov $c,0" is used to reset $c
  # because mov leaves the carry flag produced by the preceding sub
  # untouched.
  #
  # The label names keep the "aw_" prefix also used by bn_add_words;
  # presumably x86asm.pl scopes labels per function -- confirm there.
  sub bn_sub_words
  {
      local($name)=@_;

      &function_begin($name,"");

      &comment("");
      $a="esi";
      $b="edi";
      $c="eax";
      $r="ebx";
      $tmp1="ecx";
      $tmp2="edx";
      $num="ebp";

      &mov($r,&wparam(0));                    # get r
      &mov($a,&wparam(1));                    # get a
      &mov($b,&wparam(2));                    # get b
      &mov($num,&wparam(3));                  # get num
      &xor($c,$c);                            # clear carry
      &and($num,0xfffffff8);                  # num / 8

      &jz(&label("aw_finish"));

      # Eight words per iteration.
      &set_label("aw_loop",0);
      for ($i=0; $i<8; $i++)
      {
          &comment("Round $i");

          &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
          &mov($tmp2,&DWP($i*4,$b,"",0));     # *b
          &sub($tmp1,$c);
          &mov($c,0);
          &adc($c,$c);
          &sub($tmp1,$tmp2);
          &adc($c,0);
          &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
      }

      &comment("");
      &add($a,32);
      &add($b,32);
      &add($r,32);
      &sub($num,8);
      &jnz(&label("aw_loop"));

      &set_label("aw_finish",0);
      &mov($num,&wparam(3));                  # get num
      &and($num,7);
      &jz(&label("aw_end"));

      # Up to seven leftover words; the last round needs no counter test.
      for ($i=0; $i<7; $i++)
      {
          &comment("Tail Round $i");
          &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
          &mov($tmp2,&DWP($i*4,$b,"",0));     # *b
          &sub($tmp1,$c);
          &mov($c,0);
          &adc($c,$c);
          &sub($tmp1,$tmp2);
          &adc($c,0);
          &dec($num) if ($i != 6);
          &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
          &jz(&label("aw_end")) if ($i != 6);
      }
      &set_label("aw_end",0);

      # &mov("eax",$c);                       # $c is "eax"

      &function_end($name);
  }
  # Emit the routine $name, a subtraction for operands of different length.
  #
  # Arguments of the generated routine, as read through wparam():
  #   wparam(0) = r, wparam(1) = a, wparam(2) = b, wparam(3) = num,
  #   wparam(4) = dl
  #
  # The generated code has three phases:
  #   1. r[i] = a[i] - b[i] - borrow for the first num words (same round
  #      structure as bn_sub_words).  Unlike bn_sub_words, the tail rounds
  #      advance a, b and r by one word each, so the pointers are left just
  #      past the last processed word for the next phase.
  #   2. dl < 0: r[i] = 0 - b[i] - borrow for a further -dl words.
  #   3. dl > 0: r[i] = a[i] - borrow for a further dl words.  As soon as a
  #      round produces no borrow, control jumps (jnc) into the matching
  #      position of a plain copy loop (pw_nc*), which copies the remaining
  #      words of a to r and finishes with a zero borrow.
  # The borrow is returned in eax ($c).
  #
  # The first phase keeps the "aw_" label prefix also used by bn_add_words;
  # presumably x86asm.pl scopes labels per function -- confirm there.
  sub bn_sub_part_words
  {
      local($name)=@_;

      &function_begin($name,"");

      &comment("");
      $a="esi";
      $b="edi";
      $c="eax";
      $r="ebx";
      $tmp1="ecx";
      $tmp2="edx";
      $num="ebp";

      &mov($r,&wparam(0));                    # get r
      &mov($a,&wparam(1));                    # get a
      &mov($b,&wparam(2));                    # get b
      &mov($num,&wparam(3));                  # get num
      &xor($c,$c);                            # clear carry
      &and($num,0xfffffff8);                  # num / 8

      &jz(&label("aw_finish"));

      # Phase 1, eight words per iteration.
      &set_label("aw_loop",0);
      for ($i=0; $i<8; $i++)
      {
          &comment("Round $i");

          &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
          &mov($tmp2,&DWP($i*4,$b,"",0));     # *b
          &sub($tmp1,$c);
          &mov($c,0);
          &adc($c,$c);
          &sub($tmp1,$tmp2);
          &adc($c,0);
          &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
      }

      &comment("");
      &add($a,32);
      &add($b,32);
      &add($r,32);
      &sub($num,8);
      &jnz(&label("aw_loop"));

      &set_label("aw_finish",0);
      &mov($num,&wparam(3));                  # get num
      &and($num,7);
      &jz(&label("aw_end"));

      # Phase 1 tail: pointers are bumped every round (offset stays 0) so
      # that they are correct for the dl-dependent phases below.
      for ($i=0; $i<7; $i++)
      {
          &comment("Tail Round $i");
          &mov($tmp1,&DWP(0,$a,"",0));        # *a
          &mov($tmp2,&DWP(0,$b,"",0));        # *b
          &sub($tmp1,$c);
          &mov($c,0);
          &adc($c,$c);
          &sub($tmp1,$tmp2);
          &adc($c,0);
          &mov(&DWP(0,$r,"",0),$tmp1);        # *r
          &add($a, 4);
          &add($b, 4);
          &add($r, 4);
          &dec($num) if ($i != 6);
          &jz(&label("aw_end")) if ($i != 6);
      }
      &set_label("aw_end",0);

      # Dispatch on dl.  The zero test is performed twice (once on the
      # stack slot, once on the loaded register); both exits go to pw_end.
      &cmp(&wparam(4),0);
      &je(&label("pw_end"));

      &mov($num,&wparam(4));                  # get dl
      &cmp($num,0);
      &je(&label("pw_end"));
      &jge(&label("pw_pos"));

      &comment("pw_neg");
      &mov($tmp2,0);
      &sub($tmp2,$num);                       # $tmp2 = -dl
      &mov($num,$tmp2);
      &and($num,0xfffffff8);                  # num / 8
      &jz(&label("pw_neg_finish"));

      # Phase 2 (dl < 0), eight words per iteration.
      &set_label("pw_neg_loop",0);
      for ($i=0; $i<8; $i++)
      {
          &comment("dl<0 Round $i");

          &mov($tmp1,0);
          &mov($tmp2,&DWP($i*4,$b,"",0));     # *b
          &sub($tmp1,$c);
          &mov($c,0);
          &adc($c,$c);
          &sub($tmp1,$tmp2);
          &adc($c,0);
          &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
      }

      &comment("");
      &add($b,32);
      &add($r,32);
      &sub($num,8);
      &jnz(&label("pw_neg_loop"));

      &set_label("pw_neg_finish",0);
      &mov($tmp2,&wparam(4));                 # get dl
      &mov($num,0);
      &sub($num,$tmp2);                       # $num = -dl
      &and($num,7);
      &jz(&label("pw_end"));

      for ($i=0; $i<7; $i++)
      {
          &comment("dl<0 Tail Round $i");
          &mov($tmp1,0);
          &mov($tmp2,&DWP($i*4,$b,"",0));     # *b
          &sub($tmp1,$c);
          &mov($c,0);
          &adc($c,$c);
          &sub($tmp1,$tmp2);
          &adc($c,0);
          &dec($num) if ($i != 6);
          &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
          &jz(&label("pw_end")) if ($i != 6);
      }

      &jmp(&label("pw_end"));

      &set_label("pw_pos",0);

      &and($num,0xfffffff8);                  # num / 8
      &jz(&label("pw_pos_finish"));

      # Phase 3 (dl > 0), eight words per iteration.  A round that does not
      # borrow leaves via jnc to the same position in the copy loop below.
      &set_label("pw_pos_loop",0);

      for ($i=0; $i<8; $i++)
      {
          &comment("dl>0 Round $i");

          &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
          &sub($tmp1,$c);
          &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
          &jnc(&label("pw_nc".$i));
      }

      &comment("");
      &add($a,32);
      &add($r,32);
      &sub($num,8);
      &jnz(&label("pw_pos_loop"));

      &set_label("pw_pos_finish",0);
      &mov($num,&wparam(4));                  # get dl
      &and($num,7);
      &jz(&label("pw_end"));

      for ($i=0; $i<7; $i++)
      {
          &comment("dl>0 Tail Round $i");
          &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
          &sub($tmp1,$c);
          &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
          &jnc(&label("pw_tail_nc".$i));
          &dec($num) if ($i != 6);
          &jz(&label("pw_end")) if ($i != 6);
      }
      # Reached only if the seventh tail round also borrowed.
      &mov($c,1);
      &jmp(&label("pw_end"));

      # Plain copy loop used once the borrow has been absorbed.  The
      # pw_nc<i> labels sit after the copy of word i, because the jumping
      # round has already stored that word.
      &set_label("pw_nc_loop",0);
      for ($i=0; $i<8; $i++)
      {
          &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
          &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
          &set_label("pw_nc".$i,0);
      }

      &comment("");
      &add($a,32);
      &add($r,32);
      &sub($num,8);
      &jnz(&label("pw_nc_loop"));

      &mov($num,&wparam(4));                  # get dl
      &and($num,7);
      &jz(&label("pw_nc_end"));

      for ($i=0; $i<7; $i++)
      {
          &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
          &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
          &set_label("pw_tail_nc".$i,0);
          &dec($num) if ($i != 6);
          &jz(&label("pw_nc_end")) if ($i != 6);
      }

      &set_label("pw_nc_end",0);
      &mov($c,0);

      &set_label("pw_end",0);

      # &mov("eax",$c);                       # $c is "eax"

      &function_end($name);
  }