2
0

ghash-parisc.pl 17 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748
  1. #! /usr/bin/env perl
  2. # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # April 2010
  17. #
  18. # The module implements "4-bit" GCM GHASH function and underlying
  19. # single multiplication operation in GF(2^128). "4-bit" means that it
  20. # uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
  21. # it processes one byte in 19.6 cycles, which is more than twice as
  22. # fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
  23. # 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
  24. # processed byte. This is ~2.2x faster than 64-bit code generated by
  25. # vendor compiler (which used to be very hard to beat:-).
  26. #
  27. # Special thanks to polarhome.com for providing HP-UX account.
  # Command line: <flavour> [<output file>].
  # A flavour containing "64" selects the 64-bit PA-RISC 2.0W build; anything
  # else (including no flavour at all) selects the 32-bit build.
  $flavour = shift;
  $output  = shift;

  # Redirect STDOUT only when an output file was actually named, and report a
  # failure right away instead of ignoring it.  The three-argument form keeps
  # characters in the file name from being read as an open mode.  "-" keeps
  # its conventional meaning of "write to the existing STDOUT".
  if (defined $output && $output ne '' && $output ne '-') {
      open STDOUT, '>', $output
          or die "can't open $output: $!";
  }

  if ($flavour =~ /64/) {
      # 64-bit ABI: 8-byte registers, doubleword loads and stores.
      $LEVEL        = "2.0W";
      $SIZE_T       = 8;
      $FRAME_MARKER = 80;
      $SAVED_RP     = 16;
      $PUSH         = "std";
      $PUSHMA       = "std,ma";
      $POP          = "ldd";
      $POPMB        = "ldd,mb";
      $NREGS        = 6;
  } else {
      # 32-bit ABI: 4-byte registers, word loads and stores.  The module is
      # assembled at .LEVEL 1.0; a trailing ".ALLOW 2.0" is deliberately not
      # appended to $LEVEL (it used to read "\n\t.ALLOW\t2.0").
      $LEVEL        = "1.0";
      $SIZE_T       = 4;
      $FRAME_MARKER = 48;
      $SAVED_RP     = 20;
      $PUSH         = "stw";
      $PUSHMA       = "stwm";
      $POP          = "ldw";
      $POPMB        = "ldwm";
      $NREGS        = 11;
  }

  # Stack frame: room for the saved registers (a fixed ten register-sized
  # slots, not derived from $NREGS) plus the frame marker
  # [+ argument transfer].
  $FRAME = 10*$SIZE_T + $FRAME_MARKER;
  # Register allocation.  Every name below is interpolated into the assembly
  # templates that follow.

  ################# volatile registers
  # Argument block: Xi, Htbl, inp, len.
  ($Xi, $Htbl, $inp, $len) = map "%r$_", 26, 25, 24, 23;

  # Working variables.  $Hhh is simply another name for the $Htbl register.
  $Hhh = $Htbl;
  ($Hll, $Zhh, $Zll, $cnt)     = map "%r$_", 22, 21, 20, 19;
  ($rem_4bit, $rem, $mask0xf0) = map "%r$_", 28, 29, 31;

  ################# preserved registers
  ($Thh, $Tll, $nlo, $nhi, $byte) = map "%r$_", 1 .. 5;

  # The 32-bit flavour splits each 64-bit quantity in two and therefore needs
  # six additional names.
  if ($SIZE_T == 4) {
      ($Zhl, $Zlh, $Hhl, $Hlh, $Thl, $Tlh) = map "%r$_", 6 .. 11;
  }

  # Second remainder register, used in PA-RISC 2.0 code.  It is the same
  # hardware register as $Zhl above.
  $rem2 = "%r6";
  # Start of the generated assembly: .LEVEL/.SPACE/.SUBSPA directives followed
  # by the exported gcm_gmult_4bit entry point.  The prologue stores %r2 at
  # -$SAVED_RP(%sp), stores %r3 with the $PUSHMA (store-and-modify) form using
  # a displacement of $FRAME, and saves %r4-%r6 at offsets relative to -$FRAME.
  # Backticked arithmetic is left as text here; the output loop at the end of
  # this script evaluates it.
  82. $code.=<<___;
  83. .LEVEL $LEVEL
  84. .SPACE \$TEXT\$
  85. .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
  86. .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
  87. .ALIGN 64
  88. gcm_gmult_4bit
  89. .PROC
  90. .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
  91. .ENTRY
  92. $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
  93. $PUSHMA %r3,$FRAME(%sp)
  94. $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
  95. $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
  96. $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
  97. ___
  # 32-bit build only: the extra register names ($Zhl ... $Tlh) live in
  # %r6-%r11, so %r7-%r11 are saved as well.
  98. $code.=<<___ if ($SIZE_T==4);
  99. $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
  100. $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
  101. $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
  102. $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
  103. $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
  104. ___
  # Locate the L$rem_4bit table relative to the code: blr leaves an address in
  # $rem_4bit (presumably that of the L$pic_gmult label - TODO confirm), andcm
  # clears its two low bits using the mask 3 held in $rem, and ldo adds the
  # L$rem_4bit-L$pic_gmult distance.  $mask0xf0 receives the nibble mask 0xf0.
  # NOTE(review): the addl turns $len into inp+len although this entry point
  # exports only two arguments; it looks shared with the gcm_ghash_4bit setup
  # and unused here - confirm.
  105. $code.=<<___;
  106. blr %r0,$rem_4bit
  107. ldi 3,$rem
  108. L\$pic_gmult
  109. andcm $rem_4bit,$rem,$rem_4bit
  110. addl $inp,$len,$len
  111. ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
  112. ldi 0xf0,$mask0xf0
  113. ___
  # 32-bit build only: run-time choice between the two code paths.  Per the
  # inline comment the extrd,u,*= probe executes on PA-RISC 1.0, where the
  # following branch diverts to L$parisc1_gmult.  Presumably on 2.0 hardware
  # the probe nullifies that branch so execution falls into the 2.0 code
  # emitted next - TODO confirm against the architecture manual.
  114. $code.=<<___ if ($SIZE_T==4);
  115. ldi 31,$rem
  116. mtctl $rem,%cr11
  117. extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
  118. b L\$parisc1_gmult
  119. nop
  120. ___
  # PA-RISC 2.0 body of gcm_gmult_4bit, built from 64-bit ldd/std/shrpd/depd/
  # extrd (rewritten to .WORD by assemble() when $SIZE_T==4).  Xi is consumed
  # one byte at a time, starting at offset 15 and then 14, with the loop
  # fetching bytes 13 down to 0 through $cnt.  Each byte is split into two
  # nibble-derived offsets ($nlo, $nhi) used to load from the table halves at
  # $Hhh (Htbl+0) and $Hll (Htbl+8).  Around every 4-bit right shift of the
  # Zhh:Zll pair, the low nibble of $Zll selects an L$rem_4bit entry that is
  # XORed into $Zhh.  The two std at the end write the result back to Xi.
  # The statement order is hand-scheduled; do not reorder.
  121. $code.=<<___;
  122. ldb 15($Xi),$nlo
  123. ldo 8($Htbl),$Hll
  124. and $mask0xf0,$nlo,$nhi
  125. depd,z $nlo,59,4,$nlo
  126. ldd $nlo($Hll),$Zll
  127. ldd $nlo($Hhh),$Zhh
  128. depd,z $Zll,60,4,$rem
  129. shrpd $Zhh,$Zll,4,$Zll
  130. extrd,u $Zhh,59,60,$Zhh
  131. ldb 14($Xi),$nlo
  132. ldd $nhi($Hll),$Tll
  133. ldd $nhi($Hhh),$Thh
  134. and $mask0xf0,$nlo,$nhi
  135. depd,z $nlo,59,4,$nlo
  136. xor $Tll,$Zll,$Zll
  137. xor $Thh,$Zhh,$Zhh
  138. ldd $rem($rem_4bit),$rem
  139. b L\$oop_gmult_pa2
  140. ldi 13,$cnt
  141. .ALIGN 8
  142. L\$oop_gmult_pa2
  143. xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
  144. depd,z $Zll,60,4,$rem
  145. shrpd $Zhh,$Zll,4,$Zll
  146. extrd,u $Zhh,59,60,$Zhh
  147. ldd $nlo($Hll),$Tll
  148. ldd $nlo($Hhh),$Thh
  149. xor $Tll,$Zll,$Zll
  150. xor $Thh,$Zhh,$Zhh
  151. ldd $rem($rem_4bit),$rem
  152. xor $rem,$Zhh,$Zhh
  153. depd,z $Zll,60,4,$rem
  154. ldbx $cnt($Xi),$nlo
  155. shrpd $Zhh,$Zll,4,$Zll
  156. extrd,u $Zhh,59,60,$Zhh
  157. ldd $nhi($Hll),$Tll
  158. ldd $nhi($Hhh),$Thh
  159. and $mask0xf0,$nlo,$nhi
  160. depd,z $nlo,59,4,$nlo
  161. ldd $rem($rem_4bit),$rem
  162. xor $Tll,$Zll,$Zll
  163. addib,uv -1,$cnt,L\$oop_gmult_pa2
  164. xor $Thh,$Zhh,$Zhh
  165. xor $rem,$Zhh,$Zhh
  166. depd,z $Zll,60,4,$rem
  167. shrpd $Zhh,$Zll,4,$Zll
  168. extrd,u $Zhh,59,60,$Zhh
  169. ldd $nlo($Hll),$Tll
  170. ldd $nlo($Hhh),$Thh
  171. xor $Tll,$Zll,$Zll
  172. xor $Thh,$Zhh,$Zhh
  173. ldd $rem($rem_4bit),$rem
  174. xor $rem,$Zhh,$Zhh
  175. depd,z $Zll,60,4,$rem
  176. shrpd $Zhh,$Zll,4,$Zll
  177. extrd,u $Zhh,59,60,$Zhh
  178. ldd $nhi($Hll),$Tll
  179. ldd $nhi($Hhh),$Thh
  180. xor $Tll,$Zll,$Zll
  181. xor $Thh,$Zhh,$Zhh
  182. ldd $rem($rem_4bit),$rem
  183. xor $rem,$Zhh,$Zhh
  184. std $Zll,8($Xi)
  185. std $Zhh,0($Xi)
  186. ___
  # 32-bit build only.  The first two lines end the 2.0 path by branching to
  # L$done_gmult; the rest is L$parisc1_gmult, the same nibble-at-a-time
  # multiplication written with 32-bit word operations.  Z is held in four
  # registers ($Zhh, $Zhl, $Zlh, $Zll), the table is addressed through four
  # pointers ($Hhh = Htbl+0, $Hhl = Htbl+4, $Hlh = Htbl+8, $Hll = Htbl+12),
  # remainders come from L$rem_4bit via ldwx, and four stw write the result
  # back to Xi.  The loop fetches Xi bytes 13 down to 0 through $cnt.
  # The statement order is hand-scheduled; do not reorder.
  187. $code.=<<___ if ($SIZE_T==4);
  188. b L\$done_gmult
  189. nop
  190. L\$parisc1_gmult
  191. ldb 15($Xi),$nlo
  192. ldo 12($Htbl),$Hll
  193. ldo 8($Htbl),$Hlh
  194. ldo 4($Htbl),$Hhl
  195. and $mask0xf0,$nlo,$nhi
  196. zdep $nlo,27,4,$nlo
  197. ldwx $nlo($Hll),$Zll
  198. ldwx $nlo($Hlh),$Zlh
  199. ldwx $nlo($Hhl),$Zhl
  200. ldwx $nlo($Hhh),$Zhh
  201. zdep $Zll,28,4,$rem
  202. ldb 14($Xi),$nlo
  203. ldwx $rem($rem_4bit),$rem
  204. shrpw $Zlh,$Zll,4,$Zll
  205. ldwx $nhi($Hll),$Tll
  206. shrpw $Zhl,$Zlh,4,$Zlh
  207. ldwx $nhi($Hlh),$Tlh
  208. shrpw $Zhh,$Zhl,4,$Zhl
  209. ldwx $nhi($Hhl),$Thl
  210. extru $Zhh,27,28,$Zhh
  211. ldwx $nhi($Hhh),$Thh
  212. xor $rem,$Zhh,$Zhh
  213. and $mask0xf0,$nlo,$nhi
  214. zdep $nlo,27,4,$nlo
  215. xor $Tll,$Zll,$Zll
  216. ldwx $nlo($Hll),$Tll
  217. xor $Tlh,$Zlh,$Zlh
  218. ldwx $nlo($Hlh),$Tlh
  219. xor $Thl,$Zhl,$Zhl
  220. b L\$oop_gmult_pa1
  221. ldi 13,$cnt
  222. .ALIGN 8
  223. L\$oop_gmult_pa1
  224. zdep $Zll,28,4,$rem
  225. ldwx $nlo($Hhl),$Thl
  226. xor $Thh,$Zhh,$Zhh
  227. ldwx $rem($rem_4bit),$rem
  228. shrpw $Zlh,$Zll,4,$Zll
  229. ldwx $nlo($Hhh),$Thh
  230. shrpw $Zhl,$Zlh,4,$Zlh
  231. ldbx $cnt($Xi),$nlo
  232. xor $Tll,$Zll,$Zll
  233. ldwx $nhi($Hll),$Tll
  234. shrpw $Zhh,$Zhl,4,$Zhl
  235. xor $Tlh,$Zlh,$Zlh
  236. ldwx $nhi($Hlh),$Tlh
  237. extru $Zhh,27,28,$Zhh
  238. xor $Thl,$Zhl,$Zhl
  239. ldwx $nhi($Hhl),$Thl
  240. xor $rem,$Zhh,$Zhh
  241. zdep $Zll,28,4,$rem
  242. xor $Thh,$Zhh,$Zhh
  243. ldwx $nhi($Hhh),$Thh
  244. shrpw $Zlh,$Zll,4,$Zll
  245. ldwx $rem($rem_4bit),$rem
  246. shrpw $Zhl,$Zlh,4,$Zlh
  247. shrpw $Zhh,$Zhl,4,$Zhl
  248. and $mask0xf0,$nlo,$nhi
  249. extru $Zhh,27,28,$Zhh
  250. zdep $nlo,27,4,$nlo
  251. xor $Tll,$Zll,$Zll
  252. ldwx $nlo($Hll),$Tll
  253. xor $Tlh,$Zlh,$Zlh
  254. ldwx $nlo($Hlh),$Tlh
  255. xor $rem,$Zhh,$Zhh
  256. addib,uv -1,$cnt,L\$oop_gmult_pa1
  257. xor $Thl,$Zhl,$Zhl
  258. zdep $Zll,28,4,$rem
  259. ldwx $nlo($Hhl),$Thl
  260. xor $Thh,$Zhh,$Zhh
  261. ldwx $rem($rem_4bit),$rem
  262. shrpw $Zlh,$Zll,4,$Zll
  263. ldwx $nlo($Hhh),$Thh
  264. shrpw $Zhl,$Zlh,4,$Zlh
  265. xor $Tll,$Zll,$Zll
  266. ldwx $nhi($Hll),$Tll
  267. shrpw $Zhh,$Zhl,4,$Zhl
  268. xor $Tlh,$Zlh,$Zlh
  269. ldwx $nhi($Hlh),$Tlh
  270. extru $Zhh,27,28,$Zhh
  271. xor $rem,$Zhh,$Zhh
  272. xor $Thl,$Zhl,$Zhl
  273. ldwx $nhi($Hhl),$Thl
  274. xor $Thh,$Zhh,$Zhh
  275. ldwx $nhi($Hhh),$Thh
  276. zdep $Zll,28,4,$rem
  277. ldwx $rem($rem_4bit),$rem
  278. shrpw $Zlh,$Zll,4,$Zll
  279. shrpw $Zhl,$Zlh,4,$Zlh
  280. shrpw $Zhh,$Zhl,4,$Zhl
  281. extru $Zhh,27,28,$Zhh
  282. xor $Tll,$Zll,$Zll
  283. xor $Tlh,$Zlh,$Zlh
  284. xor $rem,$Zhh,$Zhh
  285. stw $Zll,12($Xi)
  286. xor $Thl,$Zhl,$Zhl
  287. stw $Zlh,8($Xi)
  288. xor $Thh,$Zhh,$Zhh
  289. stw $Zhl,4($Xi)
  290. stw $Zhh,0($Xi)
  291. ___
  # Common exit of gcm_gmult_4bit: reload %r2 from -$FRAME-$SAVED_RP(%sp) and
  # %r4-%r6 from the slots the prologue filled.  %r3 is not reloaded here; the
  # $POPMB that follows the return branch in the next template handles it.
  292. $code.=<<___;
  293. L\$done_gmult
  294. $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
  295. $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
  296. $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
  297. $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
  298. ___
  # 32-bit build only: reload the additional registers saved by the prologue.
  299. $code.=<<___ if ($SIZE_T==4);
  300. $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
  301. $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
  302. $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
  303. $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
  304. $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
  305. ___
  # Return from gcm_gmult_4bit: bv (%r2) followed by the $POPMB that reloads
  # %r3 (presumably executed in the branch delay slot - TODO confirm).  The
  # 64-bit output pass rewrites bv to bve.  The same template then opens the
  # exported gcm_ghash_4bit entry point with an identical prologue.
  # NOTE(review): this .CALLINFO hard-codes ENTRY_GR=11, whereas
  # gcm_gmult_4bit uses ENTRY_GR=$NREGS (6 for the 64-bit flavour) - confirm
  # which is intended.
  306. $code.=<<___;
  307. bv (%r2)
  308. .EXIT
  309. $POPMB -$FRAME(%sp),%r3
  310. .PROCEND
  311. .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
  312. .ALIGN 64
  313. gcm_ghash_4bit
  314. .PROC
  315. .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
  316. .ENTRY
  317. $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
  318. $PUSHMA %r3,$FRAME(%sp)
  319. $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
  320. $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
  321. $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
  322. ___
  # 32-bit build only: save %r7-%r11 as well.
  323. $code.=<<___ if ($SIZE_T==4);
  324. $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
  325. $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
  326. $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
  327. $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
  328. $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
  329. ___
  # Same table-address computation as in gcm_gmult_4bit, relative to the
  # L$pic_ghash label.  The addl turns $len into inp+len, the end pointer that
  # the outer loops compare $inp against.
  330. $code.=<<___;
  331. blr %r0,$rem_4bit
  332. ldi 3,$rem
  333. L\$pic_ghash
  334. andcm $rem_4bit,$rem,$rem_4bit
  335. addl $inp,$len,$len
  336. ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
  337. ldi 0xf0,$mask0xf0
  338. ___
  # 32-bit build only: the same run-time 1.0/2.0 probe as in gcm_gmult_4bit,
  # with the PA-RISC 1.0 path at L$parisc1_ghash.
  339. $code.=<<___ if ($SIZE_T==4);
  340. ldi 31,$rem
  341. mtctl $rem,%cr11
  342. extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
  343. b L\$parisc1_ghash
  344. nop
  345. ___
  # PA-RISC 2.0 body of gcm_ghash_4bit.  L$outer_ghash_pa2 handles one 16-byte
  # block per pass: every Xi byte is XORed with the input byte at the same
  # offset before being split into the $nlo/$nhi table offsets, and the
  # shift/lookup/XOR steps mirror gcm_gmult_4bit, with $rem2 serving as a
  # second remainder register next to $rem.  After the two std write the
  # result to Xi, $inp advances by 16 and cmpb loops until $inp equals $len
  # (cmpb,* becomes comb, in the 32-bit output pass).  The trailing copy puts
  # $Zll into $nlo, presumably so the next pass need not reload Xi[15] - TODO
  # confirm.  The statement order is hand-scheduled; do not reorder.
  346. $code.=<<___;
  347. ldb 15($Xi),$nlo
  348. ldo 8($Htbl),$Hll
  349. L\$outer_ghash_pa2
  350. ldb 15($inp),$nhi
  351. xor $nhi,$nlo,$nlo
  352. and $mask0xf0,$nlo,$nhi
  353. depd,z $nlo,59,4,$nlo
  354. ldd $nlo($Hll),$Zll
  355. ldd $nlo($Hhh),$Zhh
  356. depd,z $Zll,60,4,$rem
  357. shrpd $Zhh,$Zll,4,$Zll
  358. extrd,u $Zhh,59,60,$Zhh
  359. ldb 14($Xi),$nlo
  360. ldb 14($inp),$byte
  361. ldd $nhi($Hll),$Tll
  362. ldd $nhi($Hhh),$Thh
  363. xor $byte,$nlo,$nlo
  364. and $mask0xf0,$nlo,$nhi
  365. depd,z $nlo,59,4,$nlo
  366. xor $Tll,$Zll,$Zll
  367. xor $Thh,$Zhh,$Zhh
  368. ldd $rem($rem_4bit),$rem
  369. b L\$oop_ghash_pa2
  370. ldi 13,$cnt
  371. .ALIGN 8
  372. L\$oop_ghash_pa2
  373. xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
  374. depd,z $Zll,60,4,$rem2
  375. shrpd $Zhh,$Zll,4,$Zll
  376. extrd,u $Zhh,59,60,$Zhh
  377. ldd $nlo($Hll),$Tll
  378. ldd $nlo($Hhh),$Thh
  379. xor $Tll,$Zll,$Zll
  380. xor $Thh,$Zhh,$Zhh
  381. ldbx $cnt($Xi),$nlo
  382. ldbx $cnt($inp),$byte
  383. depd,z $Zll,60,4,$rem
  384. shrpd $Zhh,$Zll,4,$Zll
  385. ldd $rem2($rem_4bit),$rem2
  386. xor $rem2,$Zhh,$Zhh
  387. xor $byte,$nlo,$nlo
  388. ldd $nhi($Hll),$Tll
  389. ldd $nhi($Hhh),$Thh
  390. and $mask0xf0,$nlo,$nhi
  391. depd,z $nlo,59,4,$nlo
  392. extrd,u $Zhh,59,60,$Zhh
  393. xor $Tll,$Zll,$Zll
  394. ldd $rem($rem_4bit),$rem
  395. addib,uv -1,$cnt,L\$oop_ghash_pa2
  396. xor $Thh,$Zhh,$Zhh
  397. xor $rem,$Zhh,$Zhh
  398. depd,z $Zll,60,4,$rem2
  399. shrpd $Zhh,$Zll,4,$Zll
  400. extrd,u $Zhh,59,60,$Zhh
  401. ldd $nlo($Hll),$Tll
  402. ldd $nlo($Hhh),$Thh
  403. xor $Tll,$Zll,$Zll
  404. xor $Thh,$Zhh,$Zhh
  405. depd,z $Zll,60,4,$rem
  406. shrpd $Zhh,$Zll,4,$Zll
  407. ldd $rem2($rem_4bit),$rem2
  408. xor $rem2,$Zhh,$Zhh
  409. ldd $nhi($Hll),$Tll
  410. ldd $nhi($Hhh),$Thh
  411. extrd,u $Zhh,59,60,$Zhh
  412. xor $Tll,$Zll,$Zll
  413. xor $Thh,$Zhh,$Zhh
  414. ldd $rem($rem_4bit),$rem
  415. xor $rem,$Zhh,$Zhh
  416. std $Zll,8($Xi)
  417. ldo 16($inp),$inp
  418. std $Zhh,0($Xi)
  419. cmpb,*<> $inp,$len,L\$outer_ghash_pa2
  420. copy $Zll,$nlo
  421. ___
  # 32-bit build only.  The first two lines end the 2.0 path by branching to
  # L$done_ghash; the rest is L$parisc1_ghash, the 32-bit-word version of the
  # block loop.  As in the 1.0 gmult path, Z lives in $Zhh/$Zhl/$Zlh/$Zll and
  # the table is addressed through $Hhh/$Hhl/$Hlh/$Hll (Htbl+0/+4/+8/+12).
  # L$outer_ghash_pa1 XORs each Xi byte with the matching input byte, writes
  # the result to Xi with four stw, advances $inp by 16 and repeats until
  # $inp equals $len.  The statement order is hand-scheduled; do not reorder.
  422. $code.=<<___ if ($SIZE_T==4);
  423. b L\$done_ghash
  424. nop
  425. L\$parisc1_ghash
  426. ldb 15($Xi),$nlo
  427. ldo 12($Htbl),$Hll
  428. ldo 8($Htbl),$Hlh
  429. ldo 4($Htbl),$Hhl
  430. L\$outer_ghash_pa1
  431. ldb 15($inp),$byte
  432. xor $byte,$nlo,$nlo
  433. and $mask0xf0,$nlo,$nhi
  434. zdep $nlo,27,4,$nlo
  435. ldwx $nlo($Hll),$Zll
  436. ldwx $nlo($Hlh),$Zlh
  437. ldwx $nlo($Hhl),$Zhl
  438. ldwx $nlo($Hhh),$Zhh
  439. zdep $Zll,28,4,$rem
  440. ldb 14($Xi),$nlo
  441. ldb 14($inp),$byte
  442. ldwx $rem($rem_4bit),$rem
  443. shrpw $Zlh,$Zll,4,$Zll
  444. ldwx $nhi($Hll),$Tll
  445. shrpw $Zhl,$Zlh,4,$Zlh
  446. ldwx $nhi($Hlh),$Tlh
  447. shrpw $Zhh,$Zhl,4,$Zhl
  448. ldwx $nhi($Hhl),$Thl
  449. extru $Zhh,27,28,$Zhh
  450. ldwx $nhi($Hhh),$Thh
  451. xor $byte,$nlo,$nlo
  452. xor $rem,$Zhh,$Zhh
  453. and $mask0xf0,$nlo,$nhi
  454. zdep $nlo,27,4,$nlo
  455. xor $Tll,$Zll,$Zll
  456. ldwx $nlo($Hll),$Tll
  457. xor $Tlh,$Zlh,$Zlh
  458. ldwx $nlo($Hlh),$Tlh
  459. xor $Thl,$Zhl,$Zhl
  460. b L\$oop_ghash_pa1
  461. ldi 13,$cnt
  462. .ALIGN 8
  463. L\$oop_ghash_pa1
  464. zdep $Zll,28,4,$rem
  465. ldwx $nlo($Hhl),$Thl
  466. xor $Thh,$Zhh,$Zhh
  467. ldwx $rem($rem_4bit),$rem
  468. shrpw $Zlh,$Zll,4,$Zll
  469. ldwx $nlo($Hhh),$Thh
  470. shrpw $Zhl,$Zlh,4,$Zlh
  471. ldbx $cnt($Xi),$nlo
  472. xor $Tll,$Zll,$Zll
  473. ldwx $nhi($Hll),$Tll
  474. shrpw $Zhh,$Zhl,4,$Zhl
  475. ldbx $cnt($inp),$byte
  476. xor $Tlh,$Zlh,$Zlh
  477. ldwx $nhi($Hlh),$Tlh
  478. extru $Zhh,27,28,$Zhh
  479. xor $Thl,$Zhl,$Zhl
  480. ldwx $nhi($Hhl),$Thl
  481. xor $rem,$Zhh,$Zhh
  482. zdep $Zll,28,4,$rem
  483. xor $Thh,$Zhh,$Zhh
  484. ldwx $nhi($Hhh),$Thh
  485. shrpw $Zlh,$Zll,4,$Zll
  486. ldwx $rem($rem_4bit),$rem
  487. shrpw $Zhl,$Zlh,4,$Zlh
  488. xor $byte,$nlo,$nlo
  489. shrpw $Zhh,$Zhl,4,$Zhl
  490. and $mask0xf0,$nlo,$nhi
  491. extru $Zhh,27,28,$Zhh
  492. zdep $nlo,27,4,$nlo
  493. xor $Tll,$Zll,$Zll
  494. ldwx $nlo($Hll),$Tll
  495. xor $Tlh,$Zlh,$Zlh
  496. ldwx $nlo($Hlh),$Tlh
  497. xor $rem,$Zhh,$Zhh
  498. addib,uv -1,$cnt,L\$oop_ghash_pa1
  499. xor $Thl,$Zhl,$Zhl
  500. zdep $Zll,28,4,$rem
  501. ldwx $nlo($Hhl),$Thl
  502. xor $Thh,$Zhh,$Zhh
  503. ldwx $rem($rem_4bit),$rem
  504. shrpw $Zlh,$Zll,4,$Zll
  505. ldwx $nlo($Hhh),$Thh
  506. shrpw $Zhl,$Zlh,4,$Zlh
  507. xor $Tll,$Zll,$Zll
  508. ldwx $nhi($Hll),$Tll
  509. shrpw $Zhh,$Zhl,4,$Zhl
  510. xor $Tlh,$Zlh,$Zlh
  511. ldwx $nhi($Hlh),$Tlh
  512. extru $Zhh,27,28,$Zhh
  513. xor $rem,$Zhh,$Zhh
  514. xor $Thl,$Zhl,$Zhl
  515. ldwx $nhi($Hhl),$Thl
  516. xor $Thh,$Zhh,$Zhh
  517. ldwx $nhi($Hhh),$Thh
  518. zdep $Zll,28,4,$rem
  519. ldwx $rem($rem_4bit),$rem
  520. shrpw $Zlh,$Zll,4,$Zll
  521. shrpw $Zhl,$Zlh,4,$Zlh
  522. shrpw $Zhh,$Zhl,4,$Zhl
  523. extru $Zhh,27,28,$Zhh
  524. xor $Tll,$Zll,$Zll
  525. xor $Tlh,$Zlh,$Zlh
  526. xor $rem,$Zhh,$Zhh
  527. stw $Zll,12($Xi)
  528. xor $Thl,$Zhl,$Zhl
  529. stw $Zlh,8($Xi)
  530. xor $Thh,$Zhh,$Zhh
  531. stw $Zhl,4($Xi)
  532. ldo 16($inp),$inp
  533. stw $Zhh,0($Xi)
  534. comb,<> $inp,$len,L\$outer_ghash_pa1
  535. copy $Zll,$nlo
  536. ___
  # Common exit of gcm_ghash_4bit: reload %r2 and %r4-%r6 from the slots the
  # prologue filled.
  537. $code.=<<___;
  538. L\$done_ghash
  539. $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
  540. $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
  541. $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
  542. $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
  543. ___
  # 32-bit build only: reload the additional registers saved by the prologue.
  544. $code.=<<___ if ($SIZE_T==4);
  545. $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
  546. $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
  547. $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
  548. $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
  549. $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
  550. ___
  # Return sequence, then the 64-byte-aligned L$rem_4bit table: sixteen
  # entries of two words each, the first word holding a 16-bit constant
  # shifted left by 16 and the second word zero.  The 2.0 code reads entries
  # with ldd, the 1.0 code with ldwx.  An identification string closes the
  # module.
  # NOTE(review): the string spells "GRYPTOGAMS" while the header comment of
  # this file says "CRYPTOGAMS".  It is emitted into the object file, so
  # confirm before changing it.
  551. $code.=<<___;
  552. bv (%r2)
  553. .EXIT
  554. $POPMB -$FRAME(%sp),%r3
  555. .PROCEND
  556. .ALIGN 64
  557. L\$rem_4bit
  558. .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
  559. .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
  560. .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
  561. .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
  562. .STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
  563. .ALIGN 64
  564. ___
  565. # Explicitly encode PA-RISC 2.0 instructions used in this module, so
  566. # that it can be compiled with .LEVEL 1.0. It should be noted that I
  567. # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
  568. # directive...
  # Encoder for "ldd": turns the operand text into a raw .WORD so the
  # instruction can be assembled at .LEVEL 1.0.
  #   $mod  - completer text following the mnemonic (may be empty)
  #   $args - operand text
  # Returns the output line; operands matching neither supported shape are
  # passed through as ordinary "ldd" text.
  my $ldd = sub {
      my ($mod, $args) = @_;
      my $text = "ldd$mod\t$args";    # original spelling, kept as a comment

      # Format 4: register-indexed, "%rX(%rB),%rT".
      if (my ($index_reg, $base_reg, $dest_reg) =
              $args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
          my $word = (0x03<<26) | ($base_reg<<21) | ($index_reg<<16)
                   | (3<<6) | $dest_reg;
          return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
      }

      # Format 5: immediate offset, "disp(%rB),%rT".
      if (my ($disp, $base_reg, $dest_reg) =
              $args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
          my $word = (0x03<<26) | ($base_reg<<21) | (1<<12) | (3<<6) | $dest_reg;
          # Offset field: low four bits at bit 17, fifth bit at bit 16.
          $word |= (($disp & 0xF) << 17) | (($disp & 0x10) << 12);
          # Completers starting with ",m" set bit 5; ",mb" also sets bit 13.
          $word |= (1<<5)  if ($mod =~ /^,m/);
          $word |= (1<<13) if ($mod =~ /^,mb/);
          return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
      }

      return "\t" . $text;
  };
  # Encoder for "std": emits a raw .WORD for the "%rS,disp(%rB)" shape
  # (format 3 suffices for this module).  $mod appears only in the echoed
  # comment and does not affect the encoding.  Any other operand shape is
  # passed through as ordinary "std" text.
  my $std = sub {
      my ($mod, $args) = @_;
      my $text = "std$mod\t$args";    # original spelling, kept as a comment

      my ($src_reg, $disp, $base_reg) =
          $args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/
              or return "\t" . $text;

      my $word = (0x1c<<26) | ($base_reg<<21) | ($src_reg<<16)
               | (($disp & 0x1FF8) << 1)     # displacement bits 3..12
               | (($disp >> 13) & 1);        # displacement bit 13
      return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
  };
  # Encoder for "extrd": emits a raw .WORD for the two operand shapes this
  # module uses.  Only the ",u" completer occurs here, so it is encoded
  # implicitly rather than parsed.  Any other operand shape is passed through
  # as ordinary "extrd" text.
  my $extrd = sub {
      my ($mod, $args) = @_;
      my $text = "extrd$mod\t$args";  # original spelling, kept as a comment

      # Format 15: fixed position, "%rS,pos,len,%rT".
      if (my ($src_reg, $pos, $width, $dest_reg) =
              $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {
          my $clen = 32 - $width;
          my $word = (0x36<<26) | ($src_reg<<21) | ($dest_reg<<16);
          $word |= (($pos  & 0x20) << 6) | (($pos & 0x1f) << 5);   # encode pos
          $word |= (($clen & 0x20) << 7) | ($clen & 0x1f);         # encode len
          return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
      }

      # Format 12: position taken from %sar, "%rS,%sar,len,%rT".
      if (my ($src_reg, $width, $dest_reg) =
              $args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {
          my $clen = 32 - $width;
          my $word = (0x34<<26) | ($src_reg<<21) | ($dest_reg<<16)
                   | (2<<11) | (1<<9);
          $word |= (($clen & 0x20) << 3) | ($clen & 0x1f);         # encode len
          # An "=" condition (",=" or ",*=") sets bit 13.
          $word |= (1<<13) if ($mod =~ /,\**=/);
          return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
      }

      return "\t" . $text;
  };
  # Encoder for "shrpd": emits a raw .WORD for either a literal shift amount
  # or a shift amount taken from %sar.  $mod appears only in the echoed
  # comment.  Any other operand shape is passed through as ordinary "shrpd"
  # text.
  my $shrpd = sub {
      my ($mod, $args) = @_;
      my $text = "shrpd$mod\t$args";  # original spelling, kept as a comment

      # Format 14: literal shift amount, "%rA,%rB,sa,%rT".
      if (my ($src1, $src2, $shift, $dest_reg) =
              $args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {
          my $cpos = 63 - $shift;
          my $word = (0x34<<26) | ($src2<<21) | ($src1<<16) | (1<<10) | $dest_reg;
          $word |= (($cpos & 0x20) << 6) | (($cpos & 0x1f) << 5);  # encode sa
          return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
      }

      # Format 11: shift amount in %sar, "%rA,%rB,%sar,%rT".
      if (my ($src1, $src2, $dest_reg) =
              $args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) {
          my $word = (0x34<<26) | ($src2<<21) | ($src1<<16) | (1<<9) | $dest_reg;
          return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
      }

      return "\t" . $text;
  };
  # Encoder for "depd": emits a raw .WORD for the "%rS,pos,len,%rT" shape
  # (format 16).  Only the ",z" completer occurs in this module, so it is
  # encoded implicitly rather than parsed.  Any other operand shape is passed
  # through as ordinary "depd" text.
  my $depd = sub {
      my ($mod, $args) = @_;
      my $text = "depd$mod\t$args";   # original spelling, kept as a comment

      my ($src_reg, $pos, $width, $dest_reg) =
          $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/
              or return "\t" . $text;

      my $cpos = 63 - $pos;
      my $clen = 32 - $width;
      my $word = (0x3c<<26) | ($dest_reg<<21) | ($src_reg<<16);
      $word |= (($cpos & 0x20) << 6) | (($cpos & 0x1f) << 5);      # encode pos
      $word |= (($clen & 0x20) << 7) | ($clen & 0x1f);             # encode len
      return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
  };
  # Rewrite one instruction for the 32-bit output pass.
  #   $mnemonic - instruction name
  #   $mod      - completer text following the name (may be empty)
  #   $args     - operand text
  # If a scalar variable named after the mnemonic holds a code reference
  # (for example $ldd or $std), that encoder produces the line; otherwise the
  # instruction is returned as text with a leading tab.
  sub assemble {
      my ($mnemonic, $mod, $args) = @_;

      # Look the encoder up by name; a mnemonic with no such variable simply
      # evaluates to something that is not a code reference.
      my $encoder = eval("\$$mnemonic");

      return $encoder->($mod, $args) if ref($encoder) eq 'CODE';
      return "\t$mnemonic$mod\t$args";
  }
  # Ask the compiler driver named by $ENV{CC} to run its assembler verbosely
  # on an empty input and remember whether it identifies itself as GNU as.
  my $as_banner = `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`;
  $gnuas = 1 if ($as_banner =~ /GNU assembler/);

  # Directive rewrites below apply only to the 64-bit flavour under GNU as.
  my $gnu_64bit = ($gnuas && $SIZE_T==8);

  # Post-process the accumulated template one line at a time and print it.
  for my $line (split("\n",$code)) {
      # Replace each backticked expression with its evaluated value.
      $line =~ s/\`([^\`]*)\`/eval $1/ge;

      if ($SIZE_T==4) {
          # 32-bit flavour: hand every instruction to assemble(), which may
          # turn it into a raw .WORD, then drop the "*" from condition
          # completers.
          $line =~ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
          $line =~ s/cmpb,\*/comb,/;
          $line =~ s/,\*/,/;
      }

      if ($gnu_64bit) {
          $line =~ s/(\.LEVEL\s+2\.0)W/$1w/;
          $line =~ s/\.SPACE\s+\$TEXT\$/.text/;
          $line =~ s/\.SUBSPA.*//;
      }

      # The 64-bit flavour uses bve in place of bv.
      $line =~ s/\bbv\b/bve/ if ($SIZE_T==8);

      print $line,"\n";
  }

  # Buffered write errors surface here, so the close is checked.
  close STDOUT or die "error closing STDOUT: $!";