cmllt4-sparcv9.pl 23 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by David S. Miller and Andy Polyakov.
  10. # The module is licensed under 2-clause BSD
  11. # license. October 2012. All rights reserved.
  12. # ====================================================================
  13. ######################################################################
  14. # Camellia for SPARC T4.
  15. #
  16. # As with AES, the results below [for aligned data] are virtually
  17. # identical to critical path lengths for 3-cycle instruction latency:
  18. #
  19. #               128-bit key     192/256-bit key
  20. # CBC encrypt   4.14/4.21(*)    5.46/5.52
  21. #               (*) numbers after slash are for
  22. #                   misaligned data.
  23. #
  24. # As with Intel AES-NI, the question is whether it's possible to improve
  25. # performance of parallelizable modes by interleaving round
  26. # instructions. In Camellia every instruction is dependent on the
  27. # previous one, which means that there is room for 2 additional ones
  28. # in between two dependent ones. Can we expect 3x performance improvement?
  29. # At least one can argue that it should be possible to break the 2x
  30. # barrier... For some reason not even 2x appears to be possible:
  31. #
  32. #               128-bit key     192/256-bit key
  33. # CBC decrypt   2.21/2.74       2.99/3.40
  34. # CTR           2.15/2.68(*)    2.93/3.34
  35. #               (*) numbers after slash are for
  36. #                   misaligned data.
  37. #
  38. # This is for 2x interleave. But compared to 1x interleave CBC decrypt
  39. # improved by ... 0% for 128-bit key, and 11% for 192/256-bit one.
  40. # So out-of-order execution logic can take non-interleaved code
  41. # to 1.87x, but can't take the 2x interleaved one any further. There
  42. # surely is some explanation... As a result 3x interleave was not even
  43. # attempted. Instead an effort was made to share specific modes
  44. # implementations with the AES module (hence sparcv9_modes.pl).
  45. #
  46. # To anchor to something else, software C implementation processes
  47. # one byte in 38 cycles with 128-bit key on same processor.
  # Work out the directory this script was started from so that the shared
  # perlasm helper can be found either next to it or in ../../perlasm.
  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  push(@INC,"${dir}","${dir}../../perlasm");
  require "sparcv9_modes.pl";

  # The last command-line argument names the output file.  The
  # three-argument form of open keeps the file name from being parsed for
  # mode characters, and the result is checked so that a failure to create
  # the file is reported instead of going unnoticed.  When no file name is
  # given STDOUT is left untouched.
  $output = pop;
  if (defined $output && length $output) {
      open STDOUT, '>', $output or die "can't open $output: $!";
  }

  $::evp=1; # if $evp is set to 0, script generates module with
            # Camellia_[en|de]crypt, Camellia_set_key and Camellia_cbc_encrypt
            # entry points. These are fully compatible with openssl/camellia.h.
  56. ######################################################################
  57. # single-round subroutines
  58. #
  59. {
  # This scope emits two leaf routines, cmll_t4_encrypt and
  # cmll_t4_decrypt, each of which transforms one 16-byte block.
  #
  # Register aliases interpolated into the assembly text below:
  #   $inp = %o0 (input address)      $rounds = %o3 (loop counter)
  #   $out = %o1 (output address)     $tmp    = %o4
  #   $key = %o2 (key schedule)       $mask   = %o5
  # The text also names %o3-%o5 and %g1/%g4/%g5 literally, so the aliases
  # and the literal register numbers have to be kept in sync by hand.
  60. my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
  # Outline of both routines, as far as the text below shows:
  #  1. Input: the address is rounded down to a multiple of 8 and read with
  #     ldx; when the low three address bits are non-zero a third
  #     doubleword is read and the block is reassembled with sllx/srlx/or.
  #  2. The block is XORed with two 64-bit key words and moved into
  #     %f0/%f2 with movxtod.
  #  3. The word at key+272 ("grandRounds, 3 or 4") determines how often
  #     the .Lenc/.Ldec loop runs (grandRounds-1 times); every pass issues
  #     six camellia_f plus camellia_fl/camellia_fli and reloads %f12-%f26
  #     with the next 64 bytes of key material.  A final group of six
  #     camellia_f and two fxor follows the loop.
  #  4. Output: two std when the address is 8-byte aligned, otherwise
  #     faligndata plus two partial stores (stda ... 0xc0) around one std.
  # cmll_t4_encrypt reads the schedule upwards from key+0; cmll_t4_decrypt
  # first advances $key by 64*grandRounds and reads it downwards, and its
  # loop counter is kept in units of 64.
  #
  # Note that this is a plain assignment: $code starts out fresh here and
  # the later sections append to it with .=
  61. $code=<<___;
  62. #include "sparc_arch.h"
  63. .text
  64. .globl cmll_t4_encrypt
  65. .align 32
  66. cmll_t4_encrypt:
  67. andcc $inp, 7, %g1 ! is input aligned?
  68. andn $inp, 7, $inp
  69. ldx [$key + 0], %g4
  70. ldx [$key + 8], %g5
  71. ldx [$inp + 0], %o4
  72. bz,pt %icc, 1f
  73. ldx [$inp + 8], %o5
  74. ldx [$inp + 16], $inp
  75. sll %g1, 3, %g1
  76. sub %g0, %g1, %o3
  77. sllx %o4, %g1, %o4
  78. sllx %o5, %g1, %g1
  79. srlx %o5, %o3, %o5
  80. srlx $inp, %o3, %o3
  81. or %o5, %o4, %o4
  82. or %o3, %g1, %o5
  83. 1:
  84. ld [$key + 272], $rounds ! grandRounds, 3 or 4
  85. ldd [$key + 16], %f12
  86. ldd [$key + 24], %f14
  87. xor %g4, %o4, %o4
  88. xor %g5, %o5, %o5
  89. ldd [$key + 32], %f16
  90. ldd [$key + 40], %f18
  91. movxtod %o4, %f0
  92. movxtod %o5, %f2
  93. ldd [$key + 48], %f20
  94. ldd [$key + 56], %f22
  95. sub $rounds, 1, $rounds
  96. ldd [$key + 64], %f24
  97. ldd [$key + 72], %f26
  98. add $key, 80, $key
  99. .Lenc:
  100. camellia_f %f12, %f2, %f0, %f2
  101. ldd [$key + 0], %f12
  102. sub $rounds,1,$rounds
  103. camellia_f %f14, %f0, %f2, %f0
  104. ldd [$key + 8], %f14
  105. camellia_f %f16, %f2, %f0, %f2
  106. ldd [$key + 16], %f16
  107. camellia_f %f18, %f0, %f2, %f0
  108. ldd [$key + 24], %f18
  109. camellia_f %f20, %f2, %f0, %f2
  110. ldd [$key + 32], %f20
  111. camellia_f %f22, %f0, %f2, %f0
  112. ldd [$key + 40], %f22
  113. camellia_fl %f24, %f0, %f0
  114. ldd [$key + 48], %f24
  115. camellia_fli %f26, %f2, %f2
  116. ldd [$key + 56], %f26
  117. brnz,pt $rounds, .Lenc
  118. add $key, 64, $key
  119. andcc $out, 7, $tmp ! is output aligned?
  120. camellia_f %f12, %f2, %f0, %f2
  121. camellia_f %f14, %f0, %f2, %f0
  122. camellia_f %f16, %f2, %f0, %f2
  123. camellia_f %f18, %f0, %f2, %f0
  124. camellia_f %f20, %f2, %f0, %f4
  125. camellia_f %f22, %f0, %f4, %f2
  126. fxor %f24, %f4, %f0
  127. fxor %f26, %f2, %f2
  128. bnz,pn %icc, 2f
  129. nop
  130. std %f0, [$out + 0]
  131. retl
  132. std %f2, [$out + 8]
  133. 2: alignaddrl $out, %g0, $out
  134. mov 0xff, $mask
  135. srl $mask, $tmp, $mask
  136. faligndata %f0, %f0, %f4
  137. faligndata %f0, %f2, %f6
  138. faligndata %f2, %f2, %f8
  139. stda %f4, [$out + $mask]0xc0 ! partial store
  140. std %f6, [$out + 8]
  141. add $out, 16, $out
  142. orn %g0, $mask, $mask
  143. retl
  144. stda %f8, [$out + $mask]0xc0 ! partial store
  145. .type cmll_t4_encrypt,#function
  146. .size cmll_t4_encrypt,.-cmll_t4_encrypt
  147. .globl cmll_t4_decrypt
  148. .align 32
  149. cmll_t4_decrypt:
  150. ld [$key + 272], $rounds ! grandRounds, 3 or 4
  151. andcc $inp, 7, %g1 ! is input aligned?
  152. andn $inp, 7, $inp
  153. sll $rounds, 6, $rounds
  154. add $rounds, $key, $key
  155. ldx [$inp + 0], %o4
  156. bz,pt %icc, 1f
  157. ldx [$inp + 8], %o5
  158. ldx [$inp + 16], $inp
  159. sll %g1, 3, %g1
  160. sub %g0, %g1, %g4
  161. sllx %o4, %g1, %o4
  162. sllx %o5, %g1, %g1
  163. srlx %o5, %g4, %o5
  164. srlx $inp, %g4, %g4
  165. or %o5, %o4, %o4
  166. or %g4, %g1, %o5
  167. 1:
  168. ldx [$key + 0], %g4
  169. ldx [$key + 8], %g5
  170. ldd [$key - 8], %f12
  171. ldd [$key - 16], %f14
  172. xor %g4, %o4, %o4
  173. xor %g5, %o5, %o5
  174. ldd [$key - 24], %f16
  175. ldd [$key - 32], %f18
  176. movxtod %o4, %f0
  177. movxtod %o5, %f2
  178. ldd [$key - 40], %f20
  179. ldd [$key - 48], %f22
  180. sub $rounds, 64, $rounds
  181. ldd [$key - 56], %f24
  182. ldd [$key - 64], %f26
  183. sub $key, 64, $key
  184. .Ldec:
  185. camellia_f %f12, %f2, %f0, %f2
  186. ldd [$key - 8], %f12
  187. sub $rounds, 64, $rounds
  188. camellia_f %f14, %f0, %f2, %f0
  189. ldd [$key - 16], %f14
  190. camellia_f %f16, %f2, %f0, %f2
  191. ldd [$key - 24], %f16
  192. camellia_f %f18, %f0, %f2, %f0
  193. ldd [$key - 32], %f18
  194. camellia_f %f20, %f2, %f0, %f2
  195. ldd [$key - 40], %f20
  196. camellia_f %f22, %f0, %f2, %f0
  197. ldd [$key - 48], %f22
  198. camellia_fl %f24, %f0, %f0
  199. ldd [$key - 56], %f24
  200. camellia_fli %f26, %f2, %f2
  201. ldd [$key - 64], %f26
  202. brnz,pt $rounds, .Ldec
  203. sub $key, 64, $key
  204. andcc $out, 7, $tmp ! is output aligned?
  205. camellia_f %f12, %f2, %f0, %f2
  206. camellia_f %f14, %f0, %f2, %f0
  207. camellia_f %f16, %f2, %f0, %f2
  208. camellia_f %f18, %f0, %f2, %f0
  209. camellia_f %f20, %f2, %f0, %f4
  210. camellia_f %f22, %f0, %f4, %f2
  211. fxor %f26, %f4, %f0
  212. fxor %f24, %f2, %f2
  213. bnz,pn %icc, 2f
  214. nop
  215. std %f0, [$out + 0]
  216. retl
  217. std %f2, [$out + 8]
  218. 2: alignaddrl $out, %g0, $out
  219. mov 0xff, $mask
  220. srl $mask, $tmp, $mask
  221. faligndata %f0, %f0, %f4
  222. faligndata %f0, %f2, %f6
  223. faligndata %f2, %f2, %f8
  224. stda %f4, [$out + $mask]0xc0 ! partial store
  225. std %f6, [$out + 8]
  226. add $out, 16, $out
  227. orn %g0, $mask, $mask
  228. retl
  229. stda %f8, [$out + $mask]0xc0 ! partial store
  230. .type cmll_t4_decrypt,#function
  231. .size cmll_t4_decrypt,.-cmll_t4_decrypt
  232. ___
  233. }
  234. ######################################################################
  235. # key setup subroutines
  236. #
  237. {
  # ROTL128(rot [, hi, lo, tmp_hi, tmp_lo])
  #
  # Returns the assembly text (instructions joined by "\n\t") that rotates
  # the 128-bit quantity held in the 64-bit register pair hi:lo left by
  # rot bits, using tmp_hi and tmp_lo as scratch registers.
  #
  #   rot     rotation count; it is emitted verbatim, and the matching
  #           right-shift count is emitted as the expression "64-rot" for
  #           the assembler to evaluate, so it must satisfy 0 < rot < 64
  #   hi, lo  registers holding the upper/lower half (default %o4, %o5)
  #   tmp_hi, tmp_lo
  #           scratch registers receiving the bits shifted out of hi and
  #           lo respectively (default %g4, %g5)
  #
  # Called with the rotation count alone it produces exactly the text the
  # key-setup code has always used; the register arguments only exist so
  # the same sequence can be generated for other register assignments.
  sub ROTL128 {
      my ($rot, $hi, $lo, $tmp_hi, $tmp_lo) = @_;

      $hi     = '%o4' unless defined $hi;
      $lo     = '%o5' unless defined $lo;
      $tmp_hi = '%g4' unless defined $tmp_hi;
      $tmp_lo = '%g5' unless defined $tmp_lo;

      return join("\n\t",
          "srlx $hi, 64-$rot, $tmp_hi",   # bits leaving the top of hi
          "sllx $hi, $rot, $hi",
          "srlx $lo, 64-$rot, $tmp_lo",   # bits leaving the top of lo
          "sllx $lo, $rot, $lo",
          "or $hi, $tmp_lo, $hi",         # lo's carry-out enters hi
          "or $lo, $tmp_hi, $lo");        # hi's carry-out wraps into lo
  }
  # cmll_t4_set_key: key-schedule setup.
  # Register aliases: $inp = %o0 (user key address), $bits = %o1 (key
  # length in bits), $out = %o2 (key schedule address), $tmp = %o3.
  # map() produces six names here; the last two are simply discarded.
  247. my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
  # What the text below does, in order:
  #  - Loads the user key with ldd/faligndata, so any alignment is
  #    accepted, and dispatches on $bits: below 192 -> .L128,
  #    equal -> .L192, otherwise the 256-bit path.  For 192 bits the fourth
  #    64-bit quarter is not taken from memory but produced from the third
  #    one with fnot2.
  #  - Stores the first 16 key bytes as k[0..3] and keeps a copy in
  #    %f28/%f30.
  #  - Locates the SIGMA table PC-relatively (call .+8, with %o7 saved in
  #    and restored from %o5) and runs camellia_f with the SIGMA constants
  #    as key operands.
  #  - The "bge ... .L256key" after those camellia_f still acts on the
  #    condition codes set by "cmp $bits, 192" at the top; nothing in
  #    between writes %icc.
  #  - Fills the rest of the schedule from 128-bit values rotated with
  #    ROTL128.  The `&ROTL128(n)` fragments are not expanded here; they
  #    stay in $code as text (presumably expanded when the code is
  #    emitted - confirm in sparcv9_modes.pl).
  #  - Writes the grand-round count at offset 0x110 (= 272, where
  #    cmll_t4_encrypt/cmll_t4_decrypt read it): 3 on the path for keys
  #    shorter than 192 bits, 4 on the .L256key path.  Both paths return
  #    with %o0 = 0.
  # The "! k[a, b]" annotations give the 32-bit word indices of the
  # schedule slot being written (byte offset / 4).
  248. $code.=<<___;
  249. .globl cmll_t4_set_key
  250. .align 32
  251. cmll_t4_set_key:
  252. and $inp, 7, $tmp
  253. alignaddr $inp, %g0, $inp
  254. cmp $bits, 192
  255. ldd [$inp + 0], %f0
  256. bl,pt %icc,.L128
  257. ldd [$inp + 8], %f2
  258. be,pt %icc,.L192
  259. ldd [$inp + 16], %f4
  260. brz,pt $tmp, .L256aligned
  261. ldd [$inp + 24], %f6
  262. ldd [$inp + 32], %f8
  263. faligndata %f0, %f2, %f0
  264. faligndata %f2, %f4, %f2
  265. faligndata %f4, %f6, %f4
  266. b .L256aligned
  267. faligndata %f6, %f8, %f6
  268. .align 16
  269. .L192:
  270. brz,a,pt $tmp, .L256aligned
  271. fnot2 %f4, %f6
  272. ldd [$inp + 24], %f6
  273. nop
  274. faligndata %f0, %f2, %f0
  275. faligndata %f2, %f4, %f2
  276. faligndata %f4, %f6, %f4
  277. fnot2 %f4, %f6
  278. .L256aligned:
  279. std %f0, [$out + 0] ! k[0, 1]
  280. fsrc2 %f0, %f28
  281. std %f2, [$out + 8] ! k[2, 3]
  282. fsrc2 %f2, %f30
  283. fxor %f4, %f0, %f0
  284. b .L128key
  285. fxor %f6, %f2, %f2
  286. .align 16
  287. .L128:
  288. brz,pt $tmp, .L128aligned
  289. nop
  290. ldd [$inp + 16], %f4
  291. nop
  292. faligndata %f0, %f2, %f0
  293. faligndata %f2, %f4, %f2
  294. .L128aligned:
  295. std %f0, [$out + 0] ! k[0, 1]
  296. fsrc2 %f0, %f28
  297. std %f2, [$out + 8] ! k[2, 3]
  298. fsrc2 %f2, %f30
  299. .L128key:
  300. mov %o7, %o5
  301. 1: call .+8
  302. add %o7, SIGMA-1b, %o4
  303. mov %o5, %o7
  304. ldd [%o4 + 0], %f16
  305. ldd [%o4 + 8], %f18
  306. ldd [%o4 + 16], %f20
  307. ldd [%o4 + 24], %f22
  308. camellia_f %f16, %f2, %f0, %f2
  309. camellia_f %f18, %f0, %f2, %f0
  310. fxor %f28, %f0, %f0
  311. fxor %f30, %f2, %f2
  312. camellia_f %f20, %f2, %f0, %f2
  313. camellia_f %f22, %f0, %f2, %f0
  314. bge,pn %icc, .L256key
  315. nop
  316. std %f0, [$out + 0x10] ! k[ 4, 5]
  317. std %f2, [$out + 0x18] ! k[ 6, 7]
  318. movdtox %f0, %o4
  319. movdtox %f2, %o5
  320. `&ROTL128(15)`
  321. stx %o4, [$out + 0x30] ! k[12, 13]
  322. stx %o5, [$out + 0x38] ! k[14, 15]
  323. `&ROTL128(15)`
  324. stx %o4, [$out + 0x40] ! k[16, 17]
  325. stx %o5, [$out + 0x48] ! k[18, 19]
  326. `&ROTL128(15)`
  327. stx %o4, [$out + 0x60] ! k[24, 25]
  328. `&ROTL128(15)`
  329. stx %o4, [$out + 0x70] ! k[28, 29]
  330. stx %o5, [$out + 0x78] ! k[30, 31]
  331. `&ROTL128(34)`
  332. stx %o4, [$out + 0xa0] ! k[40, 41]
  333. stx %o5, [$out + 0xa8] ! k[42, 43]
  334. `&ROTL128(17)`
  335. stx %o4, [$out + 0xc0] ! k[48, 49]
  336. stx %o5, [$out + 0xc8] ! k[50, 51]
  337. movdtox %f28, %o4 ! k[ 0, 1]
  338. movdtox %f30, %o5 ! k[ 2, 3]
  339. `&ROTL128(15)`
  340. stx %o4, [$out + 0x20] ! k[ 8, 9]
  341. stx %o5, [$out + 0x28] ! k[10, 11]
  342. `&ROTL128(30)`
  343. stx %o4, [$out + 0x50] ! k[20, 21]
  344. stx %o5, [$out + 0x58] ! k[22, 23]
  345. `&ROTL128(15)`
  346. stx %o5, [$out + 0x68] ! k[26, 27]
  347. `&ROTL128(17)`
  348. stx %o4, [$out + 0x80] ! k[32, 33]
  349. stx %o5, [$out + 0x88] ! k[34, 35]
  350. `&ROTL128(17)`
  351. stx %o4, [$out + 0x90] ! k[36, 37]
  352. stx %o5, [$out + 0x98] ! k[38, 39]
  353. `&ROTL128(17)`
  354. stx %o4, [$out + 0xb0] ! k[44, 45]
  355. stx %o5, [$out + 0xb8] ! k[46, 47]
  356. mov 3, $tmp
  357. st $tmp, [$out + 0x110]
  358. retl
  359. xor %o0, %o0, %o0
  360. .align 16
  361. .L256key:
  362. ldd [%o4 + 32], %f24
  363. ldd [%o4 + 40], %f26
  364. std %f0, [$out + 0x30] ! k[12, 13]
  365. std %f2, [$out + 0x38] ! k[14, 15]
  366. fxor %f4, %f0, %f0
  367. fxor %f6, %f2, %f2
  368. camellia_f %f24, %f2, %f0, %f2
  369. camellia_f %f26, %f0, %f2, %f0
  370. std %f0, [$out + 0x10] ! k[ 4, 5]
  371. std %f2, [$out + 0x18] ! k[ 6, 7]
  372. movdtox %f0, %o4
  373. movdtox %f2, %o5
  374. `&ROTL128(30)`
  375. stx %o4, [$out + 0x50] ! k[20, 21]
  376. stx %o5, [$out + 0x58] ! k[22, 23]
  377. `&ROTL128(30)`
  378. stx %o4, [$out + 0xa0] ! k[40, 41]
  379. stx %o5, [$out + 0xa8] ! k[42, 43]
  380. `&ROTL128(51)`
  381. stx %o4, [$out + 0x100] ! k[64, 65]
  382. stx %o5, [$out + 0x108] ! k[66, 67]
  383. movdtox %f4, %o4 ! k[ 8, 9]
  384. movdtox %f6, %o5 ! k[10, 11]
  385. `&ROTL128(15)`
  386. stx %o4, [$out + 0x20] ! k[ 8, 9]
  387. stx %o5, [$out + 0x28] ! k[10, 11]
  388. `&ROTL128(15)`
  389. stx %o4, [$out + 0x40] ! k[16, 17]
  390. stx %o5, [$out + 0x48] ! k[18, 19]
  391. `&ROTL128(30)`
  392. stx %o4, [$out + 0x90] ! k[36, 37]
  393. stx %o5, [$out + 0x98] ! k[38, 39]
  394. `&ROTL128(34)`
  395. stx %o4, [$out + 0xd0] ! k[52, 53]
  396. stx %o5, [$out + 0xd8] ! k[54, 55]
  397. ldx [$out + 0x30], %o4 ! k[12, 13]
  398. ldx [$out + 0x38], %o5 ! k[14, 15]
  399. `&ROTL128(15)`
  400. stx %o4, [$out + 0x30] ! k[12, 13]
  401. stx %o5, [$out + 0x38] ! k[14, 15]
  402. `&ROTL128(30)`
  403. stx %o4, [$out + 0x70] ! k[28, 29]
  404. stx %o5, [$out + 0x78] ! k[30, 31]
  405. srlx %o4, 32, %g4
  406. srlx %o5, 32, %g5
  407. st %o4, [$out + 0xc0] ! k[48]
  408. st %g5, [$out + 0xc4] ! k[49]
  409. st %o5, [$out + 0xc8] ! k[50]
  410. st %g4, [$out + 0xcc] ! k[51]
  411. `&ROTL128(49)`
  412. stx %o4, [$out + 0xe0] ! k[56, 57]
  413. stx %o5, [$out + 0xe8] ! k[58, 59]
  414. movdtox %f28, %o4 ! k[ 0, 1]
  415. movdtox %f30, %o5 ! k[ 2, 3]
  416. `&ROTL128(45)`
  417. stx %o4, [$out + 0x60] ! k[24, 25]
  418. stx %o5, [$out + 0x68] ! k[26, 27]
  419. `&ROTL128(15)`
  420. stx %o4, [$out + 0x80] ! k[32, 33]
  421. stx %o5, [$out + 0x88] ! k[34, 35]
  422. `&ROTL128(17)`
  423. stx %o4, [$out + 0xb0] ! k[44, 45]
  424. stx %o5, [$out + 0xb8] ! k[46, 47]
  425. `&ROTL128(34)`
  426. stx %o4, [$out + 0xf0] ! k[60, 61]
  427. stx %o5, [$out + 0xf8] ! k[62, 63]
  428. mov 4, $tmp
  429. st $tmp, [$out + 0x110]
  430. retl
  431. xor %o0, %o0, %o0
  432. .type cmll_t4_set_key,#function
  433. .size cmll_t4_set_key,.-cmll_t4_set_key
  434. .align 32
  435. SIGMA:
  436. .long 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2
  437. .long 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c
  438. .long 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd
  439. .type SIGMA,#object
  440. .size SIGMA,.-SIGMA
  441. .asciz "Camellia for SPARC T4, David S. Miller, Andy Polyakov"
  442. ___
  443. }
  444. {{{
  # Register aliases for this section: %i0..%i5 for the six names on the
  # first line, %l1..%l5 for the five names on the second (map() yields
  # %l1..%l7, the surplus is dropped).  In the assembly text visible in
  # this section only $key (%i3) is interpolated.
  # NOTE(review): the remaining names are presumably there for the mode
  # templates in sparcv9_modes.pl - confirm there.
  445. my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
  446. my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
  # _cmll128_load_enckey: the first 16 bytes of the schedule go to the
  # integer registers %g4/%g5 ...
  447. $code.=<<___;
  448. .align 32
  449. _cmll128_load_enckey:
  450. ldx [$key + 0], %g4
  451. ldx [$key + 8], %g5
  452. ___
  # ... and the doublewords at key+16 .. key+200 go to %f16 .. %f62 in
  # ascending order.  The backquoted arithmetic is stored in $code as text
  # and is not evaluated by this loop.
  453. for ($i=2; $i<26;$i++) { # load key schedule
  454. $code.=<<___;
  455. ldd [$key + `8*$i`], %f`12+2*$i`
  456. ___
  457. }
  # Return from the encryption-key loader; _cmll256_load_enckey is an
  # alias of the same code.  Then the two decryption-key entry points:
  # _cmll256_load_deckey loads key+64/key+72 into %f62/%f60, advances $key
  # by 64 in the branch delay slot and joins the common code at
  # .Load_deckey; _cmll128_load_deckey loads key+0/key+8 into %f60/%f62
  # and falls through to it.
  458. $code.=<<___;
  459. retl
  460. nop
  461. .type _cmll128_load_enckey,#function
  462. .size _cmll128_load_enckey,.-_cmll128_load_enckey
  463. _cmll256_load_enckey=_cmll128_load_enckey
  464. .align 32
  465. _cmll256_load_deckey:
  466. ldd [$key + 64], %f62
  467. ldd [$key + 72], %f60
  468. b .Load_deckey
  469. add $key, 64, $key
  470. _cmll128_load_deckey:
  471. ldd [$key + 0], %f60
  472. ldd [$key + 8], %f62
  473. .Load_deckey:
  474. ___
  # Common part of the decryption-key loaders: the doublewords at key+16 ..
  # key+184 go to %f58 .. %f16, i.e. into descending register numbers.
  475. for ($i=2; $i<24;$i++) { # load key schedule
  476. $code.=<<___;
  477. ldd [$key + `8*$i`], %f`62-2*$i`
  478. ___
  479. }
  # The loader ends by putting key+192/key+200 into %g4/%g5 (the second
  # load sits in the retl delay slot).  The same heredoc then opens
  # _cmll128_encrypt_1x; its body is generated by the code that follows.
  480. $code.=<<___;
  481. ldx [$key + 192], %g4
  482. retl
  483. ldx [$key + 200], %g5
  484. .type _cmll256_load_deckey,#function
  485. .size _cmll256_load_deckey,.-_cmll256_load_deckey
  486. .align 32
  487. _cmll128_encrypt_1x:
  488. ___
  # Body of _cmll128_encrypt_1x (its label is emitted just before this
  # loop): the block in %f0/%f2 is run against the key material held in
  # %f16 .. %f62.  Every iteration emits four camellia_f using
  # %f(16+16*i) .. %f(22+16*i) ...
  489. for ($i=0; $i<3; $i++) {
  490. $code.=<<___;
  491. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  492. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  493. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  494. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  495. ___
  # ... and iterations 0 and 1 additionally emit two more camellia_f
  # followed by camellia_fl/camellia_fli.  Iteration 2 skips this part
  # because the last two camellia_f are written out by hand below.
  496. $code.=<<___ if ($i<2);
  497. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  498. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  499. camellia_fl %f`16+16*$i+12`, %f0, %f0
  500. camellia_fli %f`16+16*$i+14`, %f2, %f2
  501. ___
  502. }
  # Closing part: two camellia_f with %f56/%f58 (going through %f4 as a
  # temporary) and the final fxor with %f60/%f62, the second one in the
  # retl delay slot.  _cmll128_decrypt_1x is defined as an alias of the
  # same code.  The heredoc then opens _cmll128_encrypt_2x, whose body is
  # generated by the code that follows.
  503. $code.=<<___;
  504. camellia_f %f56, %f2, %f0, %f4
  505. camellia_f %f58, %f0, %f4, %f2
  506. fxor %f60, %f4, %f0
  507. retl
  508. fxor %f62, %f2, %f2
  509. .type _cmll128_encrypt_1x,#function
  510. .size _cmll128_encrypt_1x,.-_cmll128_encrypt_1x
  511. _cmll128_decrypt_1x=_cmll128_encrypt_1x
  512. .align 32
  513. _cmll128_encrypt_2x:
  514. ___
  # Body of _cmll128_encrypt_2x (its label is emitted just before this
  # loop).  Same instruction sequence as the 1x variant, but every
  # instruction is issued twice with the same key register: once for the
  # block in %f0/%f2 and once for the block in %f4/%f6.
  515. for ($i=0; $i<3; $i++) {
  516. $code.=<<___;
  517. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  518. camellia_f %f`16+16*$i+0`, %f6, %f4, %f6
  519. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  520. camellia_f %f`16+16*$i+2`, %f4, %f6, %f4
  521. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  522. camellia_f %f`16+16*$i+4`, %f6, %f4, %f6
  523. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  524. camellia_f %f`16+16*$i+6`, %f4, %f6, %f4
  525. ___
  # Only iterations 0 and 1 emit the trailing camellia_f pair and the
  # camellia_fl/camellia_fli instructions.
  526. $code.=<<___ if ($i<2);
  527. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  528. camellia_f %f`16+16*$i+8`, %f6, %f4, %f6
  529. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  530. camellia_f %f`16+16*$i+10`, %f4, %f6, %f4
  531. camellia_fl %f`16+16*$i+12`, %f0, %f0
  532. camellia_fl %f`16+16*$i+12`, %f4, %f4
  533. camellia_fli %f`16+16*$i+14`, %f2, %f2
  534. camellia_fli %f`16+16*$i+14`, %f6, %f6
  535. ___
  536. }
  # Closing part of _cmll128_encrypt_2x: last two camellia_f per block
  # (temporaries %f8 and %f10) and the final fxor with %f60/%f62;
  # _cmll128_decrypt_2x is an alias of the same code.
  #
  # The heredoc continues with the start of _cmll256_encrypt_1x.  Its
  # first group of six camellia_f plus camellia_fl/camellia_fli uses
  # %f16 .. %f30, and each register pair is reloaded from key+208 ..
  # key+264 right after it has been used.  Those offsets lie just past the
  # key+16 .. key+200 range that _cmll128_load_enckey covers.
  537. $code.=<<___;
  538. camellia_f %f56, %f2, %f0, %f8
  539. camellia_f %f56, %f6, %f4, %f10
  540. camellia_f %f58, %f0, %f8, %f2
  541. camellia_f %f58, %f4, %f10, %f6
  542. fxor %f60, %f8, %f0
  543. fxor %f60, %f10, %f4
  544. fxor %f62, %f2, %f2
  545. retl
  546. fxor %f62, %f6, %f6
  547. .type _cmll128_encrypt_2x,#function
  548. .size _cmll128_encrypt_2x,.-_cmll128_encrypt_2x
  549. _cmll128_decrypt_2x=_cmll128_encrypt_2x
  550. .align 32
  551. _cmll256_encrypt_1x:
  552. camellia_f %f16, %f2, %f0, %f2
  553. camellia_f %f18, %f0, %f2, %f0
  554. ldd [$key + 208], %f16
  555. ldd [$key + 216], %f18
  556. camellia_f %f20, %f2, %f0, %f2
  557. camellia_f %f22, %f0, %f2, %f0
  558. ldd [$key + 224], %f20
  559. ldd [$key + 232], %f22
  560. camellia_f %f24, %f2, %f0, %f2
  561. camellia_f %f26, %f0, %f2, %f0
  562. ldd [$key + 240], %f24
  563. ldd [$key + 248], %f26
  564. camellia_fl %f28, %f0, %f0
  565. camellia_fli %f30, %f2, %f2
  566. ldd [$key + 256], %f28
  567. ldd [$key + 264], %f30
  568. ___
  # Middle of _cmll256_encrypt_1x (label and first instruction group are
  # in the heredoc just before this loop): two complete groups of six
  # camellia_f plus camellia_fl/camellia_fli, using %f32 .. %f46 for i=1
  # and %f48 .. %f62 for i=2.
  569. for ($i=1; $i<3; $i++) {
  570. $code.=<<___;
  571. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  572. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  573. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  574. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  575. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  576. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  577. camellia_fl %f`16+16*$i+12`, %f0, %f0
  578. camellia_fli %f`16+16*$i+14`, %f2, %f2
  579. ___
  580. }
  # End of _cmll256_encrypt_1x: the last six camellia_f and the final fxor
  # use %f16 .. %f30, which by now hold the words fetched from key+208 ..
  # key+264.  After use each pair is reloaded from key+16 .. key+72, the
  # same offset-to-register assignment _cmll128_load_enckey makes, so the
  # registers leave the routine as the loader set them up.
  #
  # The heredoc then opens _cmll256_encrypt_2x and emits its first
  # instruction group: the same sequence, each instruction issued once for
  # %f0/%f2 and once for %f4/%f6.
  581. $code.=<<___;
  582. camellia_f %f16, %f2, %f0, %f2
  583. camellia_f %f18, %f0, %f2, %f0
  584. ldd [$key + 16], %f16
  585. ldd [$key + 24], %f18
  586. camellia_f %f20, %f2, %f0, %f2
  587. camellia_f %f22, %f0, %f2, %f0
  588. ldd [$key + 32], %f20
  589. ldd [$key + 40], %f22
  590. camellia_f %f24, %f2, %f0, %f4
  591. camellia_f %f26, %f0, %f4, %f2
  592. ldd [$key + 48], %f24
  593. ldd [$key + 56], %f26
  594. fxor %f28, %f4, %f0
  595. fxor %f30, %f2, %f2
  596. ldd [$key + 64], %f28
  597. retl
  598. ldd [$key + 72], %f30
  599. .type _cmll256_encrypt_1x,#function
  600. .size _cmll256_encrypt_1x,.-_cmll256_encrypt_1x
  601. .align 32
  602. _cmll256_encrypt_2x:
  603. camellia_f %f16, %f2, %f0, %f2
  604. camellia_f %f16, %f6, %f4, %f6
  605. camellia_f %f18, %f0, %f2, %f0
  606. camellia_f %f18, %f4, %f6, %f4
  607. ldd [$key + 208], %f16
  608. ldd [$key + 216], %f18
  609. camellia_f %f20, %f2, %f0, %f2
  610. camellia_f %f20, %f6, %f4, %f6
  611. camellia_f %f22, %f0, %f2, %f0
  612. camellia_f %f22, %f4, %f6, %f4
  613. ldd [$key + 224], %f20
  614. ldd [$key + 232], %f22
  615. camellia_f %f24, %f2, %f0, %f2
  616. camellia_f %f24, %f6, %f4, %f6
  617. camellia_f %f26, %f0, %f2, %f0
  618. camellia_f %f26, %f4, %f6, %f4
  619. ldd [$key + 240], %f24
  620. ldd [$key + 248], %f26
  621. camellia_fl %f28, %f0, %f0
  622. camellia_fl %f28, %f4, %f4
  623. camellia_fli %f30, %f2, %f2
  624. camellia_fli %f30, %f6, %f6
  625. ldd [$key + 256], %f28
  626. ldd [$key + 264], %f30
  627. ___
  # Middle of _cmll256_encrypt_2x (label and first instruction group are
  # in the heredoc just before this loop): two complete groups using
  # %f32 .. %f46 (i=1) and %f48 .. %f62 (i=2), every instruction issued
  # once for the block in %f0/%f2 and once for the block in %f4/%f6.
  628. for ($i=1; $i<3; $i++) {
  629. $code.=<<___;
  630. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  631. camellia_f %f`16+16*$i+0`, %f6, %f4, %f6
  632. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  633. camellia_f %f`16+16*$i+2`, %f4, %f6, %f4
  634. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  635. camellia_f %f`16+16*$i+4`, %f6, %f4, %f6
  636. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  637. camellia_f %f`16+16*$i+6`, %f4, %f6, %f4
  638. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  639. camellia_f %f`16+16*$i+8`, %f6, %f4, %f6
  640. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  641. camellia_f %f`16+16*$i+10`, %f4, %f6, %f4
  642. camellia_fl %f`16+16*$i+12`, %f0, %f0
  643. camellia_fl %f`16+16*$i+12`, %f4, %f4
  644. camellia_fli %f`16+16*$i+14`, %f2, %f2
  645. camellia_fli %f`16+16*$i+14`, %f6, %f6
  646. ___
  647. }
  # End of _cmll256_encrypt_2x: last six camellia_f per block (temporaries
  # %f8/%f10) and the final fxor with %f28/%f30, with %f16 .. %f30 being
  # reloaded from key+16 .. key+72 as they are used up.
  #
  # The heredoc then opens _cmll256_decrypt_1x and emits its first
  # instruction group.  Here %f16 .. %f30 are reloaded from key-8 ..
  # key-64, i.e. from below $key, in descending address order.
  648. $code.=<<___;
  649. camellia_f %f16, %f2, %f0, %f2
  650. camellia_f %f16, %f6, %f4, %f6
  651. camellia_f %f18, %f0, %f2, %f0
  652. camellia_f %f18, %f4, %f6, %f4
  653. ldd [$key + 16], %f16
  654. ldd [$key + 24], %f18
  655. camellia_f %f20, %f2, %f0, %f2
  656. camellia_f %f20, %f6, %f4, %f6
  657. camellia_f %f22, %f0, %f2, %f0
  658. camellia_f %f22, %f4, %f6, %f4
  659. ldd [$key + 32], %f20
  660. ldd [$key + 40], %f22
  661. camellia_f %f24, %f2, %f0, %f8
  662. camellia_f %f24, %f6, %f4, %f10
  663. camellia_f %f26, %f0, %f8, %f2
  664. camellia_f %f26, %f4, %f10, %f6
  665. ldd [$key + 48], %f24
  666. ldd [$key + 56], %f26
  667. fxor %f28, %f8, %f0
  668. fxor %f28, %f10, %f4
  669. fxor %f30, %f2, %f2
  670. fxor %f30, %f6, %f6
  671. ldd [$key + 64], %f28
  672. retl
  673. ldd [$key + 72], %f30
  674. .type _cmll256_encrypt_2x,#function
  675. .size _cmll256_encrypt_2x,.-_cmll256_encrypt_2x
  676. .align 32
  677. _cmll256_decrypt_1x:
  678. camellia_f %f16, %f2, %f0, %f2
  679. camellia_f %f18, %f0, %f2, %f0
  680. ldd [$key - 8], %f16
  681. ldd [$key - 16], %f18
  682. camellia_f %f20, %f2, %f0, %f2
  683. camellia_f %f22, %f0, %f2, %f0
  684. ldd [$key - 24], %f20
  685. ldd [$key - 32], %f22
  686. camellia_f %f24, %f2, %f0, %f2
  687. camellia_f %f26, %f0, %f2, %f0
  688. ldd [$key - 40], %f24
  689. ldd [$key - 48], %f26
  690. camellia_fl %f28, %f0, %f0
  691. camellia_fli %f30, %f2, %f2
  692. ldd [$key - 56], %f28
  693. ldd [$key - 64], %f30
  694. ___
  # Middle of _cmll256_decrypt_1x (label and first instruction group are
  # in the heredoc just before this loop): two complete groups of six
  # camellia_f plus camellia_fl/camellia_fli, using %f32 .. %f46 for i=1
  # and %f48 .. %f62 for i=2.
  695. for ($i=1; $i<3; $i++) {
  696. $code.=<<___;
  697. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  698. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  699. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  700. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  701. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  702. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  703. camellia_fl %f`16+16*$i+12`, %f0, %f0
  704. camellia_fli %f`16+16*$i+14`, %f2, %f2
  705. ___
  706. }
  # End of _cmll256_decrypt_1x: the last six camellia_f use %f16 .. %f26
  # (by now holding the words fetched from key-8 .. key-48), and the final
  # fxor uses %f30 for the first half and %f28 for the second - the
  # opposite register order from the encrypt routines.  After use the
  # registers are reloaded from key+184 .. key+128, which is the
  # assignment the decryption-key loader makes for %f16 .. %f30.
  #
  # The heredoc then opens _cmll256_decrypt_2x and emits its first
  # instruction group, each instruction issued once for %f0/%f2 and once
  # for %f4/%f6.
  707. $code.=<<___;
  708. camellia_f %f16, %f2, %f0, %f2
  709. camellia_f %f18, %f0, %f2, %f0
  710. ldd [$key + 184], %f16
  711. ldd [$key + 176], %f18
  712. camellia_f %f20, %f2, %f0, %f2
  713. camellia_f %f22, %f0, %f2, %f0
  714. ldd [$key + 168], %f20
  715. ldd [$key + 160], %f22
  716. camellia_f %f24, %f2, %f0, %f4
  717. camellia_f %f26, %f0, %f4, %f2
  718. ldd [$key + 152], %f24
  719. ldd [$key + 144], %f26
  720. fxor %f30, %f4, %f0
  721. fxor %f28, %f2, %f2
  722. ldd [$key + 136], %f28
  723. retl
  724. ldd [$key + 128], %f30
  725. .type _cmll256_decrypt_1x,#function
  726. .size _cmll256_decrypt_1x,.-_cmll256_decrypt_1x
  727. .align 32
  728. _cmll256_decrypt_2x:
  729. camellia_f %f16, %f2, %f0, %f2
  730. camellia_f %f16, %f6, %f4, %f6
  731. camellia_f %f18, %f0, %f2, %f0
  732. camellia_f %f18, %f4, %f6, %f4
  733. ldd [$key - 8], %f16
  734. ldd [$key - 16], %f18
  735. camellia_f %f20, %f2, %f0, %f2
  736. camellia_f %f20, %f6, %f4, %f6
  737. camellia_f %f22, %f0, %f2, %f0
  738. camellia_f %f22, %f4, %f6, %f4
  739. ldd [$key - 24], %f20
  740. ldd [$key - 32], %f22
  741. camellia_f %f24, %f2, %f0, %f2
  742. camellia_f %f24, %f6, %f4, %f6
  743. camellia_f %f26, %f0, %f2, %f0
  744. camellia_f %f26, %f4, %f6, %f4
  745. ldd [$key - 40], %f24
  746. ldd [$key - 48], %f26
  747. camellia_fl %f28, %f0, %f0
  748. camellia_fl %f28, %f4, %f4
  749. camellia_fli %f30, %f2, %f2
  750. camellia_fli %f30, %f6, %f6
  751. ldd [$key - 56], %f28
  752. ldd [$key - 64], %f30
  753. ___
  # Middle of _cmll256_decrypt_2x (label and first instruction group are
  # in the heredoc just before this loop): two complete groups using
  # %f32 .. %f46 (i=1) and %f48 .. %f62 (i=2), every instruction issued
  # once for the block in %f0/%f2 and once for the block in %f4/%f6.
  754. for ($i=1; $i<3; $i++) {
  755. $code.=<<___;
  756. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  757. camellia_f %f`16+16*$i+0`, %f6, %f4, %f6
  758. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  759. camellia_f %f`16+16*$i+2`, %f4, %f6, %f4
  760. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  761. camellia_f %f`16+16*$i+4`, %f6, %f4, %f6
  762. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  763. camellia_f %f`16+16*$i+6`, %f4, %f6, %f4
  764. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  765. camellia_f %f`16+16*$i+8`, %f6, %f4, %f6
  766. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  767. camellia_f %f`16+16*$i+10`, %f4, %f6, %f4
  768. camellia_fl %f`16+16*$i+12`, %f0, %f0
  769. camellia_fl %f`16+16*$i+12`, %f4, %f4
  770. camellia_fli %f`16+16*$i+14`, %f2, %f2
  771. camellia_fli %f`16+16*$i+14`, %f6, %f6
  772. ___
  773. }
  # End of _cmll256_decrypt_2x: last six camellia_f per block (temporaries
  # %f8/%f10) and the final fxor, which uses %f30 for the first half of
  # each block and %f28 for the second.  %f16 .. %f30 are reloaded from
  # key+184 .. key+128 as they are used up, the last load sitting in the
  # retl delay slot.
  774. $code.=<<___;
  775. camellia_f %f16, %f2, %f0, %f2
  776. camellia_f %f16, %f6, %f4, %f6
  777. camellia_f %f18, %f0, %f2, %f0
  778. camellia_f %f18, %f4, %f6, %f4
  779. ldd [$key + 184], %f16
  780. ldd [$key + 176], %f18
  781. camellia_f %f20, %f2, %f0, %f2
  782. camellia_f %f20, %f6, %f4, %f6
  783. camellia_f %f22, %f0, %f2, %f0
  784. camellia_f %f22, %f4, %f6, %f4
  785. ldd [$key + 168], %f20
  786. ldd [$key + 160], %f22
  787. camellia_f %f24, %f2, %f0, %f8
  788. camellia_f %f24, %f6, %f4, %f10
  789. camellia_f %f26, %f0, %f8, %f2
  790. camellia_f %f26, %f4, %f10, %f6
  791. ldd [$key + 152], %f24
  792. ldd [$key + 144], %f26
  793. fxor %f30, %f8, %f0
  794. fxor %f30, %f10, %f4
  795. fxor %f28, %f2, %f2
  796. fxor %f28, %f6, %f6
  797. ldd [$key + 136], %f28
  798. retl
  799. ldd [$key + 128], %f30
  800. .type _cmll256_decrypt_2x,#function
  801. .size _cmll256_decrypt_2x,.-_cmll256_decrypt_2x
  802. ___
  # Instantiate the shared mode templates for the "cmll" prefix.  The
  # call order is kept as it was: CBC encrypt for 128 then 256, CBC
  # decrypt for 128 then 256, and CTR32 for both only when the EVP flavour
  # is being generated.
  for my $bits (128, 256) {
      alg_cbc_encrypt_implement("cmll", $bits);
  }
  for my $bits (128, 256) {
      alg_cbc_decrypt_implement("cmll", $bits);
  }
  if ($::evp) {
      for my $bits (128, 256) {
          alg_ctr32_implement("cmll", $bits);
      }
  }
  811. }}}
  # Everything in this branch is generated only when $::evp is false.  It
  # adds the entry points named in the comment next to the $::evp setting
  # (Camellia_encrypt, Camellia_decrypt, Camellia_set_key and
  # Camellia_cbc_encrypt).
  812. if (!$::evp) {
  # Camellia_encrypt/Camellia_decrypt are plain symbol aliases of
  # cmll_t4_encrypt/cmll_t4_decrypt.
  #
  # Camellia_set_key validates its arguments and then branches to
  # cmll_t4_set_key with the registers untouched:
  #   %o2 not a multiple of 8, %o0 zero or %o2 zero  -> returns -1
  #   %o1 has a bit set outside the mask 0x1c0,
  #   or %o1 is below 128                            -> returns -2
  # The branches are annulled (",a"), so the "mov" in each delay slot only
  # executes when the branch to the common "1: retl" exit is taken.
  813. $code.=<<___;
  814. .global Camellia_encrypt
  815. Camellia_encrypt=cmll_t4_encrypt
  816. .global Camellia_decrypt
  817. Camellia_decrypt=cmll_t4_decrypt
  818. .global Camellia_set_key
  819. .align 32
  820. Camellia_set_key:
  821. andcc %o2, 7, %g0 ! double-check alignment
  822. bnz,a,pn %icc, 1f
  823. mov -1, %o0
  824. brz,a,pn %o0, 1f
  825. mov -1, %o0
  826. brz,a,pn %o2, 1f
  827. mov -1, %o0
  828. andncc %o1, 0x1c0, %g0
  829. bnz,a,pn %icc, 1f
  830. mov -2, %o0
  831. cmp %o1, 128
  832. bl,a,pn %icc, 1f
  833. mov -2, %o0
  834. b cmll_t4_set_key
  835. nop
  836. 1: retl
  837. nop
  838. .type Camellia_set_key,#function
  839. .size Camellia_set_key,.-Camellia_set_key
  840. ___
  # Register aliases for the dispatcher below; only $key (%o3) and $enc
  # (%o5) are interpolated.
  841. my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
  # Camellia_cbc_encrypt is a dispatcher: it reads the word at key+272
  # into %g1 and compares it with 3 (the compare sits in the delay slot of
  # "brz $enc", so its flags are valid on both paths).  A zero $enc
  # selects the *_cbc_decrypt targets, anything else the *_cbc_encrypt
  # ones; a count of 3 selects the cmll128 variant, any other value the
  # cmll256 variant.
  842. $code.=<<___;
  843. .globl Camellia_cbc_encrypt
  844. .align 32
  845. Camellia_cbc_encrypt:
  846. ld [$key + 272], %g1
  847. nop
  848. brz $enc, .Lcbc_decrypt
  849. cmp %g1, 3
  850. be,pt %icc, cmll128_t4_cbc_encrypt
  851. nop
  852. ba cmll256_t4_cbc_encrypt
  853. nop
  854. .Lcbc_decrypt:
  855. be,pt %icc, cmll128_t4_cbc_decrypt
  856. nop
  857. ba cmll256_t4_cbc_decrypt
  858. nop
  859. .type Camellia_cbc_encrypt,#function
  860. .size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
  861. ___
  862. }
  # Run the shared emitter, then close STDOUT explicitly so that an error
  # reported at close time is not lost.
  emit_assembler();
  close(STDOUT) || die "error closing STDOUT: $!";