
bsaes-armv7.pl 63 KiB

Remove inconsistency in ARM support.

This facilitates "universal" builds, ones that target multiple architectures, e.g. ARMv5 through ARMv7. (Imported from upstream's c1669e1c205dc8e695fb0c10a655f434e758b9f7)

This is a change from a while ago which was a source of divergence between our perlasm and upstream's. This change in upstream came with the following comment in Configure:

Note that -march is not among compiler options in below linux-armv4 target line. Not specifying one is intentional to give you choice to:

a) rely on your compiler default by not specifying one;
b) specify your target platform explicitly for optimal performance, e.g. -march=armv6 or -march=armv7-a;
c) build "universal" binary that targets *range* of platforms by specifying minimum and maximum supported architecture.

As for c) option. It actually makes no sense to specify maximum to be less than ARMv7, because it's the least requirement for run-time switch between platform-specific code paths. And without run-time switch performance would be equivalent to one for minimum. Secondly, there are some natural limitations that you'd have to accept and respect. Most notably you can *not* build "universal" binary for big-endian platform. This is because ARMv7 processor always picks instructions in little-endian order. Another similar limitation is that -mthumb can't "cross" -march=armv6t2 boundary, because that's where it became Thumb-2. Well, this limitation is a bit artificial, because it's not really impossible, but it's deemed too tricky to support. And of course you have to be sure that your binutils are actually up to the task of handling maximum target platform.

Change-Id: Ie5f674d603393f0a1354a0d0973987484a4a650c
Reviewed-on: https://boringssl-review.googlesource.com/4488
Reviewed-by: Adam Langley <agl@google.com>
9 years ago
  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. #
  8. # Specific modes and adaptation for Linux kernel by Ard Biesheuvel
  9. # <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
  10. # granted.
  11. # ====================================================================
  12. # Bit-sliced AES for ARM NEON
  13. #
  14. # February 2012.
  15. #
  16. # This implementation is direct adaptation of bsaes-x86_64 module for
  17. # ARM NEON. Except that this module is endian-neutral [in sense that
  18. # it can be compiled for either endianness] by courtesy of vld1.8's
  19. # neutrality. Initial version doesn't implement interface to OpenSSL,
  20. # only low-level primitives and unsupported entry points, just enough
  21. # to collect performance results, which for Cortex-A8 core are:
  22. #
  23. # encrypt 19.5 cycles per byte processed with 128-bit key
  24. # decrypt 22.1 cycles per byte processed with 128-bit key
  25. # key conv. 440 cycles per 128-bit key/0.18 of 8x block
  26. #
  27. # Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
  28. # which is [much] worse than anticipated (for further details see
  29. # http://www.openssl.org/~appro/Snapdragon-S4.html).
  30. #
  31. # Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
  32. # manages in 20.0 cycles].
  33. #
  34. # When comparing to x86_64 results keep in mind that NEON unit is
  35. # [mostly] single-issue and thus can't [fully] benefit from
  36. # instruction-level parallelism. And when comparing to aes-armv4
  37. # results keep in mind key schedule conversion overhead (see
  38. # bsaes-x86_64.pl for further details)...
  39. #
  40. # <appro@openssl.org>
  41. # April-August 2013
  42. #
  43. # Add CBC, CTR and XTS subroutines, adapt for kernel use.
  44. #
  45. # <ard.biesheuvel@linaro.org>
  46. $flavour = shift;
  47. if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
  48. else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
  49. if ($flavour && $flavour ne "void") {
  50. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  51. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  52. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  53. die "can't locate arm-xlate.pl";
  54. open STDOUT,"| \"$^X\" $xlate $flavour $output";
  55. } else {
  56. open STDOUT,">$output";
  57. }
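# A hedged note on invocation: like the other perlasm modules, this script is
# normally driven through arm-xlate.pl by passing a flavour and an output
# file, e.g. (illustrative command lines, not taken from any build file):
#
#	perl bsaes-armv7.pl linux32 bsaes-armv7.S	# ELF/GNU-as syntax
#	perl bsaes-armv7.pl ios32   bsaes-armv7.S	# Apple/clang syntax
#
# With no flavour (or the special "void" flavour) the raw perlasm output goes
# straight to the named file.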
  58. my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
  59. my @XMM=map("q$_",(0..15));
  60. {
  61. my ($key,$rounds,$const)=("r4","r5","r6");
  62. sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
  63. sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
  64. sub Sbox {
  65. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  66. # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
  67. my @b=@_[0..7];
  68. my @t=@_[8..11];
  69. my @s=@_[12..15];
  70. &InBasisChange (@b);
  71. &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
  72. &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
  73. }
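# Roughly speaking, Sbox evaluates the AES S-box on all eight bit-sliced lanes
# at once: InBasisChange moves into the composite-field basis, Inv_GF256 does
# the GF(2^8) inversion there, and OutBasisChange maps back while also taking
# care of the linear part of the affine transform.  The shuffled lane orders
# in the calls are just the wiring of those basis changes.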
  74. sub InBasisChange {
  75. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  76. # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
  77. my @b=@_[0..7];
  78. $code.=<<___;
  79. veor @b[2], @b[2], @b[1]
  80. veor @b[5], @b[5], @b[6]
  81. veor @b[3], @b[3], @b[0]
  82. veor @b[6], @b[6], @b[2]
  83. veor @b[5], @b[5], @b[0]
  84. veor @b[6], @b[6], @b[3]
  85. veor @b[3], @b[3], @b[7]
  86. veor @b[7], @b[7], @b[5]
  87. veor @b[3], @b[3], @b[4]
  88. veor @b[4], @b[4], @b[5]
  89. veor @b[2], @b[2], @b[7]
  90. veor @b[3], @b[3], @b[1]
  91. veor @b[1], @b[1], @b[5]
  92. ___
  93. }
  94. sub OutBasisChange {
  95. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  96. # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
  97. my @b=@_[0..7];
  98. $code.=<<___;
  99. veor @b[0], @b[0], @b[6]
  100. veor @b[1], @b[1], @b[4]
  101. veor @b[4], @b[4], @b[6]
  102. veor @b[2], @b[2], @b[0]
  103. veor @b[6], @b[6], @b[1]
  104. veor @b[1], @b[1], @b[5]
  105. veor @b[5], @b[5], @b[3]
  106. veor @b[3], @b[3], @b[7]
  107. veor @b[7], @b[7], @b[5]
  108. veor @b[2], @b[2], @b[5]
  109. veor @b[4], @b[4], @b[7]
  110. ___
  111. }
  112. sub InvSbox {
  113. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  114. # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
  115. my @b=@_[0..7];
  116. my @t=@_[8..11];
  117. my @s=@_[12..15];
  118. &InvInBasisChange (@b);
  119. &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
  120. &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
  121. }
  122. sub InvInBasisChange { # OutBasisChange in reverse (with twist)
  123. my @b=@_[5,1,2,6,3,7,0,4];
  124. $code.=<<___
  125. veor @b[1], @b[1], @b[7]
  126. veor @b[4], @b[4], @b[7]
  127. veor @b[7], @b[7], @b[5]
  128. veor @b[1], @b[1], @b[3]
  129. veor @b[2], @b[2], @b[5]
  130. veor @b[3], @b[3], @b[7]
  131. veor @b[6], @b[6], @b[1]
  132. veor @b[2], @b[2], @b[0]
  133. veor @b[5], @b[5], @b[3]
  134. veor @b[4], @b[4], @b[6]
  135. veor @b[0], @b[0], @b[6]
  136. veor @b[1], @b[1], @b[4]
  137. ___
  138. }
  139. sub InvOutBasisChange { # InBasisChange in reverse
  140. my @b=@_[2,5,7,3,6,1,0,4];
  141. $code.=<<___;
  142. veor @b[1], @b[1], @b[5]
  143. veor @b[2], @b[2], @b[7]
  144. veor @b[3], @b[3], @b[1]
  145. veor @b[4], @b[4], @b[5]
  146. veor @b[7], @b[7], @b[5]
  147. veor @b[3], @b[3], @b[4]
  148. veor @b[5], @b[5], @b[0]
  149. veor @b[3], @b[3], @b[7]
  150. veor @b[6], @b[6], @b[2]
  151. veor @b[2], @b[2], @b[1]
  152. veor @b[6], @b[6], @b[3]
  153. veor @b[3], @b[3], @b[0]
  154. veor @b[5], @b[5], @b[6]
  155. ___
  156. }
  157. sub Mul_GF4 {
  158. #;*************************************************************
  159. #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
  160. #;*************************************************************
  161. my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
  162. $code.=<<___;
  163. veor $t0, $y0, $y1
  164. vand $t0, $t0, $x0
  165. veor $x0, $x0, $x1
  166. vand $t1, $x1, $y0
  167. vand $x0, $x0, $y1
  168. veor $x1, $t1, $t0
  169. veor $x0, $x0, $t1
  170. ___
  171. }
  172. sub Mul_GF4_N { # not used, see next subroutine
  173. # multiply and scale by N
  174. my ($x0,$x1,$y0,$y1,$t0)=@_;
  175. $code.=<<___;
  176. veor $t0, $y0, $y1
  177. vand $t0, $t0, $x0
  178. veor $x0, $x0, $x1
  179. vand $x1, $x1, $y0
  180. vand $x0, $x0, $y1
  181. veor $x1, $x1, $x0
  182. veor $x0, $x0, $t0
  183. ___
  184. }
  185. sub Mul_GF4_N_GF4 {
  186. # interleaved Mul_GF4_N and Mul_GF4
  187. my ($x0,$x1,$y0,$y1,$t0,
  188. $x2,$x3,$y2,$y3,$t1)=@_;
  189. $code.=<<___;
  190. veor $t0, $y0, $y1
  191. veor $t1, $y2, $y3
  192. vand $t0, $t0, $x0
  193. vand $t1, $t1, $x2
  194. veor $x0, $x0, $x1
  195. veor $x2, $x2, $x3
  196. vand $x1, $x1, $y0
  197. vand $x3, $x3, $y2
  198. vand $x0, $x0, $y1
  199. vand $x2, $x2, $y3
  200. veor $x1, $x1, $x0
  201. veor $x2, $x2, $x3
  202. veor $x0, $x0, $t0
  203. veor $x3, $x3, $t1
  204. ___
  205. }
  206. sub Mul_GF16_2 {
  207. my @x=@_[0..7];
  208. my @y=@_[8..11];
  209. my @t=@_[12..15];
  210. $code.=<<___;
  211. veor @t[0], @x[0], @x[2]
  212. veor @t[1], @x[1], @x[3]
  213. ___
  214. &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
  215. $code.=<<___;
  216. veor @y[0], @y[0], @y[2]
  217. veor @y[1], @y[1], @y[3]
  218. ___
  219. Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
  220. @x[2], @x[3], @y[2], @y[3], @t[2]);
  221. $code.=<<___;
  222. veor @x[0], @x[0], @t[0]
  223. veor @x[2], @x[2], @t[0]
  224. veor @x[1], @x[1], @t[1]
  225. veor @x[3], @x[3], @t[1]
  226. veor @t[0], @x[4], @x[6]
  227. veor @t[1], @x[5], @x[7]
  228. ___
  229. &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
  230. @x[6], @x[7], @y[2], @y[3], @t[2]);
  231. $code.=<<___;
  232. veor @y[0], @y[0], @y[2]
  233. veor @y[1], @y[1], @y[3]
  234. ___
  235. &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
  236. $code.=<<___;
  237. veor @x[4], @x[4], @t[0]
  238. veor @x[6], @x[6], @t[0]
  239. veor @x[5], @x[5], @t[1]
  240. veor @x[7], @x[7], @t[1]
  241. ___
  242. }
  243. sub Inv_GF256 {
  244. #;********************************************************************
  245. #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
  246. #;********************************************************************
  247. my @x=@_[0..7];
  248. my @t=@_[8..11];
  249. my @s=@_[12..15];
  250. # direct optimizations from hardware
  251. $code.=<<___;
  252. veor @t[3], @x[4], @x[6]
  253. veor @t[2], @x[5], @x[7]
  254. veor @t[1], @x[1], @x[3]
  255. veor @s[1], @x[7], @x[6]
  256. vmov @t[0], @t[2]
  257. veor @s[0], @x[0], @x[2]
  258. vorr @t[2], @t[2], @t[1]
  259. veor @s[3], @t[3], @t[0]
  260. vand @s[2], @t[3], @s[0]
  261. vorr @t[3], @t[3], @s[0]
  262. veor @s[0], @s[0], @t[1]
  263. vand @t[0], @t[0], @t[1]
  264. veor @t[1], @x[3], @x[2]
  265. vand @s[3], @s[3], @s[0]
  266. vand @s[1], @s[1], @t[1]
  267. veor @t[1], @x[4], @x[5]
  268. veor @s[0], @x[1], @x[0]
  269. veor @t[3], @t[3], @s[1]
  270. veor @t[2], @t[2], @s[1]
  271. vand @s[1], @t[1], @s[0]
  272. vorr @t[1], @t[1], @s[0]
  273. veor @t[3], @t[3], @s[3]
  274. veor @t[0], @t[0], @s[1]
  275. veor @t[2], @t[2], @s[2]
  276. veor @t[1], @t[1], @s[3]
  277. veor @t[0], @t[0], @s[2]
  278. vand @s[0], @x[7], @x[3]
  279. veor @t[1], @t[1], @s[2]
  280. vand @s[1], @x[6], @x[2]
  281. vand @s[2], @x[5], @x[1]
  282. vorr @s[3], @x[4], @x[0]
  283. veor @t[3], @t[3], @s[0]
  284. veor @t[1], @t[1], @s[2]
  285. veor @t[0], @t[0], @s[3]
  286. veor @t[2], @t[2], @s[1]
  287. @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
  288. @ new smaller inversion
  289. vand @s[2], @t[3], @t[1]
  290. vmov @s[0], @t[0]
  291. veor @s[1], @t[2], @s[2]
  292. veor @s[3], @t[0], @s[2]
  293. veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
  294. vbsl @s[1], @t[1], @t[0]
  295. vbsl @s[3], @t[3], @t[2]
  296. veor @t[3], @t[3], @t[2]
  297. vbsl @s[0], @s[1], @s[2]
  298. vbsl @t[0], @s[2], @s[1]
  299. vand @s[2], @s[0], @s[3]
  300. veor @t[1], @t[1], @t[0]
  301. veor @s[2], @s[2], @t[3]
  302. ___
  303. # output in s3, s2, s1, t1
  304. # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
  305. # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
  306. &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
  307. ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
  308. }
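# The inversion follows the usual tower-field route, GF(2) -> GF(2^2) ->
# GF(2^4) -> GF(2^8): Mul_GF4 and Mul_GF16_2 supply the sub-field
# multiplications, and the vbsl-based block above stands in for the small
# GF(2^4) inversion (the Inv_GF16 step noted in the comments), so the whole
# S-box needs no table lookups and runs in constant time.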
  309. # AES linear components
  310. sub ShiftRows {
  311. my @x=@_[0..7];
  312. my @t=@_[8..11];
  313. my $mask=pop;
  314. $code.=<<___;
  315. vldmia $key!, {@t[0]-@t[3]}
  316. veor @t[0], @t[0], @x[0]
  317. veor @t[1], @t[1], @x[1]
  318. vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
  319. vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
  320. vldmia $key!, {@t[0]}
  321. veor @t[2], @t[2], @x[2]
  322. vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
  323. vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
  324. vldmia $key!, {@t[1]}
  325. veor @t[3], @t[3], @x[3]
  326. vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
  327. vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
  328. vldmia $key!, {@t[2]}
  329. vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
  330. vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
  331. vldmia $key!, {@t[3]}
  332. veor @t[0], @t[0], @x[4]
  333. veor @t[1], @t[1], @x[5]
  334. vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
  335. vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
  336. veor @t[2], @t[2], @x[6]
  337. vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
  338. vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
  339. veor @t[3], @t[3], @x[7]
  340. vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
  341. vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
  342. vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
  343. vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
  344. ___
  345. }
  346. sub MixColumns {
  347. # modified to emit output in order suitable for feeding back to aesenc[last]
  348. my @x=@_[0..7];
  349. my @t=@_[8..15];
  350. my $inv=@_[16]; # optional
  351. $code.=<<___;
  352. vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
  353. vext.8 @t[1], @x[1], @x[1], #12
  354. veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
  355. vext.8 @t[2], @x[2], @x[2], #12
  356. veor @x[1], @x[1], @t[1]
  357. vext.8 @t[3], @x[3], @x[3], #12
  358. veor @x[2], @x[2], @t[2]
  359. vext.8 @t[4], @x[4], @x[4], #12
  360. veor @x[3], @x[3], @t[3]
  361. vext.8 @t[5], @x[5], @x[5], #12
  362. veor @x[4], @x[4], @t[4]
  363. vext.8 @t[6], @x[6], @x[6], #12
  364. veor @x[5], @x[5], @t[5]
  365. vext.8 @t[7], @x[7], @x[7], #12
  366. veor @x[6], @x[6], @t[6]
  367. veor @t[1], @t[1], @x[0]
  368. veor @x[7], @x[7], @t[7]
  369. vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64
  370. veor @t[2], @t[2], @x[1]
  371. veor @t[0], @t[0], @x[7]
  372. veor @t[1], @t[1], @x[7]
  373. vext.8 @x[1], @x[1], @x[1], #8
  374. veor @t[5], @t[5], @x[4]
  375. veor @x[0], @x[0], @t[0]
  376. veor @t[6], @t[6], @x[5]
  377. veor @x[1], @x[1], @t[1]
  378. vext.8 @t[0], @x[4], @x[4], #8
  379. veor @t[4], @t[4], @x[3]
  380. vext.8 @t[1], @x[5], @x[5], #8
  381. veor @t[7], @t[7], @x[6]
  382. vext.8 @x[4], @x[3], @x[3], #8
  383. veor @t[3], @t[3], @x[2]
  384. vext.8 @x[5], @x[7], @x[7], #8
  385. veor @t[4], @t[4], @x[7]
  386. vext.8 @x[3], @x[6], @x[6], #8
  387. veor @t[3], @t[3], @x[7]
  388. vext.8 @x[6], @x[2], @x[2], #8
  389. veor @x[7], @t[1], @t[5]
  390. ___
  391. $code.=<<___ if (!$inv);
  392. veor @x[2], @t[0], @t[4]
  393. veor @x[4], @x[4], @t[3]
  394. veor @x[5], @x[5], @t[7]
  395. veor @x[3], @x[3], @t[6]
  396. @ vmov @x[2], @t[0]
  397. veor @x[6], @x[6], @t[2]
  398. @ vmov @x[7], @t[1]
  399. ___
  400. $code.=<<___ if ($inv);
  401. veor @t[3], @t[3], @x[4]
  402. veor @x[5], @x[5], @t[7]
  403. veor @x[2], @x[3], @t[6]
  404. veor @x[3], @t[0], @t[4]
  405. veor @x[4], @x[6], @t[2]
  406. vmov @x[6], @t[3]
  407. @ vmov @x[7], @t[1]
  408. ___
  409. }
  410. sub InvMixColumns_orig {
  411. my @x=@_[0..7];
  412. my @t=@_[8..15];
  413. $code.=<<___;
  414. @ multiplication by 0x0e
  415. vext.8 @t[7], @x[7], @x[7], #12
  416. vmov @t[2], @x[2]
  417. veor @x[2], @x[2], @x[5] @ 2 5
  418. veor @x[7], @x[7], @x[5] @ 7 5
  419. vext.8 @t[0], @x[0], @x[0], #12
  420. vmov @t[5], @x[5]
  421. veor @x[5], @x[5], @x[0] @ 5 0 [1]
  422. veor @x[0], @x[0], @x[1] @ 0 1
  423. vext.8 @t[1], @x[1], @x[1], #12
  424. veor @x[1], @x[1], @x[2] @ 1 25
  425. veor @x[0], @x[0], @x[6] @ 01 6 [2]
  426. vext.8 @t[3], @x[3], @x[3], #12
  427. veor @x[1], @x[1], @x[3] @ 125 3 [4]
  428. veor @x[2], @x[2], @x[0] @ 25 016 [3]
  429. veor @x[3], @x[3], @x[7] @ 3 75
  430. veor @x[7], @x[7], @x[6] @ 75 6 [0]
  431. vext.8 @t[6], @x[6], @x[6], #12
  432. vmov @t[4], @x[4]
  433. veor @x[6], @x[6], @x[4] @ 6 4
  434. veor @x[4], @x[4], @x[3] @ 4 375 [6]
  435. veor @x[3], @x[3], @x[7] @ 375 756=36
  436. veor @x[6], @x[6], @t[5] @ 64 5 [7]
  437. veor @x[3], @x[3], @t[2] @ 36 2
  438. vext.8 @t[5], @t[5], @t[5], #12
  439. veor @x[3], @x[3], @t[4] @ 362 4 [5]
  440. ___
  441. my @y = @x[7,5,0,2,1,3,4,6];
  442. $code.=<<___;
  443. @ multiplication by 0x0b
  444. veor @y[1], @y[1], @y[0]
  445. veor @y[0], @y[0], @t[0]
  446. vext.8 @t[2], @t[2], @t[2], #12
  447. veor @y[1], @y[1], @t[1]
  448. veor @y[0], @y[0], @t[5]
  449. vext.8 @t[4], @t[4], @t[4], #12
  450. veor @y[1], @y[1], @t[6]
  451. veor @y[0], @y[0], @t[7]
  452. veor @t[7], @t[7], @t[6] @ clobber t[7]
  453. veor @y[3], @y[3], @t[0]
  454. veor @y[1], @y[1], @y[0]
  455. vext.8 @t[0], @t[0], @t[0], #12
  456. veor @y[2], @y[2], @t[1]
  457. veor @y[4], @y[4], @t[1]
  458. vext.8 @t[1], @t[1], @t[1], #12
  459. veor @y[2], @y[2], @t[2]
  460. veor @y[3], @y[3], @t[2]
  461. veor @y[5], @y[5], @t[2]
  462. veor @y[2], @y[2], @t[7]
  463. vext.8 @t[2], @t[2], @t[2], #12
  464. veor @y[3], @y[3], @t[3]
  465. veor @y[6], @y[6], @t[3]
  466. veor @y[4], @y[4], @t[3]
  467. veor @y[7], @y[7], @t[4]
  468. vext.8 @t[3], @t[3], @t[3], #12
  469. veor @y[5], @y[5], @t[4]
  470. veor @y[7], @y[7], @t[7]
  471. veor @t[7], @t[7], @t[5] @ clobber t[7] even more
  472. veor @y[3], @y[3], @t[5]
  473. veor @y[4], @y[4], @t[4]
  474. veor @y[5], @y[5], @t[7]
  475. vext.8 @t[4], @t[4], @t[4], #12
  476. veor @y[6], @y[6], @t[7]
  477. veor @y[4], @y[4], @t[7]
  478. veor @t[7], @t[7], @t[5]
  479. vext.8 @t[5], @t[5], @t[5], #12
  480. @ multiplication by 0x0d
  481. veor @y[4], @y[4], @y[7]
  482. veor @t[7], @t[7], @t[6] @ restore t[7]
  483. veor @y[7], @y[7], @t[4]
  484. vext.8 @t[6], @t[6], @t[6], #12
  485. veor @y[2], @y[2], @t[0]
  486. veor @y[7], @y[7], @t[5]
  487. vext.8 @t[7], @t[7], @t[7], #12
  488. veor @y[2], @y[2], @t[2]
  489. veor @y[3], @y[3], @y[1]
  490. veor @y[1], @y[1], @t[1]
  491. veor @y[0], @y[0], @t[0]
  492. veor @y[3], @y[3], @t[0]
  493. veor @y[1], @y[1], @t[5]
  494. veor @y[0], @y[0], @t[5]
  495. vext.8 @t[0], @t[0], @t[0], #12
  496. veor @y[1], @y[1], @t[7]
  497. veor @y[0], @y[0], @t[6]
  498. veor @y[3], @y[3], @y[1]
  499. veor @y[4], @y[4], @t[1]
  500. vext.8 @t[1], @t[1], @t[1], #12
  501. veor @y[7], @y[7], @t[7]
  502. veor @y[4], @y[4], @t[2]
  503. veor @y[5], @y[5], @t[2]
  504. veor @y[2], @y[2], @t[6]
  505. veor @t[6], @t[6], @t[3] @ clobber t[6]
  506. vext.8 @t[2], @t[2], @t[2], #12
  507. veor @y[4], @y[4], @y[7]
  508. veor @y[3], @y[3], @t[6]
  509. veor @y[6], @y[6], @t[6]
  510. veor @y[5], @y[5], @t[5]
  511. vext.8 @t[5], @t[5], @t[5], #12
  512. veor @y[6], @y[6], @t[4]
  513. vext.8 @t[4], @t[4], @t[4], #12
  514. veor @y[5], @y[5], @t[6]
  515. veor @y[6], @y[6], @t[7]
  516. vext.8 @t[7], @t[7], @t[7], #12
  517. veor @t[6], @t[6], @t[3] @ restore t[6]
  518. vext.8 @t[3], @t[3], @t[3], #12
  519. @ multiplication by 0x09
  520. veor @y[4], @y[4], @y[1]
  521. veor @t[1], @t[1], @y[1] @ t[1]=y[1]
  522. veor @t[0], @t[0], @t[5] @ clobber t[0]
  523. vext.8 @t[6], @t[6], @t[6], #12
  524. veor @t[1], @t[1], @t[5]
  525. veor @y[3], @y[3], @t[0]
  526. veor @t[0], @t[0], @y[0] @ t[0]=y[0]
  527. veor @t[1], @t[1], @t[6]
  528. veor @t[6], @t[6], @t[7] @ clobber t[6]
  529. veor @y[4], @y[4], @t[1]
  530. veor @y[7], @y[7], @t[4]
  531. veor @y[6], @y[6], @t[3]
  532. veor @y[5], @y[5], @t[2]
  533. veor @t[4], @t[4], @y[4] @ t[4]=y[4]
  534. veor @t[3], @t[3], @y[3] @ t[3]=y[3]
  535. veor @t[5], @t[5], @y[5] @ t[5]=y[5]
  536. veor @t[2], @t[2], @y[2] @ t[2]=y[2]
  537. veor @t[3], @t[3], @t[7]
  538. veor @XMM[5], @t[5], @t[6]
  539. veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
  540. veor @XMM[2], @t[2], @t[6]
  541. veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
  542. vmov @XMM[0], @t[0]
  543. vmov @XMM[1], @t[1]
  544. @ vmov @XMM[2], @t[2]
  545. vmov @XMM[3], @t[3]
  546. vmov @XMM[4], @t[4]
  547. @ vmov @XMM[5], @t[5]
  548. @ vmov @XMM[6], @t[6]
  549. @ vmov @XMM[7], @t[7]
  550. ___
  551. }
  552. sub InvMixColumns {
  553. my @x=@_[0..7];
  554. my @t=@_[8..15];
  555. # Thanks to Jussi Kivilinna for providing pointer to
  556. #
  557. # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
  558. # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
  559. # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
  560. # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
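# As a hedged, illustrative cross-check of the factorization above (kept under
# if (0) so that it never runs and never touches $code): multiply the two
# circulants on the right in GF(2^8) with the AES polynomial 0x11b and compare
# against the 0e-0b-0d-09 circulant on the left.
if (0) {
	my $gmul = sub {			# GF(2^8) multiply, AES polynomial
		my ($a,$b)=@_; my $r=0;
		while ($b) { $r^=$a if ($b&1); $a=($a<<1)^(($a&0x80)?0x11b:0); $b>>=1; }
		$r&0xff;
	};
	my @mix = (0x02,0x03,0x01,0x01);	# first row of MixColumns
	my @pre = (0x05,0x00,0x04,0x00);	# first row of the 05-00-04-00 circulant
	my @imc = (0x0e,0x0b,0x0d,0x09);	# first row of InvMixColumns
	for my $j (0..3) {
		my $c = 0;
		$c ^= $gmul->($mix[$_], $pre[($j-$_)%4]) for (0..3);
		die "InvMixColumns factorization mismatch" if ($c != $imc[$j]);
	}
}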
  561. $code.=<<___;
  562. @ multiplication by 0x05-0x00-0x04-0x00
  563. vext.8 @t[0], @x[0], @x[0], #8
  564. vext.8 @t[6], @x[6], @x[6], #8
  565. vext.8 @t[7], @x[7], @x[7], #8
  566. veor @t[0], @t[0], @x[0]
  567. vext.8 @t[1], @x[1], @x[1], #8
  568. veor @t[6], @t[6], @x[6]
  569. vext.8 @t[2], @x[2], @x[2], #8
  570. veor @t[7], @t[7], @x[7]
  571. vext.8 @t[3], @x[3], @x[3], #8
  572. veor @t[1], @t[1], @x[1]
  573. vext.8 @t[4], @x[4], @x[4], #8
  574. veor @t[2], @t[2], @x[2]
  575. vext.8 @t[5], @x[5], @x[5], #8
  576. veor @t[3], @t[3], @x[3]
  577. veor @t[4], @t[4], @x[4]
  578. veor @t[5], @t[5], @x[5]
  579. veor @x[0], @x[0], @t[6]
  580. veor @x[1], @x[1], @t[6]
  581. veor @x[2], @x[2], @t[0]
  582. veor @x[4], @x[4], @t[2]
  583. veor @x[3], @x[3], @t[1]
  584. veor @x[1], @x[1], @t[7]
  585. veor @x[2], @x[2], @t[7]
  586. veor @x[4], @x[4], @t[6]
  587. veor @x[5], @x[5], @t[3]
  588. veor @x[3], @x[3], @t[6]
  589. veor @x[6], @x[6], @t[4]
  590. veor @x[4], @x[4], @t[7]
  591. veor @x[5], @x[5], @t[7]
  592. veor @x[7], @x[7], @t[5]
  593. ___
  594. &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
  595. }
  596. sub swapmove {
  597. my ($a,$b,$n,$mask,$t)=@_;
  598. $code.=<<___;
  599. vshr.u64 $t, $b, #$n
  600. veor $t, $t, $a
  601. vand $t, $t, $mask
  602. veor $a, $a, $t
  603. vshl.u64 $t, $t, #$n
  604. veor $b, $b, $t
  605. ___
  606. }
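# swapmove is the standard delta-swap on 64-bit lanes; in scalar terms it does
# roughly
#
#	t = ((b >> n) ^ a) & mask;  a ^= t;  b ^= t << n;
#
# i.e. it exchanges the bit groups of a and b selected by mask.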
  607. sub swapmove2x {
  608. my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
  609. $code.=<<___;
  610. vshr.u64 $t0, $b0, #$n
  611. vshr.u64 $t1, $b1, #$n
  612. veor $t0, $t0, $a0
  613. veor $t1, $t1, $a1
  614. vand $t0, $t0, $mask
  615. vand $t1, $t1, $mask
  616. veor $a0, $a0, $t0
  617. vshl.u64 $t0, $t0, #$n
  618. veor $a1, $a1, $t1
  619. vshl.u64 $t1, $t1, #$n
  620. veor $b0, $b0, $t0
  621. veor $b1, $b1, $t1
  622. ___
  623. }
  624. sub bitslice {
  625. my @x=reverse(@_[0..7]);
  626. my ($t0,$t1,$t2,$t3)=@_[8..11];
  627. $code.=<<___;
  628. vmov.i8 $t0,#0x55 @ compose .LBS0
  629. vmov.i8 $t1,#0x33 @ compose .LBS1
  630. ___
  631. &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
  632. &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
  633. $code.=<<___;
  634. vmov.i8 $t0,#0x0f @ compose .LBS2
  635. ___
  636. &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
  637. &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
  638. &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
  639. &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
  640. }
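# Taken together, the swapmove2x passes with masks 0x55, 0x33 and 0x0f
# transpose the eight input registers bit by bit, so that each register ends
# up holding a single bit position of every byte of the eight blocks; this is
# the bit-sliced form the rest of the code operates on.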
  641. $code.=<<___;
  642. #if defined(__arm__)
  643. #ifndef __KERNEL__
  644. # include "arm_arch.h"
  645. # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
  646. # define VFP_ABI_POP vldmia sp!,{d8-d15}
  647. # define VFP_ABI_FRAME 0x40
  648. #else
  649. # define VFP_ABI_PUSH
  650. # define VFP_ABI_POP
  651. # define VFP_ABI_FRAME 0
  652. # define BSAES_ASM_EXTENDED_KEY
  653. # define XTS_CHAIN_TWEAK
  654. # define __ARM_ARCH__ __LINUX_ARM_ARCH__
  655. # define __ARM_MAX_ARCH__ 7
  656. #endif
  657. #ifdef __thumb__
  658. # define adrl adr
  659. #endif
  660. #if __ARM_MAX_ARCH__>=7
  661. .arch armv7-a
  662. .fpu neon
  663. .text
  664. .syntax unified @ ARMv7-capable assembler is expected to handle this
  665. #if defined(__thumb2__) && !defined(__APPLE__)
  666. .thumb
  667. #else
  668. .code 32
  669. #endif
  670. .type _bsaes_decrypt8,%function
  671. .align 4
  672. _bsaes_decrypt8:
  673. adr $const,_bsaes_decrypt8
  674. vldmia $key!, {@XMM[9]} @ round 0 key
  675. #ifdef __APPLE__
  676. adr $const,.LM0ISR
  677. #else
  678. add $const,$const,#.LM0ISR-_bsaes_decrypt8
  679. #endif
  680. vldmia $const!, {@XMM[8]} @ .LM0ISR
  681. veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
  682. veor @XMM[11], @XMM[1], @XMM[9]
  683. vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  684. vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  685. veor @XMM[12], @XMM[2], @XMM[9]
  686. vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  687. vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  688. veor @XMM[13], @XMM[3], @XMM[9]
  689. vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
  690. vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
  691. veor @XMM[14], @XMM[4], @XMM[9]
  692. vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
  693. vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
  694. veor @XMM[15], @XMM[5], @XMM[9]
  695. vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
  696. vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
  697. veor @XMM[10], @XMM[6], @XMM[9]
  698. vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
  699. vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
  700. veor @XMM[11], @XMM[7], @XMM[9]
  701. vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  702. vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  703. vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  704. vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  705. ___
  706. &bitslice (@XMM[0..7, 8..11]);
  707. $code.=<<___;
  708. sub $rounds,$rounds,#1
  709. b .Ldec_sbox
  710. .align 4
  711. .Ldec_loop:
  712. ___
  713. &ShiftRows (@XMM[0..7, 8..12]);
  714. $code.=".Ldec_sbox:\n";
  715. &InvSbox (@XMM[0..7, 8..15]);
  716. $code.=<<___;
  717. subs $rounds,$rounds,#1
  718. bcc .Ldec_done
  719. ___
  720. &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
  721. $code.=<<___;
  722. vldmia $const, {@XMM[12]} @ .LISR
  723. ite eq @ Thumb2 thing, sanity check in ARM
  724. addeq $const,$const,#0x10
  725. bne .Ldec_loop
  726. vldmia $const, {@XMM[12]} @ .LISRM0
  727. b .Ldec_loop
  728. .align 4
  729. .Ldec_done:
  730. ___
  731. &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
  732. $code.=<<___;
  733. vldmia $key, {@XMM[8]} @ last round key
  734. veor @XMM[6], @XMM[6], @XMM[8]
  735. veor @XMM[4], @XMM[4], @XMM[8]
  736. veor @XMM[2], @XMM[2], @XMM[8]
  737. veor @XMM[7], @XMM[7], @XMM[8]
  738. veor @XMM[3], @XMM[3], @XMM[8]
  739. veor @XMM[5], @XMM[5], @XMM[8]
  740. veor @XMM[0], @XMM[0], @XMM[8]
  741. veor @XMM[1], @XMM[1], @XMM[8]
  742. bx lr
  743. .size _bsaes_decrypt8,.-_bsaes_decrypt8
  744. .type _bsaes_const,%object
  745. .align 6
  746. _bsaes_const:
  747. .LM0ISR: @ InvShiftRows constants
  748. .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
  749. .LISR:
  750. .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
  751. .LISRM0:
  752. .quad 0x01040b0e0205080f, 0x0306090c00070a0d
  753. .LM0SR: @ ShiftRows constants
  754. .quad 0x0a0e02060f03070b, 0x0004080c05090d01
  755. .LSR:
  756. .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
  757. .LSRM0:
  758. .quad 0x0304090e00050a0f, 0x01060b0c0207080d
  759. .LM0:
  760. .quad 0x02060a0e03070b0f, 0x0004080c0105090d
  761. .LREVM0SR:
  762. .quad 0x090d01050c000408, 0x03070b0f060a0e02
  763. .asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
  764. .align 6
  765. .size _bsaes_const,.-_bsaes_const
  766. .type _bsaes_encrypt8,%function
  767. .align 4
  768. _bsaes_encrypt8:
  769. adr $const,_bsaes_encrypt8
  770. vldmia $key!, {@XMM[9]} @ round 0 key
  771. #ifdef __APPLE__
  772. adr $const,.LM0SR
  773. #else
  774. sub $const,$const,#_bsaes_encrypt8-.LM0SR
  775. #endif
  776. vldmia $const!, {@XMM[8]} @ .LM0SR
  777. _bsaes_encrypt8_alt:
  778. veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
  779. veor @XMM[11], @XMM[1], @XMM[9]
  780. vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  781. vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  782. veor @XMM[12], @XMM[2], @XMM[9]
  783. vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  784. vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  785. veor @XMM[13], @XMM[3], @XMM[9]
  786. vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
  787. vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
  788. veor @XMM[14], @XMM[4], @XMM[9]
  789. vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
  790. vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
  791. veor @XMM[15], @XMM[5], @XMM[9]
  792. vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
  793. vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
  794. veor @XMM[10], @XMM[6], @XMM[9]
  795. vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
  796. vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
  797. veor @XMM[11], @XMM[7], @XMM[9]
  798. vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  799. vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  800. vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  801. vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  802. _bsaes_encrypt8_bitslice:
  803. ___
  804. &bitslice (@XMM[0..7, 8..11]);
  805. $code.=<<___;
  806. sub $rounds,$rounds,#1
  807. b .Lenc_sbox
  808. .align 4
  809. .Lenc_loop:
  810. ___
  811. &ShiftRows (@XMM[0..7, 8..12]);
  812. $code.=".Lenc_sbox:\n";
  813. &Sbox (@XMM[0..7, 8..15]);
  814. $code.=<<___;
  815. subs $rounds,$rounds,#1
  816. bcc .Lenc_done
  817. ___
  818. &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
  819. $code.=<<___;
  820. vldmia $const, {@XMM[12]} @ .LSR
  821. ite eq @ Thumb2 thing, sanity check in ARM
  822. addeq $const,$const,#0x10
  823. bne .Lenc_loop
  824. vldmia $const, {@XMM[12]} @ .LSRM0
  825. b .Lenc_loop
  826. .align 4
  827. .Lenc_done:
  828. ___
  829. # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
  830. &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
  831. $code.=<<___;
  832. vldmia $key, {@XMM[8]} @ last round key
  833. veor @XMM[4], @XMM[4], @XMM[8]
  834. veor @XMM[6], @XMM[6], @XMM[8]
  835. veor @XMM[3], @XMM[3], @XMM[8]
  836. veor @XMM[7], @XMM[7], @XMM[8]
  837. veor @XMM[2], @XMM[2], @XMM[8]
  838. veor @XMM[5], @XMM[5], @XMM[8]
  839. veor @XMM[0], @XMM[0], @XMM[8]
  840. veor @XMM[1], @XMM[1], @XMM[8]
  841. bx lr
  842. .size _bsaes_encrypt8,.-_bsaes_encrypt8
  843. ___
  844. }
  845. {
  846. my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
  847. sub bitslice_key {
  848. my @x=reverse(@_[0..7]);
  849. my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
  850. &swapmove (@x[0,1],1,$bs0,$t2,$t3);
  851. $code.=<<___;
  852. @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
  853. vmov @x[2], @x[0]
  854. vmov @x[3], @x[1]
  855. ___
  856. #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
  857. &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
  858. $code.=<<___;
  859. @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
  860. vmov @x[4], @x[0]
  861. vmov @x[6], @x[2]
  862. vmov @x[5], @x[1]
  863. vmov @x[7], @x[3]
  864. ___
  865. &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
  866. &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
  867. }
  868. $code.=<<___;
  869. .type _bsaes_key_convert,%function
  870. .align 4
  871. _bsaes_key_convert:
  872. adr $const,_bsaes_key_convert
  873. vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
  874. #ifdef __APPLE__
  875. adr $const,.LM0
  876. #else
  877. sub $const,$const,#_bsaes_key_convert-.LM0
  878. #endif
  879. vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
  880. vmov.i8 @XMM[8], #0x01 @ bit masks
  881. vmov.i8 @XMM[9], #0x02
  882. vmov.i8 @XMM[10], #0x04
  883. vmov.i8 @XMM[11], #0x08
  884. vmov.i8 @XMM[12], #0x10
  885. vmov.i8 @XMM[13], #0x20
  886. vldmia $const, {@XMM[14]} @ .LM0
  887. #ifdef __ARMEL__
  888. vrev32.8 @XMM[7], @XMM[7]
  889. vrev32.8 @XMM[15], @XMM[15]
  890. #endif
  891. sub $rounds,$rounds,#1
  892. vstmia $out!, {@XMM[7]} @ save round 0 key
  893. b .Lkey_loop
  894. .align 4
  895. .Lkey_loop:
  896. vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
  897. vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
  898. vmov.i8 @XMM[6], #0x40
  899. vmov.i8 @XMM[15], #0x80
  900. vtst.8 @XMM[0], @XMM[7], @XMM[8]
  901. vtst.8 @XMM[1], @XMM[7], @XMM[9]
  902. vtst.8 @XMM[2], @XMM[7], @XMM[10]
  903. vtst.8 @XMM[3], @XMM[7], @XMM[11]
  904. vtst.8 @XMM[4], @XMM[7], @XMM[12]
  905. vtst.8 @XMM[5], @XMM[7], @XMM[13]
  906. vtst.8 @XMM[6], @XMM[7], @XMM[6]
  907. vtst.8 @XMM[7], @XMM[7], @XMM[15]
  908. vld1.8 {@XMM[15]}, [$inp]! @ load next round key
  909. vmvn @XMM[0], @XMM[0] @ "pnot"
  910. vmvn @XMM[1], @XMM[1]
  911. vmvn @XMM[5], @XMM[5]
  912. vmvn @XMM[6], @XMM[6]
  913. #ifdef __ARMEL__
  914. vrev32.8 @XMM[15], @XMM[15]
  915. #endif
  916. subs $rounds,$rounds,#1
  917. vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
  918. bne .Lkey_loop
  919. vmov.i8 @XMM[7],#0x63 @ compose .L63
  920. @ don't save last round key
  921. bx lr
  922. .size _bsaes_key_convert,.-_bsaes_key_convert
  923. ___
  924. }
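# Roughly, the conversion works as follows: each expanded round key is tested
# against the bit masks 0x01..0x80 (vtst), which re-packs it into eight
# bit-sliced registers, and the vmvn of lanes 0, 1, 5 and 6 folds the S-box
# affine constant 0x63 (bits 0, 1, 5, 6) into the stored keys.  That is why
# the callers "fix up" the round 0 or last round key with the 0x63 value left
# in @XMM[7].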
  925. if (0) { # following four functions are unsupported interface
  926. # used for benchmarking...
  927. $code.=<<___;
  928. .globl bsaes_enc_key_convert
  929. .hidden bsaes_enc_key_convert
  930. .type bsaes_enc_key_convert,%function
  931. .align 4
  932. bsaes_enc_key_convert:
  933. stmdb sp!,{r4-r6,lr}
  934. vstmdb sp!,{d8-d15} @ ABI specification says so
  935. ldr r5,[$inp,#240] @ pass rounds
  936. mov r4,$inp @ pass key
  937. mov r12,$out @ pass key schedule
  938. bl _bsaes_key_convert
  939. veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
  940. vstmia r12, {@XMM[7]} @ save last round key
  941. vldmia sp!,{d8-d15}
  942. ldmia sp!,{r4-r6,pc}
  943. .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
  944. .globl bsaes_encrypt_128
  945. .hidden bsaes_encrypt_128
  946. .type bsaes_encrypt_128,%function
  947. .align 4
  948. bsaes_encrypt_128:
  949. stmdb sp!,{r4-r6,lr}
  950. vstmdb sp!,{d8-d15} @ ABI specification says so
  951. .Lenc128_loop:
  952. vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
  953. vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
  954. mov r4,$key @ pass the key
  955. vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
  956. mov r5,#10 @ pass rounds
  957. vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
  958. bl _bsaes_encrypt8
  959. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  960. vst1.8 {@XMM[4]}, [$out]!
  961. vst1.8 {@XMM[6]}, [$out]!
  962. vst1.8 {@XMM[3]}, [$out]!
  963. vst1.8 {@XMM[7]}, [$out]!
  964. vst1.8 {@XMM[2]}, [$out]!
  965. subs $len,$len,#0x80
  966. vst1.8 {@XMM[5]}, [$out]!
  967. bhi .Lenc128_loop
  968. vldmia sp!,{d8-d15}
  969. ldmia sp!,{r4-r6,pc}
  970. .size bsaes_encrypt_128,.-bsaes_encrypt_128
  971. .globl bsaes_dec_key_convert
  972. .hidden bsaes_dec_key_convert
  973. .type bsaes_dec_key_convert,%function
  974. .align 4
  975. bsaes_dec_key_convert:
  976. stmdb sp!,{r4-r6,lr}
  977. vstmdb sp!,{d8-d15} @ ABI specification says so
  978. ldr r5,[$inp,#240] @ pass rounds
  979. mov r4,$inp @ pass key
  980. mov r12,$out @ pass key schedule
  981. bl _bsaes_key_convert
  982. vldmia $out, {@XMM[6]}
  983. vstmia r12, {@XMM[15]} @ save last round key
  984. veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
  985. vstmia $out, {@XMM[7]}
  986. vldmia sp!,{d8-d15}
  987. ldmia sp!,{r4-r6,pc}
  988. .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
  989. .globl bsaes_decrypt_128
  990. .hidden bsaes_decrypt_128
  991. .type bsaes_decrypt_128,%function
  992. .align 4
  993. bsaes_decrypt_128:
  994. stmdb sp!,{r4-r6,lr}
  995. vstmdb sp!,{d8-d15} @ ABI specification says so
  996. .Ldec128_loop:
  997. vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
  998. vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
  999. mov r4,$key @ pass the key
  1000. vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
  1001. mov r5,#10 @ pass rounds
  1002. vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
  1003. bl _bsaes_decrypt8
  1004. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1005. vst1.8 {@XMM[6]}, [$out]!
  1006. vst1.8 {@XMM[4]}, [$out]!
  1007. vst1.8 {@XMM[2]}, [$out]!
  1008. vst1.8 {@XMM[7]}, [$out]!
  1009. vst1.8 {@XMM[3]}, [$out]!
  1010. subs $len,$len,#0x80
  1011. vst1.8 {@XMM[5]}, [$out]!
  1012. bhi .Ldec128_loop
  1013. vldmia sp!,{d8-d15}
  1014. ldmia sp!,{r4-r6,pc}
  1015. .size bsaes_decrypt_128,.-bsaes_decrypt_128
  1016. ___
  1017. }
  1018. {
  1019. my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
  1020. my ($keysched)=("sp");
  1021. $code.=<<___;
  1022. .extern AES_cbc_encrypt
  1023. .extern AES_decrypt
  1024. .global bsaes_cbc_encrypt
  1025. .hidden bsaes_cbc_encrypt
  1026. .type bsaes_cbc_encrypt,%function
  1027. .align 5
  1028. bsaes_cbc_encrypt:
  1029. #ifndef __KERNEL__
  1030. cmp $len, #128
  1031. #ifndef __thumb__
  1032. blo AES_cbc_encrypt
  1033. #else
  1034. bhs 1f
  1035. b AES_cbc_encrypt
  1036. 1:
  1037. #endif
  1038. #endif
  1039. @ it is up to the caller to make sure we are called with enc == 0
  1040. mov ip, sp
  1041. stmdb sp!, {r4-r10, lr}
  1042. VFP_ABI_PUSH
  1043. ldr $ivp, [ip] @ IV is 1st arg on the stack
  1044. mov $len, $len, lsr#4 @ len in 16 byte blocks
  1045. sub sp, #0x10 @ scratch space to carry over the IV
  1046. mov $fp, sp @ save sp
  1047. ldr $rounds, [$key, #240] @ get # of rounds
  1048. #ifndef BSAES_ASM_EXTENDED_KEY
  1049. @ allocate the key schedule on the stack
  1050. sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
  1051. add r12, #`128-32` @ size of bit-sliced key schedule
  1052. @ populate the key schedule
  1053. mov r4, $key @ pass key
  1054. mov r5, $rounds @ pass # of rounds
  1055. mov sp, r12 @ sp is $keysched
  1056. bl _bsaes_key_convert
  1057. vldmia $keysched, {@XMM[6]}
  1058. vstmia r12, {@XMM[15]} @ save last round key
  1059. veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
  1060. vstmia $keysched, {@XMM[7]}
  1061. #else
  1062. ldr r12, [$key, #244]
  1063. eors r12, #1
  1064. beq 0f
  1065. @ populate the key schedule
  1066. str r12, [$key, #244]
  1067. mov r4, $key @ pass key
  1068. mov r5, $rounds @ pass # of rounds
  1069. add r12, $key, #248 @ pass key schedule
  1070. bl _bsaes_key_convert
  1071. add r4, $key, #248
  1072. vldmia r4, {@XMM[6]}
  1073. vstmia r12, {@XMM[15]} @ save last round key
  1074. veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
  1075. vstmia r4, {@XMM[7]}
  1076. .align 2
  1077. 0:
  1078. #endif
  1079. vld1.8 {@XMM[15]}, [$ivp] @ load IV
  1080. b .Lcbc_dec_loop
  1081. .align 4
  1082. .Lcbc_dec_loop:
  1083. subs $len, $len, #0x8
  1084. bmi .Lcbc_dec_loop_finish
  1085. vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
  1086. vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
  1087. #ifndef BSAES_ASM_EXTENDED_KEY
  1088. mov r4, $keysched @ pass the key
  1089. #else
  1090. add r4, $key, #248
  1091. #endif
  1092. vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
  1093. mov r5, $rounds
  1094. vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
  1095. sub $inp, $inp, #0x60
  1096. vstmia $fp, {@XMM[15]} @ put aside IV
  1097. bl _bsaes_decrypt8
  1098. vldmia $fp, {@XMM[14]} @ reload IV
  1099. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1100. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1101. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1102. veor @XMM[1], @XMM[1], @XMM[8]
  1103. veor @XMM[6], @XMM[6], @XMM[9]
  1104. vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
  1105. veor @XMM[4], @XMM[4], @XMM[10]
  1106. veor @XMM[2], @XMM[2], @XMM[11]
  1107. vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
  1108. veor @XMM[7], @XMM[7], @XMM[12]
  1109. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1110. veor @XMM[3], @XMM[3], @XMM[13]
  1111. vst1.8 {@XMM[6]}, [$out]!
  1112. veor @XMM[5], @XMM[5], @XMM[14]
  1113. vst1.8 {@XMM[4]}, [$out]!
  1114. vst1.8 {@XMM[2]}, [$out]!
  1115. vst1.8 {@XMM[7]}, [$out]!
  1116. vst1.8 {@XMM[3]}, [$out]!
  1117. vst1.8 {@XMM[5]}, [$out]!
  1118. b .Lcbc_dec_loop
  1119. .Lcbc_dec_loop_finish:
  1120. adds $len, $len, #8
  1121. beq .Lcbc_dec_done
  1122. vld1.8 {@XMM[0]}, [$inp]! @ load input
  1123. cmp $len, #2
  1124. blo .Lcbc_dec_one
  1125. vld1.8 {@XMM[1]}, [$inp]!
  1126. #ifndef BSAES_ASM_EXTENDED_KEY
  1127. mov r4, $keysched @ pass the key
  1128. #else
  1129. add r4, $key, #248
  1130. #endif
  1131. mov r5, $rounds
  1132. vstmia $fp, {@XMM[15]} @ put aside IV
  1133. beq .Lcbc_dec_two
  1134. vld1.8 {@XMM[2]}, [$inp]!
  1135. cmp $len, #4
  1136. blo .Lcbc_dec_three
  1137. vld1.8 {@XMM[3]}, [$inp]!
  1138. beq .Lcbc_dec_four
  1139. vld1.8 {@XMM[4]}, [$inp]!
  1140. cmp $len, #6
  1141. blo .Lcbc_dec_five
  1142. vld1.8 {@XMM[5]}, [$inp]!
  1143. beq .Lcbc_dec_six
  1144. vld1.8 {@XMM[6]}, [$inp]!
  1145. sub $inp, $inp, #0x70
  1146. bl _bsaes_decrypt8
  1147. vldmia $fp, {@XMM[14]} @ reload IV
  1148. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1149. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1150. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1151. veor @XMM[1], @XMM[1], @XMM[8]
  1152. veor @XMM[6], @XMM[6], @XMM[9]
  1153. vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
  1154. veor @XMM[4], @XMM[4], @XMM[10]
  1155. veor @XMM[2], @XMM[2], @XMM[11]
  1156. vld1.8 {@XMM[15]}, [$inp]!
  1157. veor @XMM[7], @XMM[7], @XMM[12]
  1158. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1159. veor @XMM[3], @XMM[3], @XMM[13]
  1160. vst1.8 {@XMM[6]}, [$out]!
  1161. vst1.8 {@XMM[4]}, [$out]!
  1162. vst1.8 {@XMM[2]}, [$out]!
  1163. vst1.8 {@XMM[7]}, [$out]!
  1164. vst1.8 {@XMM[3]}, [$out]!
  1165. b .Lcbc_dec_done
  1166. .align 4
  1167. .Lcbc_dec_six:
  1168. sub $inp, $inp, #0x60
  1169. bl _bsaes_decrypt8
  1170. vldmia $fp,{@XMM[14]} @ reload IV
  1171. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1172. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1173. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1174. veor @XMM[1], @XMM[1], @XMM[8]
  1175. veor @XMM[6], @XMM[6], @XMM[9]
  1176. vld1.8 {@XMM[12]}, [$inp]!
  1177. veor @XMM[4], @XMM[4], @XMM[10]
  1178. veor @XMM[2], @XMM[2], @XMM[11]
  1179. vld1.8 {@XMM[15]}, [$inp]!
  1180. veor @XMM[7], @XMM[7], @XMM[12]
  1181. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1182. vst1.8 {@XMM[6]}, [$out]!
  1183. vst1.8 {@XMM[4]}, [$out]!
  1184. vst1.8 {@XMM[2]}, [$out]!
  1185. vst1.8 {@XMM[7]}, [$out]!
  1186. b .Lcbc_dec_done
  1187. .align 4
  1188. .Lcbc_dec_five:
  1189. sub $inp, $inp, #0x50
  1190. bl _bsaes_decrypt8
  1191. vldmia $fp, {@XMM[14]} @ reload IV
  1192. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1193. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1194. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1195. veor @XMM[1], @XMM[1], @XMM[8]
  1196. veor @XMM[6], @XMM[6], @XMM[9]
  1197. vld1.8 {@XMM[15]}, [$inp]!
  1198. veor @XMM[4], @XMM[4], @XMM[10]
  1199. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1200. veor @XMM[2], @XMM[2], @XMM[11]
  1201. vst1.8 {@XMM[6]}, [$out]!
  1202. vst1.8 {@XMM[4]}, [$out]!
  1203. vst1.8 {@XMM[2]}, [$out]!
  1204. b .Lcbc_dec_done
  1205. .align 4
  1206. .Lcbc_dec_four:
  1207. sub $inp, $inp, #0x40
  1208. bl _bsaes_decrypt8
  1209. vldmia $fp, {@XMM[14]} @ reload IV
  1210. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1211. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1212. vld1.8 {@XMM[10]}, [$inp]!
  1213. veor @XMM[1], @XMM[1], @XMM[8]
  1214. veor @XMM[6], @XMM[6], @XMM[9]
  1215. vld1.8 {@XMM[15]}, [$inp]!
  1216. veor @XMM[4], @XMM[4], @XMM[10]
  1217. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1218. vst1.8 {@XMM[6]}, [$out]!
  1219. vst1.8 {@XMM[4]}, [$out]!
  1220. b .Lcbc_dec_done
  1221. .align 4
  1222. .Lcbc_dec_three:
  1223. sub $inp, $inp, #0x30
  1224. bl _bsaes_decrypt8
  1225. vldmia $fp, {@XMM[14]} @ reload IV
  1226. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1227. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1228. vld1.8 {@XMM[15]}, [$inp]!
  1229. veor @XMM[1], @XMM[1], @XMM[8]
  1230. veor @XMM[6], @XMM[6], @XMM[9]
  1231. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1232. vst1.8 {@XMM[6]}, [$out]!
  1233. b .Lcbc_dec_done
  1234. .align 4
  1235. .Lcbc_dec_two:
  1236. sub $inp, $inp, #0x20
  1237. bl _bsaes_decrypt8
  1238. vldmia $fp, {@XMM[14]} @ reload IV
  1239. vld1.8 {@XMM[8]}, [$inp]! @ reload input
  1240. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1241. vld1.8 {@XMM[15]}, [$inp]! @ reload input
  1242. veor @XMM[1], @XMM[1], @XMM[8]
  1243. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1244. b .Lcbc_dec_done
  1245. .align 4
  1246. .Lcbc_dec_one:
  1247. sub $inp, $inp, #0x10
  1248. mov $rounds, $out @ save original out pointer
  1249. mov $out, $fp @ use the iv scratch space as out buffer
  1250. mov r2, $key
  1251. vmov @XMM[4],@XMM[15] @ just in case ensure that IV
  1252. vmov @XMM[5],@XMM[0] @ and input are preserved
  1253. bl AES_decrypt
  1254. vld1.8 {@XMM[0]}, [$fp,:64] @ load result
  1255. veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
  1256. vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
  1257. vst1.8 {@XMM[0]}, [$rounds] @ write output
  1258. .Lcbc_dec_done:
  1259. #ifndef BSAES_ASM_EXTENDED_KEY
  1260. vmov.i32 q0, #0
  1261. vmov.i32 q1, #0
  1262. .Lcbc_dec_bzero: @ wipe key schedule [if any]
  1263. vstmia $keysched!, {q0-q1}
  1264. cmp $keysched, $fp
  1265. bne .Lcbc_dec_bzero
  1266. #endif
  1267. mov sp, $fp
  1268. add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
  1269. vst1.8 {@XMM[15]}, [$ivp] @ return IV
  1270. VFP_ABI_POP
  1271. ldmia sp!, {r4-r10, pc}
  1272. .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
  1273. ___
  1274. }
  1275. {
  1276. my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
  1277. my $const = "r6"; # shared with _bsaes_encrypt8_alt
  1278. my $keysched = "sp";
$code.=<<___;
.extern AES_encrypt
.global bsaes_ctr32_encrypt_blocks
.hidden bsaes_ctr32_encrypt_blocks
.type bsaes_ctr32_encrypt_blocks,%function
.align 5
bsaes_ctr32_encrypt_blocks:
cmp $len, #8 @ use plain AES for
blo .Lctr_enc_short @ small sizes
mov ip, sp
stmdb sp!, {r4-r10, lr}
VFP_ABI_PUSH
ldr $ctr, [ip] @ ctr is 1st arg on the stack
sub sp, sp, #0x10 @ scratch space to carry over the ctr
mov $fp, sp @ save sp
ldr $rounds, [$key, #240] @ get # of rounds
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
add r12, #`128-32` @ size of bit-sliced key schedule
@ populate the key schedule
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
mov sp, r12 @ sp is $keysched
bl _bsaes_key_convert
veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
vstmia r12, {@XMM[7]} @ save last round key
vld1.8 {@XMM[0]}, [$ctr] @ load counter
#ifdef __APPLE__
mov $ctr, #:lower16:(.LREVM0SR-.LM0)
add $ctr, $const, $ctr
#else
add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
#endif
vldmia $keysched, {@XMM[4]} @ load round0 key
#else
ldr r12, [$key, #244]
eors r12, #1
beq 0f
@ populate the key schedule
str r12, [$key, #244]
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
add r12, $key, #248 @ pass key schedule
bl _bsaes_key_convert
veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
vstmia r12, {@XMM[7]} @ save last round key
.align 2
0: add r12, $key, #248
vld1.8 {@XMM[0]}, [$ctr] @ load counter
adrl $ctr, .LREVM0SR @ borrow $ctr
vldmia r12, {@XMM[4]} @ load round0 key
sub sp, #0x10 @ place for adjusted round0 key
#endif
vmov.i32 @XMM[8],#1 @ compose 1<<96
veor @XMM[9],@XMM[9],@XMM[9]
vrev32.8 @XMM[0],@XMM[0]
vext.8 @XMM[8],@XMM[9],@XMM[8],#4
vrev32.8 @XMM[4],@XMM[4]
vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
b .Lctr_enc_loop
.align 4
.Lctr_enc_loop:
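@ Each pass builds the eight counter blocks ctr+0..ctr+7 in @XMM[0-7] (the
@ 1<<96/2<<96/3<<96 constants bump the byte-swapped counter lane), stashes
@ ctr+8 at [$fp] for the next pass, encrypts the batch with
@ _bsaes_encrypt8_alt and XORs the keystream into eight blocks of input.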
vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
vadd.u32 @XMM[4], @XMM[1], @XMM[10]
vadd.u32 @XMM[5], @XMM[2], @XMM[10]
vadd.u32 @XMM[6], @XMM[3], @XMM[10]
vadd.u32 @XMM[7], @XMM[4], @XMM[10]
vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
@ to flip byte order in 32-bit counter
vldmia $keysched, {@XMM[9]} @ load round0 key
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, $keysched, #0x10 @ pass next round key
#else
add r4, $key, #`248+16`
#endif
vldmia $ctr, {@XMM[8]} @ .LREVM0SR
mov r5, $rounds @ pass rounds
vstmia $fp, {@XMM[10]} @ save next counter
#ifdef __APPLE__
mov $const, #:lower16:(.LREVM0SR-.LSR)
sub $const, $ctr, $const
#else
sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
#endif
bl _bsaes_encrypt8_alt
subs $len, $len, #8
blo .Lctr_enc_loop_done
vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
veor @XMM[0], @XMM[8]
veor @XMM[1], @XMM[9]
vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
veor @XMM[4], @XMM[10]
veor @XMM[6], @XMM[11]
vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
veor @XMM[3], @XMM[12]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
veor @XMM[7], @XMM[13]
veor @XMM[2], @XMM[14]
vst1.8 {@XMM[4]}, [$out]!
veor @XMM[5], @XMM[15]
vst1.8 {@XMM[6]}, [$out]!
vmov.i32 @XMM[8], #1 @ compose 1<<96
vst1.8 {@XMM[3]}, [$out]!
veor @XMM[9], @XMM[9], @XMM[9]
vst1.8 {@XMM[7]}, [$out]!
vext.8 @XMM[8], @XMM[9], @XMM[8], #4
vst1.8 {@XMM[2]}, [$out]!
vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
vst1.8 {@XMM[5]}, [$out]!
vldmia $fp, {@XMM[0]} @ load counter
bne .Lctr_enc_loop
b .Lctr_enc_done
.align 4
.Lctr_enc_loop_done:
add $len, $len, #8
vld1.8 {@XMM[8]}, [$inp]! @ load input
veor @XMM[0], @XMM[8]
vst1.8 {@XMM[0]}, [$out]! @ write output
cmp $len, #2
blo .Lctr_enc_done
vld1.8 {@XMM[9]}, [$inp]!
veor @XMM[1], @XMM[9]
vst1.8 {@XMM[1]}, [$out]!
beq .Lctr_enc_done
vld1.8 {@XMM[10]}, [$inp]!
veor @XMM[4], @XMM[10]
vst1.8 {@XMM[4]}, [$out]!
cmp $len, #4
blo .Lctr_enc_done
vld1.8 {@XMM[11]}, [$inp]!
veor @XMM[6], @XMM[11]
vst1.8 {@XMM[6]}, [$out]!
beq .Lctr_enc_done
vld1.8 {@XMM[12]}, [$inp]!
veor @XMM[3], @XMM[12]
vst1.8 {@XMM[3]}, [$out]!
cmp $len, #6
blo .Lctr_enc_done
vld1.8 {@XMM[13]}, [$inp]!
veor @XMM[7], @XMM[13]
vst1.8 {@XMM[7]}, [$out]!
beq .Lctr_enc_done
vld1.8 {@XMM[14]}, [$inp]
veor @XMM[2], @XMM[14]
vst1.8 {@XMM[2]}, [$out]!
.Lctr_enc_done:
vmov.i32 q0, #0
vmov.i32 q1, #0
#ifndef BSAES_ASM_EXTENDED_KEY
.Lctr_enc_bzero: @ wipe key schedule [if any]
vstmia $keysched!, {q0-q1}
cmp $keysched, $fp
bne .Lctr_enc_bzero
#else
vstmia $keysched, {q0-q1}
#endif
mov sp, $fp
add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
VFP_ABI_POP
ldmia sp!, {r4-r10, pc} @ return
.align 4
.Lctr_enc_short:
ldr ip, [sp] @ ctr pointer is passed on stack
stmdb sp!, {r4-r8, lr}
mov r4, $inp @ copy arguments
mov r5, $out
mov r6, $len
mov r7, $key
ldr r8, [ip, #12] @ load counter LSW
vld1.8 {@XMM[1]}, [ip] @ load whole counter value
#ifdef __ARMEL__
rev r8, r8
#endif
sub sp, sp, #0x10
vst1.8 {@XMM[1]}, [sp] @ copy counter value
sub sp, sp, #0x10
.Lctr_enc_short_loop:
add r0, sp, #0x10 @ input counter value
mov r1, sp @ output on the stack
mov r2, r7 @ key
bl AES_encrypt
vld1.8 {@XMM[0]}, [r4]! @ load input
vld1.8 {@XMM[1]}, [sp] @ load encrypted counter
add r8, r8, #1
#ifdef __ARMEL__
rev r0, r8
str r0, [sp, #0x1c] @ next counter value
#else
str r8, [sp, #0x1c] @ next counter value
#endif
veor @XMM[0],@XMM[0],@XMM[1]
vst1.8 {@XMM[0]}, [r5]! @ store output
subs r6, r6, #1
bne .Lctr_enc_short_loop
vmov.i32 q0, #0
vmov.i32 q1, #0
vstmia sp!, {q0-q1}
ldmia sp!, {r4-r8, pc}
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
}
{
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
# const AES_KEY *key1, const AES_KEY *key2,
# const unsigned char iv[16]);
#
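# key1 drives the bulk bit-sliced rounds, while key2 is used only to encrypt
# iv[] into the initial tweak (the AES_encrypt call below). If
# XTS_CHAIN_TWEAK is defined, a pointer to a caller-maintained tweak is taken
# from the stack instead and the running tweak is written back on return.
#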
my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
my $const="r6"; # returned by _bsaes_key_convert
my $twmask=@XMM[5];
my @T=@XMM[6..7];
$code.=<<___;
.globl bsaes_xts_encrypt
.hidden bsaes_xts_encrypt
.type bsaes_xts_encrypt,%function
.align 4
bsaes_xts_encrypt:
mov ip, sp
stmdb sp!, {r4-r10, lr} @ 0x20
VFP_ABI_PUSH
mov r6, sp @ future $fp
mov $inp, r0
mov $out, r1
mov $len, r2
mov $key, r3
sub r0, sp, #0x10 @ 0x10
bic r0, #0xf @ align at 16 bytes
mov sp, r0
#ifdef XTS_CHAIN_TWEAK
ldr r0, [ip] @ pointer to input tweak
#else
@ generate initial tweak
ldr r0, [ip, #4] @ iv[]
mov r1, sp
ldr r2, [ip, #0] @ key2
bl AES_encrypt
mov r0,sp @ pointer to initial tweak
#endif
ldr $rounds, [$key, #240] @ get # of rounds
mov $fp, r6
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
@ add r12, #`128-32` @ size of bit-sliced key schedule
sub r12, #`32+16` @ place for tweak[9]
@ populate the key schedule
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
mov sp, r12
add r12, #0x90 @ pass key schedule
bl _bsaes_key_convert
veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
vstmia r12, {@XMM[7]} @ save last round key
#else
ldr r12, [$key, #244]
eors r12, #1
beq 0f
str r12, [$key, #244]
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
add r12, $key, #248 @ pass key schedule
bl _bsaes_key_convert
veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
vstmia r12, {@XMM[7]}
.align 2
0: sub sp, #0x90 @ place for tweak[9]
#endif
vld1.8 {@XMM[8]}, [r0] @ initial tweak
adr $magic, .Lxts_magic
subs $len, #0x80
blo .Lxts_enc_short
b .Lxts_enc_loop
.align 4
.Lxts_enc_loop:
vldmia $magic, {$twmask} @ load XTS magic
vshr.s64 @T[0], @XMM[8], #63
mov r0, sp
vand @T[0], @T[0], $twmask
___
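# The unrolled loop below derives tweak[1..7] from tweak[0] in @XMM[8]: each
# step doubles the previous tweak and folds the 64-bit carries back in with
# the {1, 0x87} mask from .Lxts_magic, i.e. multiplies it by x in GF(2^128).
# The tweaks are parked at [sp] for use after _bsaes_encrypt8, while the
# corresponding input blocks are loaded and pre-XORed with their tweaks.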
for($i=9;$i<16;$i++) {
$code.=<<___;
vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
vst1.64 {@XMM[$i-1]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
vshr.s64 @T[1], @XMM[$i], #63
veor @XMM[$i], @XMM[$i], @T[0]
vand @T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
vld1.8 {@XMM[$i-10]}, [$inp]!
___
$code.=<<___ if ($i>=11);
veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
vadd.u64 @XMM[8], @XMM[15], @XMM[15]
vst1.64 {@XMM[15]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
veor @XMM[8], @XMM[8], @T[0]
vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[6], @XMM[6], @XMM[14]
mov r5, $rounds @ pass rounds
veor @XMM[7], @XMM[7], @XMM[15]
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[6], @XMM[11]
vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
veor @XMM[10], @XMM[3], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
veor @XMM[12], @XMM[2], @XMM[14]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
veor @XMM[13], @XMM[5], @XMM[15]
vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
subs $len, #0x80
bpl .Lxts_enc_loop
.Lxts_enc_short:
adds $len, #0x70
bmi .Lxts_enc_done
vldmia $magic, {$twmask} @ load XTS magic
vshr.s64 @T[0], @XMM[8], #63
mov r0, sp
vand @T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
vst1.64 {@XMM[$i-1]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
vshr.s64 @T[1], @XMM[$i], #63
veor @XMM[$i], @XMM[$i], @T[0]
vand @T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
vld1.8 {@XMM[$i-10]}, [$inp]!
subs $len, #0x10
bmi .Lxts_enc_`$i-9`
___
$code.=<<___ if ($i>=11);
veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
sub $len, #0x10
vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
vld1.8 {@XMM[6]}, [$inp]!
veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[6], @XMM[6], @XMM[14]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[6], @XMM[11]
vld1.64 {@XMM[14]}, [r0,:128]!
veor @XMM[10], @XMM[3], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
veor @XMM[12], @XMM[2], @XMM[14]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
vst1.8 {@XMM[12]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_6:
vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
veor @XMM[4], @XMM[4], @XMM[12]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[5], @XMM[5], @XMM[13]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[6], @XMM[11]
veor @XMM[10], @XMM[3], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
@ put this in range for both ARM and Thumb mode adr instructions
.align 5
.Lxts_magic:
.quad 1, 0x87
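@ {1, 0x87} are the carry masks used when multiplying a tweak by x in
@ GF(2^128): 0x87 is the feedback for the x^128 + x^7 + x^2 + x + 1
@ reduction polynomial, 1 propagates the carry between the 64-bit halves.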
.align 5
.Lxts_enc_5:
vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
veor @XMM[3], @XMM[3], @XMM[11]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[4], @XMM[4], @XMM[12]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[6], @XMM[11]
veor @XMM[10], @XMM[3], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
vst1.8 {@XMM[10]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_4:
vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
veor @XMM[2], @XMM[2], @XMM[10]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[3], @XMM[3], @XMM[11]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[6], @XMM[11]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_3:
vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
veor @XMM[1], @XMM[1], @XMM[9]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[2], @XMM[2], @XMM[10]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
vld1.64 {@XMM[10]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[4], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
vst1.8 {@XMM[8]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_2:
vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
veor @XMM[0], @XMM[0], @XMM[8]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[1], @XMM[1], @XMM[9]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_encrypt8
vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
.align 4
.Lxts_enc_1:
mov r0, sp
veor @XMM[0], @XMM[8]
mov r1, sp
vst1.8 {@XMM[0]}, [sp,:128]
mov r2, $key
mov r4, $fp @ preserve fp
bl AES_encrypt
vld1.8 {@XMM[0]}, [sp,:128]
veor @XMM[0], @XMM[0], @XMM[8]
vst1.8 {@XMM[0]}, [$out]!
mov $fp, r4
vmov @XMM[8], @XMM[9] @ next round tweak
.Lxts_enc_done:
#ifndef XTS_CHAIN_TWEAK
adds $len, #0x10
beq .Lxts_enc_ret
sub r6, $out, #0x10
.Lxts_enc_steal:
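@ Ciphertext stealing: copy the trailing plaintext bytes over the start of
@ the last full ciphertext block (r6 = $out-0x10) while the displaced
@ ciphertext bytes become the output tail, then re-encrypt that block with
@ the final tweak in @XMM[8].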
ldrb r0, [$inp], #1
ldrb r1, [$out, #-0x10]
strb r0, [$out, #-0x10]
strb r1, [$out], #1
subs $len, #1
bhi .Lxts_enc_steal
vld1.8 {@XMM[0]}, [r6]
mov r0, sp
veor @XMM[0], @XMM[0], @XMM[8]
mov r1, sp
vst1.8 {@XMM[0]}, [sp,:128]
mov r2, $key
mov r4, $fp @ preserve fp
bl AES_encrypt
vld1.8 {@XMM[0]}, [sp,:128]
veor @XMM[0], @XMM[0], @XMM[8]
vst1.8 {@XMM[0]}, [r6]
mov $fp, r4
#endif
.Lxts_enc_ret:
bic r0, $fp, #0xf
vmov.i32 q0, #0
vmov.i32 q1, #0
#ifdef XTS_CHAIN_TWEAK
ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
#endif
.Lxts_enc_bzero: @ wipe key schedule [if any]
vstmia sp!, {q0-q1}
cmp sp, r0
bne .Lxts_enc_bzero
mov sp, $fp
#ifdef XTS_CHAIN_TWEAK
vst1.8 {@XMM[8]}, [r1]
#endif
VFP_ABI_POP
ldmia sp!, {r4-r10, pc} @ return
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
.globl bsaes_xts_decrypt
.hidden bsaes_xts_decrypt
.type bsaes_xts_decrypt,%function
.align 4
bsaes_xts_decrypt:
mov ip, sp
stmdb sp!, {r4-r10, lr} @ 0x20
VFP_ABI_PUSH
mov r6, sp @ future $fp
mov $inp, r0
mov $out, r1
mov $len, r2
mov $key, r3
sub r0, sp, #0x10 @ 0x10
bic r0, #0xf @ align at 16 bytes
mov sp, r0
#ifdef XTS_CHAIN_TWEAK
ldr r0, [ip] @ pointer to input tweak
#else
@ generate initial tweak
ldr r0, [ip, #4] @ iv[]
mov r1, sp
ldr r2, [ip, #0] @ key2
bl AES_encrypt
mov r0, sp @ pointer to initial tweak
#endif
ldr $rounds, [$key, #240] @ get # of rounds
mov $fp, r6
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
@ add r12, #`128-32` @ size of bit-sliced key schedule
sub r12, #`32+16` @ place for tweak[9]
@ populate the key schedule
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
mov sp, r12
add r12, #0x90 @ pass key schedule
bl _bsaes_key_convert
add r4, sp, #0x90
vldmia r4, {@XMM[6]}
vstmia r12, {@XMM[15]} @ save last round key
veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
vstmia r4, {@XMM[7]}
#else
ldr r12, [$key, #244]
eors r12, #1
beq 0f
str r12, [$key, #244]
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
add r12, $key, #248 @ pass key schedule
bl _bsaes_key_convert
add r4, $key, #248
vldmia r4, {@XMM[6]}
vstmia r12, {@XMM[15]} @ save last round key
veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
vstmia r4, {@XMM[7]}
.align 2
0: sub sp, #0x90 @ place for tweak[9]
#endif
vld1.8 {@XMM[8]}, [r0] @ initial tweak
adr $magic, .Lxts_magic
#ifndef XTS_CHAIN_TWEAK
tst $len, #0xf @ if not multiple of 16
it ne @ Thumb2 thing, sanity check in ARM
subne $len, #0x10 @ subtract another 16 bytes
#endif
subs $len, #0x80
blo .Lxts_dec_short
b .Lxts_dec_loop
.align 4
.Lxts_dec_loop:
vldmia $magic, {$twmask} @ load XTS magic
vshr.s64 @T[0], @XMM[8], #63
mov r0, sp
vand @T[0], @T[0], $twmask
___
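# Tweak generation for the decryption path mirrors the encryption loop above:
# seven multiply-by-x steps, with the tweaks stacked at [sp] for
# _bsaes_decrypt8.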
for($i=9;$i<16;$i++) {
$code.=<<___;
vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
vst1.64 {@XMM[$i-1]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
vshr.s64 @T[1], @XMM[$i], #63
veor @XMM[$i], @XMM[$i], @T[0]
vand @T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
vld1.8 {@XMM[$i-10]}, [$inp]!
___
$code.=<<___ if ($i>=11);
veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
vadd.u64 @XMM[8], @XMM[15], @XMM[15]
vst1.64 {@XMM[15]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
veor @XMM[8], @XMM[8], @T[0]
vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[6], @XMM[6], @XMM[14]
mov r5, $rounds @ pass rounds
veor @XMM[7], @XMM[7], @XMM[15]
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[4], @XMM[11]
vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
veor @XMM[10], @XMM[2], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
veor @XMM[12], @XMM[3], @XMM[14]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
veor @XMM[13], @XMM[5], @XMM[15]
vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
subs $len, #0x80
bpl .Lxts_dec_loop
.Lxts_dec_short:
adds $len, #0x70
bmi .Lxts_dec_done
vldmia $magic, {$twmask} @ load XTS magic
vshr.s64 @T[0], @XMM[8], #63
mov r0, sp
vand @T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
vst1.64 {@XMM[$i-1]}, [r0,:128]!
vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
vshr.s64 @T[1], @XMM[$i], #63
veor @XMM[$i], @XMM[$i], @T[0]
vand @T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
vld1.8 {@XMM[$i-10]}, [$inp]!
subs $len, #0x10
bmi .Lxts_dec_`$i-9`
___
$code.=<<___ if ($i>=11);
veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
sub $len, #0x10
vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
vld1.8 {@XMM[6]}, [$inp]!
veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[6], @XMM[6], @XMM[14]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[4], @XMM[11]
vld1.64 {@XMM[14]}, [r0,:128]!
veor @XMM[10], @XMM[2], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
veor @XMM[12], @XMM[3], @XMM[14]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
vst1.8 {@XMM[12]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_6:
vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
veor @XMM[4], @XMM[4], @XMM[12]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[5], @XMM[5], @XMM[13]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[4], @XMM[11]
veor @XMM[10], @XMM[2], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
veor @XMM[11], @XMM[7], @XMM[13]
vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_5:
vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
veor @XMM[3], @XMM[3], @XMM[11]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[4], @XMM[4], @XMM[12]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
vld1.64 {@XMM[12]}, [r0,:128]!
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[4], @XMM[11]
veor @XMM[10], @XMM[2], @XMM[12]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
vst1.8 {@XMM[10]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_4:
vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
veor @XMM[2], @XMM[2], @XMM[10]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[3], @XMM[3], @XMM[11]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
veor @XMM[9], @XMM[4], @XMM[11]
vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_3:
vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
veor @XMM[1], @XMM[1], @XMM[9]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[2], @XMM[2], @XMM[10]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
vld1.64 {@XMM[10]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
veor @XMM[8], @XMM[6], @XMM[10]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
vst1.8 {@XMM[8]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_2:
vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
veor @XMM[0], @XMM[0], @XMM[8]
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
#else
add r4, $key, #248 @ pass key schedule
#endif
veor @XMM[1], @XMM[1], @XMM[9]
mov r5, $rounds @ pass rounds
mov r0, sp
bl _bsaes_decrypt8
vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
veor @XMM[0], @XMM[0], @XMM[ 8]
veor @XMM[1], @XMM[1], @XMM[ 9]
vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_dec_done
.align 4
.Lxts_dec_1:
mov r0, sp
veor @XMM[0], @XMM[8]
mov r1, sp
vst1.8 {@XMM[0]}, [sp,:128]
mov r2, $key
mov r4, $fp @ preserve fp
mov r5, $magic @ preserve magic
bl AES_decrypt
vld1.8 {@XMM[0]}, [sp,:128]
veor @XMM[0], @XMM[0], @XMM[8]
vst1.8 {@XMM[0]}, [$out]!
mov $fp, r4
mov $magic, r5
vmov @XMM[8], @XMM[9] @ next round tweak
.Lxts_dec_done:
#ifndef XTS_CHAIN_TWEAK
adds $len, #0x10
beq .Lxts_dec_ret
@ calculate one round of extra tweak for the stolen ciphertext
vldmia $magic, {$twmask}
vshr.s64 @XMM[6], @XMM[8], #63
vand @XMM[6], @XMM[6], $twmask
vadd.u64 @XMM[9], @XMM[8], @XMM[8]
vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
veor @XMM[9], @XMM[9], @XMM[6]
@ perform the final decryption with the last tweak value
vld1.8 {@XMM[0]}, [$inp]!
mov r0, sp
veor @XMM[0], @XMM[0], @XMM[9]
mov r1, sp
vst1.8 {@XMM[0]}, [sp,:128]
mov r2, $key
mov r4, $fp @ preserve fp
bl AES_decrypt
vld1.8 {@XMM[0]}, [sp,:128]
veor @XMM[0], @XMM[0], @XMM[9]
vst1.8 {@XMM[0]}, [$out]
mov r6, $out
.Lxts_dec_steal:
ldrb r1, [$out]
ldrb r0, [$inp], #1
strb r1, [$out, #0x10]
strb r0, [$out], #1
subs $len, #1
bhi .Lxts_dec_steal
vld1.8 {@XMM[0]}, [r6]
mov r0, sp
veor @XMM[0], @XMM[8]
mov r1, sp
vst1.8 {@XMM[0]}, [sp,:128]
mov r2, $key
bl AES_decrypt
vld1.8 {@XMM[0]}, [sp,:128]
veor @XMM[0], @XMM[0], @XMM[8]
vst1.8 {@XMM[0]}, [r6]
mov $fp, r4
#endif
.Lxts_dec_ret:
bic r0, $fp, #0xf
vmov.i32 q0, #0
vmov.i32 q1, #0
#ifdef XTS_CHAIN_TWEAK
ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
#endif
.Lxts_dec_bzero: @ wipe key schedule [if any]
vstmia sp!, {q0-q1}
cmp sp, r0
bne .Lxts_dec_bzero
mov sp, $fp
#ifdef XTS_CHAIN_TWEAK
vst1.8 {@XMM[8]}, [r1]
#endif
VFP_ABI_POP
ldmia sp!, {r4-r10, pc} @ return
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
#endif
#endif
___
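# Post-processing: evaluate the `...` arithmetic embedded in the generated
# text, replay this script's own leading '#' comment lines into the output
# with '#' rewritten to '@' (assembler comments), then emit the code.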
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/@/ and !/^$/);
print;
}
close SELF;
print $code;
close STDOUT;