1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the Apache License 2.0 (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # December 2015
|
---|
18 | #
|
---|
19 | # ChaCha20 for s390x.
|
---|
20 | #
|
---|
21 | # 3 times faster than compiler-generated code.
|
---|
22 |
|
---|
23 | #
|
---|
24 | # August 2018
|
---|
25 | #
|
---|
26 | # Add vx code path: 4x"vertical".
|
---|
27 | #
|
---|
28 | # Copyright IBM Corp. 2018
|
---|
29 | # Author: Patrick Steuer <patrick.steuer@de.ibm.com>
|
---|
30 |
|
---|
31 | #
|
---|
32 | # February 2019
|
---|
33 | #
|
---|
34 | # Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
|
---|
35 | # 4x"vertical" submission [on z13] and >3 faster than scalar code.
|
---|
36 | # But to harness overheads revert to transliteration of VSX code path
|
---|
37 | # from chacha-ppc module, which is also 4x"vertical", to handle inputs
|
---|
38 | # not longer than 256 bytes.
|
---|
39 |
|
---|
40 | use strict;
|
---|
41 | use FindBin qw($Bin);
|
---|
42 | use lib "$Bin/../..";
|
---|
43 | use perlasm::s390x qw(:DEFAULT :VX :EI AUTOLOAD LABEL INCLUDE);
|
---|
44 |
|
---|
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Select ABI-dependent parameters: a "31"/"32" flavour means the 31-bit
# S/390 ABI (4-byte pointers), anything else the 64-bit zSeries ABI.
# $z gates every instruction choice below (e.g. lgr vs lr, stmg vs stm).
my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
	$z=0;	# S/390 ABI
	$SIZE_T=4;
} else {
	$z=1;	# zSeries ABI
	$SIZE_T=8;
}

my $sp="%r15";
# Minimal stack frame: 16 pointer-sized save slots plus 4*8 bytes;
# code below places its scratch area at offset $stdframe from %r15.
my $stdframe=16*$SIZE_T+4*8;
# Emit one scalar (general-purpose-register) double round of ChaCha20,
# i.e. four quarter-rounds processed two at a time.  Arguments are the
# state-word indices (a,b,c,d) of the first quarter-round; indices for
# the other three are derived by rotating the low two bits, matching
# the reference column/diagonal schedule.
sub ROUND {
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("$_",@t);

	# Consider order in which variables are addressed by their
	# index:
	#
	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# 'c' stores and loads in the middle, but none in the beginning
	# or end.

	alr	(@x[$a0],@x[$b0]);	# Q1:  a += b
	alr	(@x[$a1],@x[$b1]);	# Q2
	xr	(@x[$d0],@x[$a0]);	#      d ^= a
	xr	(@x[$d1],@x[$a1]);
	rll	(@x[$d0],@x[$d0],16);	#      d <<<= 16
	rll	(@x[$d1],@x[$d1],16);

	alr	($xc,@x[$d0]);		#      c += d
	alr	($xc_,@x[$d1]);
	xr	(@x[$b0],$xc);		#      b ^= c
	xr	(@x[$b1],$xc_);
	rll	(@x[$b0],@x[$b0],12);	#      b <<<= 12
	rll	(@x[$b1],@x[$b1],12);

	alr	(@x[$a0],@x[$b0]);
	alr	(@x[$a1],@x[$b1]);
	xr	(@x[$d0],@x[$a0]);
	xr	(@x[$d1],@x[$a1]);
	rll	(@x[$d0],@x[$d0],8);
	rll	(@x[$d1],@x[$d1],8);

	alr	($xc,@x[$d0]);
	alr	($xc_,@x[$d1]);
	xr	(@x[$b0],$xc);
	xr	(@x[$b1],$xc_);
	rll	(@x[$b0],@x[$b0],7);
	rll	(@x[$b1],@x[$b1],7);

	stm	($xc,$xc_,"$stdframe+4*8+4*$c0($sp)");	# reload pair of 'c's
	lm	($xc,$xc_,"$stdframe+4*8+4*$c2($sp)");

	alr	(@x[$a2],@x[$b2]);	# Q3
	alr	(@x[$a3],@x[$b3]);	# Q4
	xr	(@x[$d2],@x[$a2]);
	xr	(@x[$d3],@x[$a3]);
	rll	(@x[$d2],@x[$d2],16);
	rll	(@x[$d3],@x[$d3],16);

	alr	($xc,@x[$d2]);
	alr	($xc_,@x[$d3]);
	xr	(@x[$b2],$xc);
	xr	(@x[$b3],$xc_);
	rll	(@x[$b2],@x[$b2],12);
	rll	(@x[$b3],@x[$b3],12);

	alr	(@x[$a2],@x[$b2]);
	alr	(@x[$a3],@x[$b3]);
	xr	(@x[$d2],@x[$a2]);
	xr	(@x[$d3],@x[$a3]);
	rll	(@x[$d2],@x[$d2],8);
	rll	(@x[$d3],@x[$d3],8);

	alr	($xc,@x[$d2]);
	alr	($xc_,@x[$d3]);
	xr	(@x[$b2],$xc);
	xr	(@x[$b3],$xc_);
	rll	(@x[$b2],@x[$b2],7);
	rll	(@x[$b3],@x[$b3],7);
}
152 |
|
---|
# Emit one vector double round for the 4x"vertical" code path: each of
# vector registers %v0-%v15 holds the same state word for four parallel
# blocks, so the four quarter-rounds below operate lane-wise.  Arguments
# are the state-word indices of the first quarter-round; the rest are
# derived the same way as in scalar ROUND above.
sub VX_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("%v$_",(0..15));

	vaf	(@x[$a0],@x[$a0],@x[$b0]);	# Q1:  a += b
	vx	(@x[$d0],@x[$d0],@x[$a0]);	#      d ^= a
	verllf	(@x[$d0],@x[$d0],16);		#      d <<<= 16
	vaf	(@x[$a1],@x[$a1],@x[$b1]);	# Q2
	vx	(@x[$d1],@x[$d1],@x[$a1]);
	verllf	(@x[$d1],@x[$d1],16);
	vaf	(@x[$a2],@x[$a2],@x[$b2]);	# Q3
	vx	(@x[$d2],@x[$d2],@x[$a2]);
	verllf	(@x[$d2],@x[$d2],16);
	vaf	(@x[$a3],@x[$a3],@x[$b3]);	# Q4
	vx	(@x[$d3],@x[$d3],@x[$a3]);
	verllf	(@x[$d3],@x[$d3],16);

	vaf	(@x[$c0],@x[$c0],@x[$d0]);	#      c += d
	vx	(@x[$b0],@x[$b0],@x[$c0]);	#      b ^= c
	verllf	(@x[$b0],@x[$b0],12);		#      b <<<= 12
	vaf	(@x[$c1],@x[$c1],@x[$d1]);
	vx	(@x[$b1],@x[$b1],@x[$c1]);
	verllf	(@x[$b1],@x[$b1],12);
	vaf	(@x[$c2],@x[$c2],@x[$d2]);
	vx	(@x[$b2],@x[$b2],@x[$c2]);
	verllf	(@x[$b2],@x[$b2],12);
	vaf	(@x[$c3],@x[$c3],@x[$d3]);
	vx	(@x[$b3],@x[$b3],@x[$c3]);
	verllf	(@x[$b3],@x[$b3],12);

	vaf	(@x[$a0],@x[$a0],@x[$b0]);
	vx	(@x[$d0],@x[$d0],@x[$a0]);
	verllf	(@x[$d0],@x[$d0],8);
	vaf	(@x[$a1],@x[$a1],@x[$b1]);
	vx	(@x[$d1],@x[$d1],@x[$a1]);
	verllf	(@x[$d1],@x[$d1],8);
	vaf	(@x[$a2],@x[$a2],@x[$b2]);
	vx	(@x[$d2],@x[$d2],@x[$a2]);
	verllf	(@x[$d2],@x[$d2],8);
	vaf	(@x[$a3],@x[$a3],@x[$b3]);
	vx	(@x[$d3],@x[$d3],@x[$a3]);
	verllf	(@x[$d3],@x[$d3],8);

	vaf	(@x[$c0],@x[$c0],@x[$d0]);
	vx	(@x[$b0],@x[$b0],@x[$c0]);
	verllf	(@x[$b0],@x[$b0],7);
	vaf	(@x[$c1],@x[$c1],@x[$d1]);
	vx	(@x[$b1],@x[$b1],@x[$c1]);
	verllf	(@x[$b1],@x[$b1],7);
	vaf	(@x[$c2],@x[$c2],@x[$d2]);
	vx	(@x[$b2],@x[$b2],@x[$c2]);
	verllf	(@x[$b2],@x[$b2],7);
	vaf	(@x[$c3],@x[$c3],@x[$d3]);
	vx	(@x[$b3],@x[$b3],@x[$c3]);
	verllf	(@x[$b3],@x[$b3],7);
}
212 |
|
---|
# Emit one round for the 6x"horizontal" code path: six whole ChaCha20
# blocks live in 24 vector registers, one block per (a,b,c,d) register
# quartet.  After the quarter-round arithmetic, the b/c/d registers of
# each block are rotated by element with vsldb so that the next call
# (with $odd flipped) processes the diagonals; $odd selects between the
# column->diagonal and diagonal->column shuffles.
sub VX_ROUND {
my @a=@_[0..5];
my @b=@_[6..11];
my @c=@_[12..17];
my @d=@_[18..23];
my $odd=@_[24];

	vaf	(@a[$_],@a[$_],@b[$_]) for (0..5);	# a += b
	vx	(@d[$_],@d[$_],@a[$_]) for (0..5);	# d ^= a
	verllf	(@d[$_],@d[$_],16) for (0..5);		# d <<<= 16

	vaf	(@c[$_],@c[$_],@d[$_]) for (0..5);	# c += d
	vx	(@b[$_],@b[$_],@c[$_]) for (0..5);	# b ^= c
	verllf	(@b[$_],@b[$_],12) for (0..5);		# b <<<= 12

	vaf	(@a[$_],@a[$_],@b[$_]) for (0..5);
	vx	(@d[$_],@d[$_],@a[$_]) for (0..5);
	verllf	(@d[$_],@d[$_],8) for (0..5);

	vaf	(@c[$_],@c[$_],@d[$_]) for (0..5);
	vx	(@b[$_],@b[$_],@c[$_]) for (0..5);
	verllf	(@b[$_],@b[$_],7) for (0..5);

	# Rotate rows for the next (column <-> diagonal) round: 'c' always
	# moves by two words (8 bytes); 'b' and 'd' swap their 1-word/3-word
	# rotations depending on round parity.
	vsldb	(@c[$_],@c[$_],@c[$_],8) for (0..5);
	vsldb	(@b[$_],@b[$_],@b[$_],$odd?12:4) for (0..5);
	vsldb	(@d[$_],@d[$_],@d[$_],$odd?4:12) for (0..5);
}
240 |
|
---|
PERLASM_BEGIN($output);

INCLUDE	("s390x_arch.h");
TEXT	();

################
# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
#                     const unsigned int key[8], const unsigned int counter[4])
my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
{
my $frame=$stdframe+4*20;
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));

GLOBL	("ChaCha20_ctr32");
TYPE	("ChaCha20_ctr32","\@function");
ALIGN	(32);
LABEL	("ChaCha20_ctr32");
	larl	("%r1","OPENSSL_s390xcap_P");

	lghi	("%r0",64);
	# NOTE: mis-encoded "\<gr:\<r" restored to load-and-test (ltgr/ltr)
	&{$z?	\&ltgr:\&ltr}	($len,$len);	# len==0?
	bzr	("%r14");
	lg	("%r1","S390X_STFLE+16(%r1)");
	&{$z?	\&clgr:\&clr}	($len,"%r0");
	jle	(".Lshort");

	tmhh	("%r1",0x4000);	# check for vx bit
	jnz	(".LChaCha20_ctr32_vx");

LABEL	(".Lshort");
	&{$z?	\&aghi:\&ahi}	($len,-64);
	&{$z?	\&lghi:\&lhi}	("%r1",-$frame);
	&{$z?	\&stmg:\&stm}	("%r6","%r15","6*$SIZE_T($sp)");
	&{$z?	\&slgr:\&slr}	($out,$inp);	# difference
	la	($len,"0($inp,$len)");	# end of input minus 64
	larl	("%r7",".Lsigma");
	lgr	("%r0",$sp);
	la	($sp,"0(%r1,$sp)");
	&{$z?	\&stg:\&st}	("%r0","0($sp)");

	lmg	("%r8","%r11","0($key)");	# load key
	lmg	("%r12","%r13","0($counter)");	# load counter
	lmg	("%r6","%r7","0(%r7)");	# load sigma constant

	la	("%r14","0($inp)");
	&{$z?	\&stg:\&st}	($out,"$frame+3*$SIZE_T($sp)");
	&{$z?	\&stg:\&st}	($len,"$frame+4*$SIZE_T($sp)");
	stmg	("%r6","%r13","$stdframe($sp)");# copy key schedule to stack
	srlg	(@x[12],"%r12",32);	# 32-bit counter value
	j	(".Loop_outer");

ALIGN	(16);
LABEL	(".Loop_outer");
	lm	(@x[0],@x[7],"$stdframe+4*0($sp)");	# load x[0]-x[7]
	lm	(@t[0],@t[1],"$stdframe+4*10($sp)");	# load x[10]-x[11]
	lm	(@x[13],@x[15],"$stdframe+4*13($sp)");	# load x[13]-x[15]
	stm	(@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11]
	lm	(@t[0],@t[1],"$stdframe+4*8($sp)");	# load x[8]-x[9]
	st	(@x[12],"$stdframe+4*12($sp)");	# save counter
	&{$z?	\&stg:\&st}	("%r14","$frame+2*$SIZE_T($sp)");# save input pointer
	lhi	("%r14",10);
	j	(".Loop");

ALIGN	(4);
LABEL	(".Loop");
	ROUND	(0, 4, 8,12);
	ROUND	(0, 5,10,15);
	brct	("%r14",".Loop");

	&{$z?	\&lg:\&l}	("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer
	stm	(@t[0],@t[1],"$stdframe+4*8+4*8($sp)");	# offload x[8]-x[9]
	&{$z?	\&lmg:\&lm}	(@t[0],@t[1],"$frame+3*$SIZE_T($sp)");

	# Accumulate the key schedule into the state and byte-swap each
	# word (lrvr) to produce the little-endian key-stream words.
	al	(@x[0],"$stdframe+4*0($sp)");	# accumulate key schedule
	al	(@x[1],"$stdframe+4*1($sp)");
	al	(@x[2],"$stdframe+4*2($sp)");
	al	(@x[3],"$stdframe+4*3($sp)");
	al	(@x[4],"$stdframe+4*4($sp)");
	al	(@x[5],"$stdframe+4*5($sp)");
	al	(@x[6],"$stdframe+4*6($sp)");
	al	(@x[7],"$stdframe+4*7($sp)");
	lrvr	(@x[0],@x[0]);
	lrvr	(@x[1],@x[1]);
	lrvr	(@x[2],@x[2]);
	lrvr	(@x[3],@x[3]);
	lrvr	(@x[4],@x[4]);
	lrvr	(@x[5],@x[5]);
	lrvr	(@x[6],@x[6]);
	lrvr	(@x[7],@x[7]);
	al	(@x[12],"$stdframe+4*12($sp)");
	al	(@x[13],"$stdframe+4*13($sp)");
	al	(@x[14],"$stdframe+4*14($sp)");
	al	(@x[15],"$stdframe+4*15($sp)");
	lrvr	(@x[12],@x[12]);
	lrvr	(@x[13],@x[13]);
	lrvr	(@x[14],@x[14]);
	lrvr	(@x[15],@x[15]);

	la	(@t[0],"0(@t[0],%r14)");	# reconstruct output pointer
	&{$z?	\&clgr:\&clr}	("%r14",@t[1]);
	jh	(".Ltail");

	x	(@x[0],"4*0(%r14)");	# xor with input
	x	(@x[1],"4*1(%r14)");
	st	(@x[0],"4*0(@t[0])");	# store output
	x	(@x[2],"4*2(%r14)");
	st	(@x[1],"4*1(@t[0])");
	x	(@x[3],"4*3(%r14)");
	st	(@x[2],"4*2(@t[0])");
	x	(@x[4],"4*4(%r14)");
	st	(@x[3],"4*3(@t[0])");
	lm	(@x[0],@x[3],"$stdframe+4*8+4*8($sp)");	# load x[8]-x[11]
	x	(@x[5],"4*5(%r14)");
	st	(@x[4],"4*4(@t[0])");
	x	(@x[6],"4*6(%r14)");
	al	(@x[0],"$stdframe+4*8($sp)");
	st	(@x[5],"4*5(@t[0])");
	x	(@x[7],"4*7(%r14)");
	al	(@x[1],"$stdframe+4*9($sp)");
	st	(@x[6],"4*6(@t[0])");
	x	(@x[12],"4*12(%r14)");
	al	(@x[2],"$stdframe+4*10($sp)");
	st	(@x[7],"4*7(@t[0])");
	x	(@x[13],"4*13(%r14)");
	al	(@x[3],"$stdframe+4*11($sp)");
	st	(@x[12],"4*12(@t[0])");
	x	(@x[14],"4*14(%r14)");
	st	(@x[13],"4*13(@t[0])");
	x	(@x[15],"4*15(%r14)");
	st	(@x[14],"4*14(@t[0])");
	lrvr	(@x[0],@x[0]);
	st	(@x[15],"4*15(@t[0])");
	lrvr	(@x[1],@x[1]);
	lrvr	(@x[2],@x[2]);
	lrvr	(@x[3],@x[3]);
	lhi	(@x[12],1);
	x	(@x[0],"4*8(%r14)");
	al	(@x[12],"$stdframe+4*12($sp)");	# increment counter
	x	(@x[1],"4*9(%r14)");
	st	(@x[0],"4*8(@t[0])");
	x	(@x[2],"4*10(%r14)");
	st	(@x[1],"4*9(@t[0])");
	x	(@x[3],"4*11(%r14)");
	st	(@x[2],"4*10(@t[0])");
	st	(@x[3],"4*11(@t[0])");

	&{$z?	\&clgr:\&clr}	("%r14",@t[1]);	# done yet?
	la	("%r14","64(%r14)");
	jl	(".Loop_outer");

LABEL	(".Ldone");
	xgr	("%r0","%r0");
	xgr	("%r1","%r1");
	xgr	("%r2","%r2");
	xgr	("%r3","%r3");
	stmg	("%r0","%r3","$stdframe+4*4($sp)");	# wipe key copy
	stmg	("%r0","%r3","$stdframe+4*12($sp)");

	&{$z?	\&lmg:\&lm}	("%r6","%r15","$frame+6*$SIZE_T($sp)");
	br	("%r14");

ALIGN	(16);
LABEL	(".Ltail");
	la	(@t[1],"64($t[1])");
	stm	(@x[0],@x[7],"$stdframe+4*0($sp)");
	&{$z?	\&slgr:\&slr}	(@t[1],"%r14");
	lm	(@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
	&{$z?	\&lghi:\&lhi}	(@x[6],0);
	stm	(@x[12],@x[15],"$stdframe+4*12($sp)");
	al	(@x[0],"$stdframe+4*8($sp)");
	al	(@x[1],"$stdframe+4*9($sp)");
	al	(@x[2],"$stdframe+4*10($sp)");
	al	(@x[3],"$stdframe+4*11($sp)");
	lrvr	(@x[0],@x[0]);
	lrvr	(@x[1],@x[1]);
	lrvr	(@x[2],@x[2]);
	lrvr	(@x[3],@x[3]);
	stm	(@x[0],@x[3],"$stdframe+4*8($sp)");

# Byte-at-a-time xor of the final partial block against the key stream
# spilled to the stack above.
LABEL	(".Loop_tail");
	llgc	(@x[4],"0(@x[6],%r14)");
	llgc	(@x[5],"$stdframe(@x[6],$sp)");
	xr	(@x[5],@x[4]);
	stc	(@x[5],"0(@x[6],@t[0])");
	la	(@x[6],"1(@x[6])");
	brct	(@t[1],".Loop_tail");

	j	(".Ldone");
SIZE	("ChaCha20_ctr32",".-ChaCha20_ctr32");
}
432 |
|
---|
433 | ########################################################################
|
---|
434 | # 4x"vertical" layout minimizes amount of instructions, but pipeline
|
---|
435 | # runs underutilized [because of vector instructions' high latency].
|
---|
436 | # On the other hand minimum amount of data it takes to fully utilize
|
---|
437 | # the pipeline is higher, so that effectively, short inputs would be
|
---|
438 | # processed slower. Hence this code path targeting <=256 bytes lengths.
|
---|
439 | #
|
---|
{
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15));
my @K=map("%v$_",(16..19));
my $CTR="%v26";
my ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30));
my $beperm="%v31";
my ($x00,$x10,$x20,$x30)=(0,map("r$_",(8..10)));
my $FRAME=$stdframe+4*16;

ALIGN	(32);
LABEL	("ChaCha20_ctr32_4x");
LABEL	(".LChaCha20_ctr32_4x");
	&{$z?	\&stmg:\&stm}	("%r6","%r7","6*$SIZE_T($sp)");
if (!$z) {
	std	("%f4","16*$SIZE_T+2*8($sp)");
	std	("%f6","16*$SIZE_T+3*8($sp)");
}
	&{$z?	\&lghi:\&lhi}	("%r1",-$FRAME);
	lgr	("%r0",$sp);
	la	($sp,"0(%r1,$sp)");
	&{$z?	\&stg:\&st}	("%r0","0($sp)");	# back-chain
if ($z) {
	std	("%f8","$stdframe+8*0($sp)");
	std	("%f9","$stdframe+8*1($sp)");
	std	("%f10","$stdframe+8*2($sp)");
	std	("%f11","$stdframe+8*3($sp)");
	std	("%f12","$stdframe+8*4($sp)");
	std	("%f13","$stdframe+8*5($sp)");
	std	("%f14","$stdframe+8*6($sp)");
	std	("%f15","$stdframe+8*7($sp)");
}
	larl	("%r7",".Lsigma");
	lhi	("%r0",10);
	lhi	("%r1",0);

	vl	(@K[0],"0(%r7)");	# load sigma
	vl	(@K[1],"0($key)");	# load key
	vl	(@K[2],"16($key)");
	vl	(@K[3],"0($counter)");	# load counter

	vl	($beperm,"0x40(%r7)");
	vl	($xt1,"0x50(%r7)");
	vrepf	($CTR,@K[3],0);
	vlvgf	(@K[3],"%r1",0);	# clear @K[3].word[0]
	vaf	($CTR,$CTR,$xt1);

#LABEL	(".Loop_outer_4x");
	vlm	($xa0,$xa3,"0x60(%r7)");	# load [smashed] sigma

	vrepf	($xb0,@K[1],0);	# smash the key
	vrepf	($xb1,@K[1],1);
	vrepf	($xb2,@K[1],2);
	vrepf	($xb3,@K[1],3);

	vrepf	($xc0,@K[2],0);
	vrepf	($xc1,@K[2],1);
	vrepf	($xc2,@K[2],2);
	vrepf	($xc3,@K[2],3);

	vlr	($xd0,$CTR);
	vrepf	($xd1,@K[3],1);
	vrepf	($xd2,@K[3],2);
	vrepf	($xd3,@K[3],3);

LABEL	(".Loop_4x");
	VX_lane_ROUND(0, 4, 8,12);
	VX_lane_ROUND(0, 5,10,15);
	brct	("%r0",".Loop_4x");

	vaf	($xd0,$xd0,$CTR);

	# Transpose the 4x4 lane layout back into four contiguous
	# 64-byte blocks (merge-high/low + permute-doubleword-immediate).
	vmrhf	($xt0,$xa0,$xa1);	# transpose data
	vmrhf	($xt1,$xa2,$xa3);
	vmrlf	($xt2,$xa0,$xa1);
	vmrlf	($xt3,$xa2,$xa3);
	vpdi	($xa0,$xt0,$xt1,0b0000);
	vpdi	($xa1,$xt0,$xt1,0b0101);
	vpdi	($xa2,$xt2,$xt3,0b0000);
	vpdi	($xa3,$xt2,$xt3,0b0101);

	vmrhf	($xt0,$xb0,$xb1);
	vmrhf	($xt1,$xb2,$xb3);
	vmrlf	($xt2,$xb0,$xb1);
	vmrlf	($xt3,$xb2,$xb3);
	vpdi	($xb0,$xt0,$xt1,0b0000);
	vpdi	($xb1,$xt0,$xt1,0b0101);
	vpdi	($xb2,$xt2,$xt3,0b0000);
	vpdi	($xb3,$xt2,$xt3,0b0101);

	vmrhf	($xt0,$xc0,$xc1);
	vmrhf	($xt1,$xc2,$xc3);
	vmrlf	($xt2,$xc0,$xc1);
	vmrlf	($xt3,$xc2,$xc3);
	vpdi	($xc0,$xt0,$xt1,0b0000);
	vpdi	($xc1,$xt0,$xt1,0b0101);
	vpdi	($xc2,$xt2,$xt3,0b0000);
	vpdi	($xc3,$xt2,$xt3,0b0101);

	vmrhf	($xt0,$xd0,$xd1);
	vmrhf	($xt1,$xd2,$xd3);
	vmrlf	($xt2,$xd0,$xd1);
	vmrlf	($xt3,$xd2,$xd3);
	vpdi	($xd0,$xt0,$xt1,0b0000);
	vpdi	($xd1,$xt0,$xt1,0b0101);
	vpdi	($xd2,$xt2,$xt3,0b0000);
	vpdi	($xd3,$xt2,$xt3,0b0101);

	#vrepif	($xt0,4);
	#vaf	($CTR,$CTR,$xt0);	# next counter value

	vaf	($xa0,$xa0,@K[0]);
	vaf	($xb0,$xb0,@K[1]);
	vaf	($xc0,$xc0,@K[2]);
	vaf	($xd0,$xd0,@K[3]);

	vperm	($xa0,$xa0,$xa0,$beperm);
	vperm	($xb0,$xb0,$xb0,$beperm);
	vperm	($xc0,$xc0,$xc0,$beperm);
	vperm	($xd0,$xd0,$xd0,$beperm);

	#&{$z?	\&clgfi:\&clfi}	($len,0x40);
	#jl	(".Ltail_4x");

	vlm	($xt0,$xt3,"0($inp)");

	vx	($xt0,$xt0,$xa0);
	vx	($xt1,$xt1,$xb0);
	vx	($xt2,$xt2,$xc0);
	vx	($xt3,$xt3,$xd0);

	vstm	($xt0,$xt3,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
	&{$z?	\&aghi:\&ahi}	($len,-0x40);
	#je	(".Ldone_4x");

	vaf	($xa0,$xa1,@K[0]);
	vaf	($xb0,$xb1,@K[1]);
	vaf	($xc0,$xc1,@K[2]);
	vaf	($xd0,$xd1,@K[3]);

	vperm	($xa0,$xa0,$xa0,$beperm);
	vperm	($xb0,$xb0,$xb0,$beperm);
	vperm	($xc0,$xc0,$xc0,$beperm);
	vperm	($xd0,$xd0,$xd0,$beperm);

	&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_4x");

	vlm	($xt0,$xt3,"0($inp)");

	vx	($xt0,$xt0,$xa0);
	vx	($xt1,$xt1,$xb0);
	vx	($xt2,$xt2,$xc0);
	vx	($xt3,$xt3,$xd0);

	vstm	($xt0,$xt3,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
	&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_4x");

	vaf	($xa0,$xa2,@K[0]);
	vaf	($xb0,$xb2,@K[1]);
	vaf	($xc0,$xc2,@K[2]);
	vaf	($xd0,$xd2,@K[3]);

	vperm	($xa0,$xa0,$xa0,$beperm);
	vperm	($xb0,$xb0,$xb0,$beperm);
	vperm	($xc0,$xc0,$xc0,$beperm);
	vperm	($xd0,$xd0,$xd0,$beperm);

	&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_4x");

	vlm	($xt0,$xt3,"0($inp)");

	vx	($xt0,$xt0,$xa0);
	vx	($xt1,$xt1,$xb0);
	vx	($xt2,$xt2,$xc0);
	vx	($xt3,$xt3,$xd0);

	vstm	($xt0,$xt3,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
	&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_4x");

	vaf	($xa0,$xa3,@K[0]);
	vaf	($xb0,$xb3,@K[1]);
	vaf	($xc0,$xc3,@K[2]);
	vaf	($xd0,$xd3,@K[3]);

	vperm	($xa0,$xa0,$xa0,$beperm);
	vperm	($xb0,$xb0,$xb0,$beperm);
	vperm	($xc0,$xc0,$xc0,$beperm);
	vperm	($xd0,$xd0,$xd0,$beperm);

	&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_4x");

	vlm	($xt0,$xt3,"0($inp)");

	vx	($xt0,$xt0,$xa0);
	vx	($xt1,$xt1,$xb0);
	vx	($xt2,$xt2,$xc0);
	vx	($xt3,$xt3,$xd0);

	vstm	($xt0,$xt3,"0($out)");

	#la	$inp,0x40($inp));
	#la	$out,0x40($out));
	#lhi	%r0,10);
	#&{$z?	\&aghi:\&ahi}	$len,-0x40);
	#jne	.Loop_outer_4x);

LABEL	(".Ldone_4x");
if (!$z) {
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld	("%f8","$stdframe+8*0($sp)");
	ld	("%f9","$stdframe+8*1($sp)");
	ld	("%f10","$stdframe+8*2($sp)");
	ld	("%f11","$stdframe+8*3($sp)");
	ld	("%f12","$stdframe+8*4($sp)");
	ld	("%f13","$stdframe+8*5($sp)");
	ld	("%f14","$stdframe+8*6($sp)");
	ld	("%f15","$stdframe+8*7($sp)");
}
	&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");

ALIGN	(16);
LABEL	(".Ltail_4x");
if (!$z) {
	vlr	($xt0,$xb0);
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");

	vst	($xa0,"$stdframe+0x00($sp)");
	vst	($xt0,"$stdframe+0x10($sp)");
	vst	($xc0,"$stdframe+0x20($sp)");
	vst	($xd0,"$stdframe+0x30($sp)");
} else {
	vlr	($xt0,$xc0);
	ld	("%f8","$stdframe+8*0($sp)");
	ld	("%f9","$stdframe+8*1($sp)");
	ld	("%f10","$stdframe+8*2($sp)");
	ld	("%f11","$stdframe+8*3($sp)");
	vlr	($xt1,$xd0);
	ld	("%f12","$stdframe+8*4($sp)");
	ld	("%f13","$stdframe+8*5($sp)");
	ld	("%f14","$stdframe+8*6($sp)");
	ld	("%f15","$stdframe+8*7($sp)");

	vst	($xa0,"$stdframe+0x00($sp)");
	vst	($xb0,"$stdframe+0x10($sp)");
	vst	($xt0,"$stdframe+0x20($sp)");
	vst	($xt1,"$stdframe+0x30($sp)");
}
	lghi	("%r1",0);

# Byte-at-a-time xor for the final partial block against the key
# stream spilled to the stack above; $len counts the remaining bytes.
LABEL	(".Loop_tail_4x");
	llgc	("%r5","0(%r1,$inp)");
	llgc	("%r6","$stdframe(%r1,$sp)");
	xr	("%r6","%r5");
	stc	("%r6","0(%r1,$out)");
	la	("%r1","1(%r1)");
	brct	($len,".Loop_tail_4x");

	&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");
SIZE	("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x");
}
721 |
|
---|
722 | ########################################################################
|
---|
723 | # 6x"horizontal" layout is optimal fit for the platform in its current
|
---|
724 | # shape, more specifically for given vector instructions' latency. Well,
|
---|
725 | # computational part of 8x"vertical" would be faster, but it consumes
|
---|
726 | # all registers and dealing with that will diminish the return...
|
---|
727 | #
|
---|
728 | {
|
---|
729 | my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
|
---|
730 | $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
|
---|
731 | $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23));
|
---|
732 | my @K=map("%v$_",(27,24..26));
|
---|
733 | my ($t0,$t1,$t2,$t3)=map("%v$_",27..30);
|
---|
734 | my $beperm="%v31";
|
---|
735 | my $FRAME=$stdframe + 4*16;
|
---|
736 |
|
---|
737 | GLOBL ("ChaCha20_ctr32_vx");
|
---|
738 | ALIGN (32);
|
---|
739 | LABEL ("ChaCha20_ctr32_vx");
|
---|
740 | LABEL (".LChaCha20_ctr32_vx");
|
---|
741 | &{$z? \&clgfi:\&clfi} ($len,256);
|
---|
742 | jle (".LChaCha20_ctr32_4x");
|
---|
743 | &{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
|
---|
744 | if (!$z) {
|
---|
745 | std ("%f4","16*$SIZE_T+2*8($sp)");
|
---|
746 | std ("%f6","16*$SIZE_T+3*8($sp)");
|
---|
747 | }
|
---|
748 | &{$z? \&lghi:\&lhi} ("%r1",-$FRAME);
|
---|
749 | lgr ("%r0",$sp);
|
---|
750 | la ($sp,"0(%r1,$sp)");
|
---|
751 | &{$z? \&stg:\&st} ("%r0","0($sp)"); # back-chain
|
---|
752 | if ($z) {
|
---|
753 | std ("%f8","$FRAME-8*8($sp)");
|
---|
754 | std ("%f9","$FRAME-8*7($sp)");
|
---|
755 | std ("%f10","$FRAME-8*6($sp)");
|
---|
756 | std ("%f11","$FRAME-8*5($sp)");
|
---|
757 | std ("%f12","$FRAME-8*4($sp)");
|
---|
758 | std ("%f13","$FRAME-8*3($sp)");
|
---|
759 | std ("%f14","$FRAME-8*2($sp)");
|
---|
760 | std ("%f15","$FRAME-8*1($sp)");
|
---|
761 | }
|
---|
762 | larl ("%r7",".Lsigma");
|
---|
763 | lhi ("%r0",10);
|
---|
764 |
|
---|
765 | vlm (@K[1],@K[2],"0($key)"); # load key
|
---|
766 | vl (@K[3],"0($counter)"); # load counter
|
---|
767 |
|
---|
768 | vlm (@K[0],"$beperm","0(%r7)"); # load sigma, increments, ...
|
---|
769 |
|
---|
770 | LABEL (".Loop_outer_vx");
|
---|
771 | vlr ($a0,@K[0]);
|
---|
772 | vlr ($b0,@K[1]);
|
---|
773 | vlr ($a1,@K[0]);
|
---|
774 | vlr ($b1,@K[1]);
|
---|
775 | vlr ($a2,@K[0]);
|
---|
776 | vlr ($b2,@K[1]);
|
---|
777 | vlr ($a3,@K[0]);
|
---|
778 | vlr ($b3,@K[1]);
|
---|
779 | vlr ($a4,@K[0]);
|
---|
780 | vlr ($b4,@K[1]);
|
---|
781 | vlr ($a5,@K[0]);
|
---|
782 | vlr ($b5,@K[1]);
|
---|
783 |
|
---|
784 | vlr ($d0,@K[3]);
|
---|
785 | vaf ($d1,@K[3],$t1); # K[3]+1
|
---|
786 | vaf ($d2,@K[3],$t2); # K[3]+2
|
---|
787 | vaf ($d3,@K[3],$t3); # K[3]+3
|
---|
788 | vaf ($d4,$d2,$t2); # K[3]+4
|
---|
789 | vaf ($d5,$d2,$t3); # K[3]+5
|
---|
790 |
|
---|
791 | vlr ($c0,@K[2]);
|
---|
792 | vlr ($c1,@K[2]);
|
---|
793 | vlr ($c2,@K[2]);
|
---|
794 | vlr ($c3,@K[2]);
|
---|
795 | vlr ($c4,@K[2]);
|
---|
796 | vlr ($c5,@K[2]);
|
---|
797 |
|
---|
798 | vlr ($t1,$d1);
|
---|
799 | vlr ($t2,$d2);
|
---|
800 | vlr ($t3,$d3);
|
---|
801 |
|
---|
802 | ALIGN (4);
|
---|
803 | LABEL (".Loop_vx");
|
---|
804 |
|
---|
805 | VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
|
---|
806 | $b0,$b1,$b2,$b3,$b4,$b5,
|
---|
807 | $c0,$c1,$c2,$c3,$c4,$c5,
|
---|
808 | $d0,$d1,$d2,$d3,$d4,$d5,
|
---|
809 | 0);
|
---|
810 |
|
---|
811 | VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
|
---|
812 | $b0,$b1,$b2,$b3,$b4,$b5,
|
---|
813 | $c0,$c1,$c2,$c3,$c4,$c5,
|
---|
814 | $d0,$d1,$d2,$d3,$d4,$d5,
|
---|
815 | 1);
|
---|
816 |
|
---|
817 | brct ("%r0",".Loop_vx");
|
---|
818 |
|
---|
	# Stream 0: feed-forward the input state, byte-swap words to
	# little-endian via $beperm, then (if a full 64-byte block remains)
	# XOR with input and store.
	vaf	($a0,$a0,@K[0]);
	vaf	($b0,$b0,@K[1]);
	vaf	($c0,$c0,@K[2]);
	vaf	($d0,$d0,@K[3]);
	vaf	($a1,$a1,@K[0]);
	vaf	($d1,$d1,$t1);		# +K[3]+1

	vperm	($a0,$a0,$a0,$beperm);
	vperm	($b0,$b0,$b0,$beperm);
	vperm	($c0,$c0,$c0,$beperm);
	vperm	($d0,$d0,$d0,$beperm);

	&{$z? \&clgfi:\&clfi} ($len,0x40);	# less than 64 bytes left?
	jl	(".Ltail_vx");

	vaf	($d2,$d2,$t2);		# +K[3]+2
	vaf	($d3,$d3,$t3);		# +K[3]+3
	vlm	($t0,$t3,"0($inp)");	# 64 bytes of input

	vx	($a0,$a0,$t0);
	vx	($b0,$b0,$t1);
	vx	($c0,$c0,$t2);
	vx	($d0,$d0,$t3);

	vlm	(@K[0],$t3,"0(%r7)");	# re-load sigma and increments

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
	&{$z? \&aghi:\&ahi} ($len,-0x40);
	je	(".Ldone_vx");

	# Stream 1 (d1 was already fed forward above).
	vaf	($b1,$b1,@K[1]);
	vaf	($c1,$c1,@K[2]);

	vperm	($a0,$a1,$a1,$beperm);
	vperm	($b0,$b1,$b1,$beperm);
	vperm	($c0,$c1,$c1,$beperm);
	vperm	($d0,$d1,$d1,$beperm);

	&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
	&{$z? \&aghi:\&ahi} ($len,-0x40);
	je	(".Ldone_vx");

	# Stream 2 (d2 was fed forward in the stream-0 section).
	vaf	($a2,$a2,@K[0]);
	vaf	($b2,$b2,@K[1]);
	vaf	($c2,$c2,@K[2]);

	vperm	($a0,$a2,$a2,$beperm);
	vperm	($b0,$b2,$b2,$beperm);
	vperm	($c0,$c2,$c2,$beperm);
	vperm	($d0,$d2,$d2,$beperm);

	&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
	&{$z? \&aghi:\&ahi} ($len,-0x40);
	je	(".Ldone_vx");

	# Stream 3; $d2 is now free and is recycled as scratch to rebuild
	# K[3]+3 from the re-loaded increments ($t1..$t3 hold +1/+2/+3
	# again after the vlm from %r7 above).
	vaf	($a3,$a3,@K[0]);
	vaf	($b3,$b3,@K[1]);
	vaf	($c3,$c3,@K[2]);
	vaf	($d2,@K[3],$t3);	# K[3]+3

	vperm	($a0,$a3,$a3,$beperm);
	vperm	($b0,$b3,$b3,$beperm);
	vperm	($c0,$c3,$c3,$beperm);
	vperm	($d0,$d3,$d3,$beperm);

	&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl	(".Ltail_vx");

	vaf	($d3,$d2,$t1);		# K[3]+4
	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
	&{$z? \&aghi:\&ahi} ($len,-0x40);
	je	(".Ldone_vx");

	# Stream 4; also advance @K[3] by 6 for the next outer iteration.
	vaf	($a4,$a4,@K[0]);
	vaf	($b4,$b4,@K[1]);
	vaf	($c4,$c4,@K[2]);
	vaf	($d4,$d4,$d3);		# +K[3]+4
	vaf	($d3,$d3,$t1);		# K[3]+5
	vaf	(@K[3],$d2,$t3);	# K[3]+=6

	vperm	($a0,$a4,$a4,$beperm);
	vperm	($b0,$b4,$b4,$beperm);
	vperm	($c0,$c4,$c4,$beperm);
	vperm	($d0,$d4,$d4,$beperm);

	&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
	&{$z? \&aghi:\&ahi} ($len,-0x40);
	je	(".Ldone_vx");

	# Stream 5; if anything is still left afterwards, reset the
	# double-round counter and go round the outer loop again.
	vaf	($a5,$a5,@K[0]);
	vaf	($b5,$b5,@K[1]);
	vaf	($c5,$c5,@K[2]);
	vaf	($d5,$d5,$d3);		# +K[3]+5

	vperm	($a0,$a5,$a5,$beperm);
	vperm	($b0,$b5,$b5,$beperm);
	vperm	($c0,$c5,$c5,$beperm);
	vperm	($d0,$d5,$d5,$beperm);

	&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
	lhi	("%r0",10);		# 10 double rounds for the next batch
	&{$z? \&aghi:\&ahi} ($len,-0x40);
	jne	(".Loop_outer_vx");

LABEL	(".Ldone_vx");
	# Epilogue: restore the call-saved FPRs spilled by the prologue
	# (%f4/%f6 under the 32-bit ABI, %f8-%f15 under the 64-bit ABI —
	# $z selects 64-bit), then %r6/%r7, unwind the frame and return.
if (!$z) {
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld	("%f8","$FRAME-8*8($sp)");
	ld	("%f9","$FRAME-8*7($sp)");
	ld	("%f10","$FRAME-8*6($sp)");
	ld	("%f11","$FRAME-8*5($sp)");
	ld	("%f12","$FRAME-8*4($sp)");
	ld	("%f13","$FRAME-8*3($sp)");
	ld	("%f14","$FRAME-8*2($sp)");
	ld	("%f15","$FRAME-8*1($sp)");
}
	&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");

ALIGN	(16);
LABEL	(".Ltail_vx");
	# Partial final block ($len < 0x40): restore FPRs as in .Ldone_vx,
	# spill the last keystream block to the stack scratch area, then
	# XOR it into the output one byte at a time ($len drives brct).
if (!$z) {
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld	("%f8","$FRAME-8*8($sp)");
	ld	("%f9","$FRAME-8*7($sp)");
	ld	("%f10","$FRAME-8*6($sp)");
	ld	("%f11","$FRAME-8*5($sp)");
	ld	("%f12","$FRAME-8*4($sp)");
	ld	("%f13","$FRAME-8*3($sp)");
	ld	("%f14","$FRAME-8*2($sp)");
	ld	("%f15","$FRAME-8*1($sp)");
}
	vstm	($a0,$d0,"$stdframe($sp)");
	lghi	("%r1",0);

LABEL	(".Loop_tail_vx");
	llgc	("%r5","0(%r1,$inp)");
	llgc	("%r6","$stdframe(%r1,$sp)");
	xr	("%r6","%r5");
	stc	("%r6","0(%r1,$out)");
	la	("%r1","1(%r1)");
	brct	($len,".Loop_tail_vx");

	&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");
SIZE	("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");
}	# closes a scope opened before this excerpt (outside this view)
################

# Constant pool shared by the code paths above (loaded via .Lsigma and,
# for the vx path, re-loaded through %r7).
ALIGN	(32);
LABEL	(".Lsigma");
LONG	(0x61707865,0x3320646e,0x79622d32,0x6b206574);	# endian-neutral sigma
LONG	(1,0,0,0);					# counter increment +1
LONG	(2,0,0,0);					# counter increment +2
LONG	(3,0,0,0);					# counter increment +3
LONG	(0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c);	# byte swap

LONG	(0,1,2,3);					# per-lane counters
LONG	(0x61707865,0x61707865,0x61707865,0x61707865);	# smashed sigma
LONG	(0x3320646e,0x3320646e,0x3320646e,0x3320646e);
LONG	(0x79622d32,0x79622d32,0x79622d32,0x79622d32);
LONG	(0x6b206574,0x6b206574,0x6b206574,0x6b206574);

ASCIZ	("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
ALIGN	(4);

PERLASM_END();