#! /usr/bin/env perl
# Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2015
#
# ChaCha20 for s390x.
#
# 3 times faster than compiler-generated code.

#
# August 2018
#
# Add vx code path: 4x"vertical".
#
# Copyright IBM Corp. 2018
# Author: Patrick Steuer <patrick.steuer@de.ibm.com>

#
# February 2019
#
# Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
# 4x"vertical" submission [on z13] and >3x faster than scalar code.
# But to contain the overheads, inputs not longer than 256 bytes are
# handled by a transliteration of the VSX code path from the
# chacha-ppc module, which is also 4x"vertical".

use strict;
use FindBin qw($Bin);
use lib "$Bin/../..";
use perlasm::s390x qw(:DEFAULT :VX :EI AUTOLOAD LABEL INCLUDE);

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
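# Typical invocation (assumed here from the usual perlasm convention;
# the exact flavour names are an assumption, the only thing the code
# below actually tests for is a "31"/"32" substring):
#
#	perl chacha-s390x.pl linux64 chacha-s390x.S	# 64-bit zSeries ABI
#	perl chacha-s390x.pl linux32 chacha-s390x.S	# 31-bit S/390 ABI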

my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
	$z=0;	# S/390 ABI
	$SIZE_T=4;
} else {
	$z=1;	# zSeries ABI
	$SIZE_T=8;
}

my $sp="%r15";
my $stdframe=16*$SIZE_T+4*8;

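# For orientation, a plain-Perl model of the ChaCha quarter-round that
# every code path below implements (a reader's sketch only; it is not
# used by the build):
#
#	sub quarter_round {		# all arithmetic modulo 2^32
#	my ($a,$b,$c,$d)=@_;
#	$a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<<16)|($d>>16))&0xffffffff;
#	$c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<<12)|($b>>20))&0xffffffff;
#	$a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<<8)|($d>>24))&0xffffffff;
#	$c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<<7)|($b>>25))&0xffffffff;
#	return ($a,$b,$c,$d);
#	}
#
# ROUND() below runs four such quarter-rounds, two at a time, with the
# add/xor/rotate steps interleaved to hide instruction latency.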
sub ROUND {
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("$_",@t);

	# Consider the order in which the variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# The 'a', 'b' and 'd' words are permanently allocated in registers,
	# @x[0..7,12..15], while the 'c' words are maintained in memory. If
	# you observe the 'c' column, you'll notice that a pair of 'c's is
	# invariant across the round boundary. This means that we only have
	# to reload them once per round, in the middle. This is why you'll
	# see 'c' stores and loads in the middle, but none at the beginning
	# or end.

	alr (@x[$a0],@x[$b0]); # Q1
	alr (@x[$a1],@x[$b1]); # Q2
	xr (@x[$d0],@x[$a0]);
	xr (@x[$d1],@x[$a1]);
	rll (@x[$d0],@x[$d0],16);
	rll (@x[$d1],@x[$d1],16);

	alr ($xc,@x[$d0]);
	alr ($xc_,@x[$d1]);
	xr (@x[$b0],$xc);
	xr (@x[$b1],$xc_);
	rll (@x[$b0],@x[$b0],12);
	rll (@x[$b1],@x[$b1],12);

	alr (@x[$a0],@x[$b0]);
	alr (@x[$a1],@x[$b1]);
	xr (@x[$d0],@x[$a0]);
	xr (@x[$d1],@x[$a1]);
	rll (@x[$d0],@x[$d0],8);
	rll (@x[$d1],@x[$d1],8);

	alr ($xc,@x[$d0]);
	alr ($xc_,@x[$d1]);
	xr (@x[$b0],$xc);
	xr (@x[$b1],$xc_);
	rll (@x[$b0],@x[$b0],7);
	rll (@x[$b1],@x[$b1],7);

	stm ($xc,$xc_,"$stdframe+4*8+4*$c0($sp)"); # store current pair of 'c's
	lm ($xc,$xc_,"$stdframe+4*8+4*$c2($sp)"); # load next pair of 'c's

	alr (@x[$a2],@x[$b2]); # Q3
	alr (@x[$a3],@x[$b3]); # Q4
	xr (@x[$d2],@x[$a2]);
	xr (@x[$d3],@x[$a3]);
	rll (@x[$d2],@x[$d2],16);
	rll (@x[$d3],@x[$d3],16);

	alr ($xc,@x[$d2]);
	alr ($xc_,@x[$d3]);
	xr (@x[$b2],$xc);
	xr (@x[$b3],$xc_);
	rll (@x[$b2],@x[$b2],12);
	rll (@x[$b3],@x[$b3],12);

	alr (@x[$a2],@x[$b2]);
	alr (@x[$a3],@x[$b3]);
	xr (@x[$d2],@x[$a2]);
	xr (@x[$d3],@x[$a3]);
	rll (@x[$d2],@x[$d2],8);
	rll (@x[$d3],@x[$d3],8);

	alr ($xc,@x[$d2]);
	alr ($xc_,@x[$d3]);
	xr (@x[$b2],$xc);
	xr (@x[$b3],$xc_);
	rll (@x[$b2],@x[$b2],7);
	rll (@x[$b3],@x[$b3],7);
}

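# In the 4x"vertical" VX layout each of the 16 vector registers holds one
# of the 16 ChaCha state words, replicated across the four 32-bit lanes,
# i.e. lane n of every register belongs to block n. One VX_lane_ROUND()
# call therefore advances four blocks by one round at once.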
sub VX_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("%v$_",(0..15));

	vaf (@x[$a0],@x[$a0],@x[$b0]); # Q1
	vx (@x[$d0],@x[$d0],@x[$a0]);
	verllf (@x[$d0],@x[$d0],16);
	vaf (@x[$a1],@x[$a1],@x[$b1]); # Q2
	vx (@x[$d1],@x[$d1],@x[$a1]);
	verllf (@x[$d1],@x[$d1],16);
	vaf (@x[$a2],@x[$a2],@x[$b2]); # Q3
	vx (@x[$d2],@x[$d2],@x[$a2]);
	verllf (@x[$d2],@x[$d2],16);
	vaf (@x[$a3],@x[$a3],@x[$b3]); # Q4
	vx (@x[$d3],@x[$d3],@x[$a3]);
	verllf (@x[$d3],@x[$d3],16);

	vaf (@x[$c0],@x[$c0],@x[$d0]);
	vx (@x[$b0],@x[$b0],@x[$c0]);
	verllf (@x[$b0],@x[$b0],12);
	vaf (@x[$c1],@x[$c1],@x[$d1]);
	vx (@x[$b1],@x[$b1],@x[$c1]);
	verllf (@x[$b1],@x[$b1],12);
	vaf (@x[$c2],@x[$c2],@x[$d2]);
	vx (@x[$b2],@x[$b2],@x[$c2]);
	verllf (@x[$b2],@x[$b2],12);
	vaf (@x[$c3],@x[$c3],@x[$d3]);
	vx (@x[$b3],@x[$b3],@x[$c3]);
	verllf (@x[$b3],@x[$b3],12);

	vaf (@x[$a0],@x[$a0],@x[$b0]);
	vx (@x[$d0],@x[$d0],@x[$a0]);
	verllf (@x[$d0],@x[$d0],8);
	vaf (@x[$a1],@x[$a1],@x[$b1]);
	vx (@x[$d1],@x[$d1],@x[$a1]);
	verllf (@x[$d1],@x[$d1],8);
	vaf (@x[$a2],@x[$a2],@x[$b2]);
	vx (@x[$d2],@x[$d2],@x[$a2]);
	verllf (@x[$d2],@x[$d2],8);
	vaf (@x[$a3],@x[$a3],@x[$b3]);
	vx (@x[$d3],@x[$d3],@x[$a3]);
	verllf (@x[$d3],@x[$d3],8);

	vaf (@x[$c0],@x[$c0],@x[$d0]);
	vx (@x[$b0],@x[$b0],@x[$c0]);
	verllf (@x[$b0],@x[$b0],7);
	vaf (@x[$c1],@x[$c1],@x[$d1]);
	vx (@x[$b1],@x[$b1],@x[$c1]);
	verllf (@x[$b1],@x[$b1],7);
	vaf (@x[$c2],@x[$c2],@x[$d2]);
	vx (@x[$b2],@x[$b2],@x[$c2]);
	verllf (@x[$b2],@x[$b2],7);
	vaf (@x[$c3],@x[$c3],@x[$d3]);
	vx (@x[$b3],@x[$b3],@x[$c3]);
	verllf (@x[$b3],@x[$b3],7);
}

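# The 6x"horizontal" layout below is different: each block keeps its four
# state rows in four distinct registers (a,b,c,d), and six blocks are
# processed side by side. Between the "column" and "diagonal" rounds the
# b, c and d rows are rotated in-register with vsldb; the $odd flag
# selects whether we are diagonalizing the state or undoing it.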
sub VX_ROUND {
my @a=@_[0..5];
my @b=@_[6..11];
my @c=@_[12..17];
my @d=@_[18..23];
my $odd=@_[24];

	vaf (@a[$_],@a[$_],@b[$_]) for (0..5);
	vx (@d[$_],@d[$_],@a[$_]) for (0..5);
	verllf (@d[$_],@d[$_],16) for (0..5);

	vaf (@c[$_],@c[$_],@d[$_]) for (0..5);
	vx (@b[$_],@b[$_],@c[$_]) for (0..5);
	verllf (@b[$_],@b[$_],12) for (0..5);

	vaf (@a[$_],@a[$_],@b[$_]) for (0..5);
	vx (@d[$_],@d[$_],@a[$_]) for (0..5);
	verllf (@d[$_],@d[$_],8) for (0..5);

	vaf (@c[$_],@c[$_],@d[$_]) for (0..5);
	vx (@b[$_],@b[$_],@c[$_]) for (0..5);
	verllf (@b[$_],@b[$_],7) for (0..5);

	vsldb (@c[$_],@c[$_],@c[$_],8) for (0..5);
	vsldb (@b[$_],@b[$_],@b[$_],$odd?12:4) for (0..5);
	vsldb (@d[$_],@d[$_],@d[$_],$odd?4:12) for (0..5);
}

PERLASM_BEGIN($output);

INCLUDE ("s390x_arch.h");
TEXT ();

################
# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
#                     const unsigned int key[8], const unsigned int counter[4])
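# Arguments arrive in %r2..%r6 as per the s390x calling convention (hence
# the register map below). 'len' is in bytes; counter[0] is the 32-bit
# block counter and counter[1..3] the nonce, following the contract of
# OpenSSL's other ChaCha20_ctr32 implementations.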
my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
{
my $frame=$stdframe+4*20;
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));

GLOBL ("ChaCha20_ctr32");
TYPE ("ChaCha20_ctr32","\@function");
ALIGN (32);
LABEL ("ChaCha20_ctr32");
	larl ("%r1","OPENSSL_s390xcap_P");

	lghi ("%r0",64);
&{$z? \&ltgr:\&ltr} ($len,$len); # len==0?
	bzr ("%r14");
	lg ("%r1","S390X_STFLE+16(%r1)");
&{$z? \&clgr:\&clr} ($len,"%r0");
	jle (".Lshort");

	tmhh ("%r1",0x4000); # check for vx bit
	jnz (".LChaCha20_ctr32_vx");

LABEL (".Lshort");
&{$z? \&aghi:\&ahi} ($len,-64);
&{$z? \&lghi:\&lhi} ("%r1",-$frame);
&{$z? \&stmg:\&stm} ("%r6","%r15","6*$SIZE_T($sp)");
&{$z? \&slgr:\&slr} ($out,$inp); # difference
	la ($len,"0($inp,$len)"); # end of input minus 64
	larl ("%r7",".Lsigma");
	lgr ("%r0",$sp);
	la ($sp,"0(%r1,$sp)");
&{$z? \&stg:\&st} ("%r0","0($sp)");

	lmg ("%r8","%r11","0($key)"); # load key
	lmg ("%r12","%r13","0($counter)"); # load counter
	lmg ("%r6","%r7","0(%r7)"); # load sigma constant

	la ("%r14","0($inp)");
&{$z? \&stg:\&st} ($out,"$frame+3*$SIZE_T($sp)");
&{$z? \&stg:\&st} ($len,"$frame+4*$SIZE_T($sp)");
	stmg ("%r6","%r13","$stdframe($sp)"); # copy key schedule to stack
	srlg (@x[12],"%r12",32); # 32-bit counter value
	j (".Loop_outer");
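	# Stack layout: the 16-word key schedule (sigma, key, counter) sits
	# at $stdframe($sp); x[8..11], which don't fit in registers, are
	# spilled to the words right above it, at $stdframe+4*8+4*index.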

ALIGN (16);
LABEL (".Loop_outer");
	lm (@x[0],@x[7],"$stdframe+4*0($sp)"); # load x[0]-x[7]
	lm (@t[0],@t[1],"$stdframe+4*10($sp)"); # load x[10]-x[11]
	lm (@x[13],@x[15],"$stdframe+4*13($sp)"); # load x[13]-x[15]
	stm (@t[0],@t[1],"$stdframe+4*8+4*10($sp)"); # offload x[10]-x[11]
	lm (@t[0],@t[1],"$stdframe+4*8($sp)"); # load x[8]-x[9]
	st (@x[12],"$stdframe+4*12($sp)"); # save counter
&{$z? \&stg:\&st} ("%r14","$frame+2*$SIZE_T($sp)"); # save input pointer
	lhi ("%r14",10);
	j (".Loop");

ALIGN (4);
LABEL (".Loop");
	ROUND (0, 4, 8,12);
	ROUND (0, 5,10,15);
	brct ("%r14",".Loop");
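	# Ten iterations of the even/odd ROUND pair above make up the
	# 20 rounds of ChaCha20.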

&{$z? \&lg:\&l} ("%r14","$frame+2*$SIZE_T($sp)"); # pull input pointer
	stm (@t[0],@t[1],"$stdframe+4*8+4*8($sp)"); # offload x[8]-x[9]
&{$z? \&lmg:\&lm} (@t[0],@t[1],"$frame+3*$SIZE_T($sp)");

	al (@x[0],"$stdframe+4*0($sp)"); # accumulate key schedule
	al (@x[1],"$stdframe+4*1($sp)");
	al (@x[2],"$stdframe+4*2($sp)");
	al (@x[3],"$stdframe+4*3($sp)");
	al (@x[4],"$stdframe+4*4($sp)");
	al (@x[5],"$stdframe+4*5($sp)");
	al (@x[6],"$stdframe+4*6($sp)");
	al (@x[7],"$stdframe+4*7($sp)");
	lrvr (@x[0],@x[0]);
	lrvr (@x[1],@x[1]);
	lrvr (@x[2],@x[2]);
	lrvr (@x[3],@x[3]);
	lrvr (@x[4],@x[4]);
	lrvr (@x[5],@x[5]);
	lrvr (@x[6],@x[6]);
	lrvr (@x[7],@x[7]);
	al (@x[12],"$stdframe+4*12($sp)");
	al (@x[13],"$stdframe+4*13($sp)");
	al (@x[14],"$stdframe+4*14($sp)");
	al (@x[15],"$stdframe+4*15($sp)");
	lrvr (@x[12],@x[12]);
	lrvr (@x[13],@x[13]);
	lrvr (@x[14],@x[14]);
	lrvr (@x[15],@x[15]);
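	# lrvr byte-swaps each word: s390x is big-endian, while the ChaCha20
	# keystream is defined in terms of little-endian words.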

	la (@t[0],"0(@t[0],%r14)"); # reconstruct output pointer
&{$z? \&clgr:\&clr} ("%r14",@t[1]);
	jh (".Ltail");

	x (@x[0],"4*0(%r14)"); # xor with input
	x (@x[1],"4*1(%r14)");
	st (@x[0],"4*0(@t[0])"); # store output
	x (@x[2],"4*2(%r14)");
	st (@x[1],"4*1(@t[0])");
	x (@x[3],"4*3(%r14)");
	st (@x[2],"4*2(@t[0])");
	x (@x[4],"4*4(%r14)");
	st (@x[3],"4*3(@t[0])");
	lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); # load x[8]-x[11]
	x (@x[5],"4*5(%r14)");
	st (@x[4],"4*4(@t[0])");
	x (@x[6],"4*6(%r14)");
	al (@x[0],"$stdframe+4*8($sp)");
	st (@x[5],"4*5(@t[0])");
	x (@x[7],"4*7(%r14)");
	al (@x[1],"$stdframe+4*9($sp)");
	st (@x[6],"4*6(@t[0])");
	x (@x[12],"4*12(%r14)");
	al (@x[2],"$stdframe+4*10($sp)");
	st (@x[7],"4*7(@t[0])");
	x (@x[13],"4*13(%r14)");
	al (@x[3],"$stdframe+4*11($sp)");
	st (@x[12],"4*12(@t[0])");
	x (@x[14],"4*14(%r14)");
	st (@x[13],"4*13(@t[0])");
	x (@x[15],"4*15(%r14)");
	st (@x[14],"4*14(@t[0])");
	lrvr (@x[0],@x[0]);
	st (@x[15],"4*15(@t[0])");
	lrvr (@x[1],@x[1]);
	lrvr (@x[2],@x[2]);
	lrvr (@x[3],@x[3]);
	lhi (@x[12],1);
	x (@x[0],"4*8(%r14)");
	al (@x[12],"$stdframe+4*12($sp)"); # increment counter
	x (@x[1],"4*9(%r14)");
	st (@x[0],"4*8(@t[0])");
	x (@x[2],"4*10(%r14)");
	st (@x[1],"4*9(@t[0])");
	x (@x[3],"4*11(%r14)");
	st (@x[2],"4*10(@t[0])");
	st (@x[3],"4*11(@t[0])");

&{$z? \&clgr:\&clr} ("%r14",@t[1]); # done yet?
	la ("%r14","64(%r14)");
	jl (".Loop_outer");

LABEL (".Ldone");
	xgr ("%r0","%r0");
	xgr ("%r1","%r1");
	xgr ("%r2","%r2");
	xgr ("%r3","%r3");
	stmg ("%r0","%r3","$stdframe+4*4($sp)"); # wipe key copy
	stmg ("%r0","%r3","$stdframe+4*12($sp)");

&{$z? \&lmg:\&lm} ("%r6","%r15","$frame+6*$SIZE_T($sp)");
	br ("%r14");

ALIGN (16);
LABEL (".Ltail");
	la (@t[1],"64($t[1])");
	stm (@x[0],@x[7],"$stdframe+4*0($sp)");
&{$z? \&slgr:\&slr} (@t[1],"%r14");
	lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
&{$z? \&lghi:\&lhi} (@x[6],0);
	stm (@x[12],@x[15],"$stdframe+4*12($sp)");
	al (@x[0],"$stdframe+4*8($sp)");
	al (@x[1],"$stdframe+4*9($sp)");
	al (@x[2],"$stdframe+4*10($sp)");
	al (@x[3],"$stdframe+4*11($sp)");
	lrvr (@x[0],@x[0]);
	lrvr (@x[1],@x[1]);
	lrvr (@x[2],@x[2]);
	lrvr (@x[3],@x[3]);
	stm (@x[0],@x[3],"$stdframe+4*8($sp)");

LABEL (".Loop_tail");
	llgc (@x[4],"0(@x[6],%r14)");
	llgc (@x[5],"$stdframe(@x[6],$sp)");
	xr (@x[5],@x[4]);
	stc (@x[5],"0(@x[6],@t[0])");
	la (@x[6],"1(@x[6])");
	brct (@t[1],".Loop_tail");

	j (".Ldone");
SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32");
}

########################################################################
# The 4x"vertical" layout minimizes the instruction count, but leaves
# the pipeline underutilized [because of the vector instructions' high
# latency]. On the other hand, the minimum amount of data it takes the
# 6x path to fill its pipeline is higher, so short inputs would
# effectively be processed slower there. Hence this code path, which
# targets lengths of 256 bytes or less.
#
{
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15));
my @K=map("%v$_",(16..19));
my $CTR="%v26";
my ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30));
my $beperm="%v31";
my ($x00,$x10,$x20,$x30)=(0,map("r$_",(8..10)));
my $FRAME=$stdframe+4*16;

ALIGN (32);
LABEL ("ChaCha20_ctr32_4x");
LABEL (".LChaCha20_ctr32_4x");
&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
if (!$z) {
	std ("%f4","16*$SIZE_T+2*8($sp)");
	std ("%f6","16*$SIZE_T+3*8($sp)");
}
&{$z? \&lghi:\&lhi} ("%r1",-$FRAME);
	lgr ("%r0",$sp);
	la ($sp,"0(%r1,$sp)");
&{$z? \&stg:\&st} ("%r0","0($sp)"); # back-chain
if ($z) {
	std ("%f8","$stdframe+8*0($sp)");
	std ("%f9","$stdframe+8*1($sp)");
	std ("%f10","$stdframe+8*2($sp)");
	std ("%f11","$stdframe+8*3($sp)");
	std ("%f12","$stdframe+8*4($sp)");
	std ("%f13","$stdframe+8*5($sp)");
	std ("%f14","$stdframe+8*6($sp)");
	std ("%f15","$stdframe+8*7($sp)");
}
	larl ("%r7",".Lsigma");
	lhi ("%r0",10);
	lhi ("%r1",0);

	vl (@K[0],"0(%r7)"); # load sigma
	vl (@K[1],"0($key)"); # load key
	vl (@K[2],"16($key)");
	vl (@K[3],"0($counter)"); # load counter

	vl ($beperm,"0x40(%r7)");
	vl ($xt1,"0x50(%r7)");
	vrepf ($CTR,@K[3],0);
	vlvgf (@K[3],"%r1",0); # clear @K[3].word[0]
	vaf ($CTR,$CTR,$xt1);
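	# $CTR lanes now hold counter+0..counter+3, one lane per block.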

#LABEL (".Loop_outer_4x");
	vlm ($xa0,$xa3,"0x60(%r7)"); # load [smashed] sigma

	vrepf ($xb0,@K[1],0); # smash the key
	vrepf ($xb1,@K[1],1);
	vrepf ($xb2,@K[1],2);
	vrepf ($xb3,@K[1],3);

	vrepf ($xc0,@K[2],0);
	vrepf ($xc1,@K[2],1);
	vrepf ($xc2,@K[2],2);
	vrepf ($xc3,@K[2],3);

	vlr ($xd0,$CTR);
	vrepf ($xd1,@K[3],1);
	vrepf ($xd2,@K[3],2);
	vrepf ($xd3,@K[3],3);

LABEL (".Loop_4x");
	VX_lane_ROUND(0, 4, 8,12);
	VX_lane_ROUND(0, 5,10,15);
	brct ("%r0",".Loop_4x");

	vaf ($xd0,$xd0,$CTR);

	vmrhf ($xt0,$xa0,$xa1); # transpose data
	vmrhf ($xt1,$xa2,$xa3);
	vmrlf ($xt2,$xa0,$xa1);
	vmrlf ($xt3,$xa2,$xa3);
	vpdi ($xa0,$xt0,$xt1,0b0000);
	vpdi ($xa1,$xt0,$xt1,0b0101);
	vpdi ($xa2,$xt2,$xt3,0b0000);
	vpdi ($xa3,$xt2,$xt3,0b0101);

	vmrhf ($xt0,$xb0,$xb1);
	vmrhf ($xt1,$xb2,$xb3);
	vmrlf ($xt2,$xb0,$xb1);
	vmrlf ($xt3,$xb2,$xb3);
	vpdi ($xb0,$xt0,$xt1,0b0000);
	vpdi ($xb1,$xt0,$xt1,0b0101);
	vpdi ($xb2,$xt2,$xt3,0b0000);
	vpdi ($xb3,$xt2,$xt3,0b0101);

	vmrhf ($xt0,$xc0,$xc1);
	vmrhf ($xt1,$xc2,$xc3);
	vmrlf ($xt2,$xc0,$xc1);
	vmrlf ($xt3,$xc2,$xc3);
	vpdi ($xc0,$xt0,$xt1,0b0000);
	vpdi ($xc1,$xt0,$xt1,0b0101);
	vpdi ($xc2,$xt2,$xt3,0b0000);
	vpdi ($xc3,$xt2,$xt3,0b0101);

	vmrhf ($xt0,$xd0,$xd1);
	vmrhf ($xt1,$xd2,$xd3);
	vmrlf ($xt2,$xd0,$xd1);
	vmrlf ($xt3,$xd2,$xd3);
	vpdi ($xd0,$xt0,$xt1,0b0000);
	vpdi ($xd1,$xt0,$xt1,0b0101);
	vpdi ($xd2,$xt2,$xt3,0b0000);
	vpdi ($xd3,$xt2,$xt3,0b0101);
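	# Each vmrhf/vmrlf+vpdi group above is a 4x4 word transpose: it
	# turns the lane-sliced registers back into contiguous 16-byte rows
	# of each block, so whole 64-byte blocks can be XORed with the input.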

	#vrepif ($xt0,4);
	#vaf ($CTR,$CTR,$xt0); # next counter value

	vaf ($xa0,$xa0,@K[0]);
	vaf ($xb0,$xb0,@K[1]);
	vaf ($xc0,$xc0,@K[2]);
	vaf ($xd0,$xd0,@K[3]);

	vperm ($xa0,$xa0,$xa0,$beperm);
	vperm ($xb0,$xb0,$xb0,$beperm);
	vperm ($xc0,$xc0,$xc0,$beperm);
	vperm ($xd0,$xd0,$xd0,$beperm);

	#&{$z? \&clgfi:\&clfi} ($len,0x40);
	#jl (".Ltail_4x");

	vlm ($xt0,$xt3,"0($inp)");

	vx ($xt0,$xt0,$xa0);
	vx ($xt1,$xt1,$xb0);
	vx ($xt2,$xt2,$xc0);
	vx ($xt3,$xt3,$xd0);

	vstm ($xt0,$xt3,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
	#je (".Ldone_4x");

	vaf ($xa0,$xa1,@K[0]);
	vaf ($xb0,$xb1,@K[1]);
	vaf ($xc0,$xc1,@K[2]);
	vaf ($xd0,$xd1,@K[3]);

	vperm ($xa0,$xa0,$xa0,$beperm);
	vperm ($xb0,$xb0,$xb0,$beperm);
	vperm ($xc0,$xc0,$xc0,$beperm);
	vperm ($xd0,$xd0,$xd0,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_4x");

	vlm ($xt0,$xt3,"0($inp)");

	vx ($xt0,$xt0,$xa0);
	vx ($xt1,$xt1,$xb0);
	vx ($xt2,$xt2,$xc0);
	vx ($xt3,$xt3,$xd0);

	vstm ($xt0,$xt3,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
	je (".Ldone_4x");

	vaf ($xa0,$xa2,@K[0]);
	vaf ($xb0,$xb2,@K[1]);
	vaf ($xc0,$xc2,@K[2]);
	vaf ($xd0,$xd2,@K[3]);

	vperm ($xa0,$xa0,$xa0,$beperm);
	vperm ($xb0,$xb0,$xb0,$beperm);
	vperm ($xc0,$xc0,$xc0,$beperm);
	vperm ($xd0,$xd0,$xd0,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_4x");

	vlm ($xt0,$xt3,"0($inp)");

	vx ($xt0,$xt0,$xa0);
	vx ($xt1,$xt1,$xb0);
	vx ($xt2,$xt2,$xc0);
	vx ($xt3,$xt3,$xd0);

	vstm ($xt0,$xt3,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
	je (".Ldone_4x");

	vaf ($xa0,$xa3,@K[0]);
	vaf ($xb0,$xb3,@K[1]);
	vaf ($xc0,$xc3,@K[2]);
	vaf ($xd0,$xd3,@K[3]);

	vperm ($xa0,$xa0,$xa0,$beperm);
	vperm ($xb0,$xb0,$xb0,$beperm);
	vperm ($xc0,$xc0,$xc0,$beperm);
	vperm ($xd0,$xd0,$xd0,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_4x");

	vlm ($xt0,$xt3,"0($inp)");

	vx ($xt0,$xt0,$xa0);
	vx ($xt1,$xt1,$xb0);
	vx ($xt2,$xt2,$xc0);
	vx ($xt3,$xt3,$xd0);

	vstm ($xt0,$xt3,"0($out)");

	#la ($inp,"0x40($inp)");
	#la ($out,"0x40($out)");
	#lhi ("%r0",10);
	#&{$z? \&aghi:\&ahi} ($len,-0x40);
	#jne (".Loop_outer_4x");

LABEL (".Ldone_4x");
if (!$z) {
	ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld ("%f8","$stdframe+8*0($sp)");
	ld ("%f9","$stdframe+8*1($sp)");
	ld ("%f10","$stdframe+8*2($sp)");
	ld ("%f11","$stdframe+8*3($sp)");
	ld ("%f12","$stdframe+8*4($sp)");
	ld ("%f13","$stdframe+8*5($sp)");
	ld ("%f14","$stdframe+8*6($sp)");
	ld ("%f15","$stdframe+8*7($sp)");
}
&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la ($sp,"$FRAME($sp)");
	br ("%r14");

ALIGN (16);
LABEL (".Ltail_4x");
if (!$z) {
	vlr ($xt0,$xb0);
	ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");

	vst ($xa0,"$stdframe+0x00($sp)");
	vst ($xt0,"$stdframe+0x10($sp)");
	vst ($xc0,"$stdframe+0x20($sp)");
	vst ($xd0,"$stdframe+0x30($sp)");
} else {
	vlr ($xt0,$xc0);
	ld ("%f8","$stdframe+8*0($sp)");
	ld ("%f9","$stdframe+8*1($sp)");
	ld ("%f10","$stdframe+8*2($sp)");
	ld ("%f11","$stdframe+8*3($sp)");
	vlr ($xt1,$xd0);
	ld ("%f12","$stdframe+8*4($sp)");
	ld ("%f13","$stdframe+8*5($sp)");
	ld ("%f14","$stdframe+8*6($sp)");
	ld ("%f15","$stdframe+8*7($sp)");

	vst ($xa0,"$stdframe+0x00($sp)");
	vst ($xb0,"$stdframe+0x10($sp)");
	vst ($xt0,"$stdframe+0x20($sp)");
	vst ($xt1,"$stdframe+0x30($sp)");
}
	lghi ("%r1",0);

LABEL (".Loop_tail_4x");
	llgc ("%r5","0(%r1,$inp)");
	llgc ("%r6","$stdframe(%r1,$sp)");
	xr ("%r6","%r5");
	stc ("%r6","0(%r1,$out)");
	la ("%r1","1(%r1)");
	brct ($len,".Loop_tail_4x");

&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la ($sp,"$FRAME($sp)");
	br ("%r14");
SIZE ("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x");
}

########################################################################
# The 6x"horizontal" layout is the optimal fit for the platform in its
# current shape, more specifically for the given vector instruction
# latency. The computational part of an 8x"vertical" layout would be
# faster, but it would consume all the vector registers, and dealing
# with that would diminish the return...
#
{
my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
    $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
    $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23));
my @K=map("%v$_",(27,24..26));
my ($t0,$t1,$t2,$t3)=map("%v$_",27..30);
my $beperm="%v31";
my $FRAME=$stdframe+4*16;

GLOBL ("ChaCha20_ctr32_vx");
ALIGN (32);
LABEL ("ChaCha20_ctr32_vx");
LABEL (".LChaCha20_ctr32_vx");
&{$z? \&clgfi:\&clfi} ($len,256);
	jle (".LChaCha20_ctr32_4x");
&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
if (!$z) {
	std ("%f4","16*$SIZE_T+2*8($sp)");
	std ("%f6","16*$SIZE_T+3*8($sp)");
}
&{$z? \&lghi:\&lhi} ("%r1",-$FRAME);
	lgr ("%r0",$sp);
	la ($sp,"0(%r1,$sp)");
&{$z? \&stg:\&st} ("%r0","0($sp)"); # back-chain
if ($z) {
	std ("%f8","$FRAME-8*8($sp)");
	std ("%f9","$FRAME-8*7($sp)");
	std ("%f10","$FRAME-8*6($sp)");
	std ("%f11","$FRAME-8*5($sp)");
	std ("%f12","$FRAME-8*4($sp)");
	std ("%f13","$FRAME-8*3($sp)");
	std ("%f14","$FRAME-8*2($sp)");
	std ("%f15","$FRAME-8*1($sp)");
}
	larl ("%r7",".Lsigma");
	lhi ("%r0",10);

	vlm (@K[1],@K[2],"0($key)"); # load key
	vl (@K[3],"0($counter)"); # load counter

	vlm (@K[0],"$beperm","0(%r7)"); # load sigma, increments, ...

LABEL (".Loop_outer_vx");
	vlr ($a0,@K[0]);
	vlr ($b0,@K[1]);
	vlr ($a1,@K[0]);
	vlr ($b1,@K[1]);
	vlr ($a2,@K[0]);
	vlr ($b2,@K[1]);
	vlr ($a3,@K[0]);
	vlr ($b3,@K[1]);
	vlr ($a4,@K[0]);
	vlr ($b4,@K[1]);
	vlr ($a5,@K[0]);
	vlr ($b5,@K[1]);

	vlr ($d0,@K[3]);
	vaf ($d1,@K[3],$t1); # K[3]+1
	vaf ($d2,@K[3],$t2); # K[3]+2
	vaf ($d3,@K[3],$t3); # K[3]+3
	vaf ($d4,$d2,$t2); # K[3]+4
	vaf ($d5,$d2,$t3); # K[3]+5

	vlr ($c0,@K[2]);
	vlr ($c1,@K[2]);
	vlr ($c2,@K[2]);
	vlr ($c3,@K[2]);
	vlr ($c4,@K[2]);
	vlr ($c5,@K[2]);

	vlr ($t1,$d1);
	vlr ($t2,$d2);
	vlr ($t3,$d3);

ALIGN (4);
LABEL (".Loop_vx");

	VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
		 $b0,$b1,$b2,$b3,$b4,$b5,
		 $c0,$c1,$c2,$c3,$c4,$c5,
		 $d0,$d1,$d2,$d3,$d4,$d5,
		 0);

	VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
		 $b0,$b1,$b2,$b3,$b4,$b5,
		 $c0,$c1,$c2,$c3,$c4,$c5,
		 $d0,$d1,$d2,$d3,$d4,$d5,
		 1);

	brct ("%r0",".Loop_vx");

	vaf ($a0,$a0,@K[0]);
	vaf ($b0,$b0,@K[1]);
	vaf ($c0,$c0,@K[2]);
	vaf ($d0,$d0,@K[3]);
	vaf ($a1,$a1,@K[0]);
	vaf ($d1,$d1,$t1); # +K[3]+1

	vperm ($a0,$a0,$a0,$beperm);
	vperm ($b0,$b0,$b0,$beperm);
	vperm ($c0,$c0,$c0,$beperm);
	vperm ($d0,$d0,$d0,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_vx");

	vaf ($d2,$d2,$t2); # +K[3]+2
	vaf ($d3,$d3,$t3); # +K[3]+3
	vlm ($t0,$t3,"0($inp)");

	vx ($a0,$a0,$t0);
	vx ($b0,$b0,$t1);
	vx ($c0,$c0,$t2);
	vx ($d0,$d0,$t3);

	vlm (@K[0],$t3,"0(%r7)"); # re-load sigma and increments

	vstm ($a0,$d0,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
	je (".Ldone_vx");

	vaf ($b1,$b1,@K[1]);
	vaf ($c1,$c1,@K[2]);

	vperm ($a0,$a1,$a1,$beperm);
	vperm ($b0,$b1,$b1,$beperm);
	vperm ($c0,$c1,$c1,$beperm);
	vperm ($d0,$d1,$d1,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_vx");

	vlm ($a1,$d1,"0($inp)");

	vx ($a0,$a0,$a1);
	vx ($b0,$b0,$b1);
	vx ($c0,$c0,$c1);
	vx ($d0,$d0,$d1);

	vstm ($a0,$d0,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
	je (".Ldone_vx");

	vaf ($a2,$a2,@K[0]);
	vaf ($b2,$b2,@K[1]);
	vaf ($c2,$c2,@K[2]);

	vperm ($a0,$a2,$a2,$beperm);
	vperm ($b0,$b2,$b2,$beperm);
	vperm ($c0,$c2,$c2,$beperm);
	vperm ($d0,$d2,$d2,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_vx");

	vlm ($a1,$d1,"0($inp)");

	vx ($a0,$a0,$a1);
	vx ($b0,$b0,$b1);
	vx ($c0,$c0,$c1);
	vx ($d0,$d0,$d1);

	vstm ($a0,$d0,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
	je (".Ldone_vx");

	vaf ($a3,$a3,@K[0]);
	vaf ($b3,$b3,@K[1]);
	vaf ($c3,$c3,@K[2]);
	vaf ($d2,@K[3],$t3); # K[3]+3

	vperm ($a0,$a3,$a3,$beperm);
	vperm ($b0,$b3,$b3,$beperm);
	vperm ($c0,$c3,$c3,$beperm);
	vperm ($d0,$d3,$d3,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_vx");

	vaf ($d3,$d2,$t1); # K[3]+4
	vlm ($a1,$d1,"0($inp)");

	vx ($a0,$a0,$a1);
	vx ($b0,$b0,$b1);
	vx ($c0,$c0,$c1);
	vx ($d0,$d0,$d1);

	vstm ($a0,$d0,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
	je (".Ldone_vx");

	vaf ($a4,$a4,@K[0]);
	vaf ($b4,$b4,@K[1]);
	vaf ($c4,$c4,@K[2]);
	vaf ($d4,$d4,$d3); # +K[3]+4
	vaf ($d3,$d3,$t1); # K[3]+5
	vaf (@K[3],$d2,$t3); # K[3]+=6

	vperm ($a0,$a4,$a4,$beperm);
	vperm ($b0,$b4,$b4,$beperm);
	vperm ($c0,$c4,$c4,$beperm);
	vperm ($d0,$d4,$d4,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_vx");

	vlm ($a1,$d1,"0($inp)");

	vx ($a0,$a0,$a1);
	vx ($b0,$b0,$b1);
	vx ($c0,$c0,$c1);
	vx ($d0,$d0,$d1);

	vstm ($a0,$d0,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
	je (".Ldone_vx");

	vaf ($a5,$a5,@K[0]);
	vaf ($b5,$b5,@K[1]);
	vaf ($c5,$c5,@K[2]);
	vaf ($d5,$d5,$d3); # +K[3]+5

	vperm ($a0,$a5,$a5,$beperm);
	vperm ($b0,$b5,$b5,$beperm);
	vperm ($c0,$c5,$c5,$beperm);
	vperm ($d0,$d5,$d5,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_vx");

	vlm ($a1,$d1,"0($inp)");

	vx ($a0,$a0,$a1);
	vx ($b0,$b0,$b1);
	vx ($c0,$c0,$c1);
	vx ($d0,$d0,$d1);

	vstm ($a0,$d0,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
	lhi ("%r0",10);
&{$z? \&aghi:\&ahi} ($len,-0x40);
	jne (".Loop_outer_vx");

LABEL (".Ldone_vx");
if (!$z) {
	ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld ("%f8","$FRAME-8*8($sp)");
	ld ("%f9","$FRAME-8*7($sp)");
	ld ("%f10","$FRAME-8*6($sp)");
	ld ("%f11","$FRAME-8*5($sp)");
	ld ("%f12","$FRAME-8*4($sp)");
	ld ("%f13","$FRAME-8*3($sp)");
	ld ("%f14","$FRAME-8*2($sp)");
	ld ("%f15","$FRAME-8*1($sp)");
}
&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la ($sp,"$FRAME($sp)");
	br ("%r14");

ALIGN (16);
LABEL (".Ltail_vx");
if (!$z) {
	ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld ("%f8","$FRAME-8*8($sp)");
	ld ("%f9","$FRAME-8*7($sp)");
	ld ("%f10","$FRAME-8*6($sp)");
	ld ("%f11","$FRAME-8*5($sp)");
	ld ("%f12","$FRAME-8*4($sp)");
	ld ("%f13","$FRAME-8*3($sp)");
	ld ("%f14","$FRAME-8*2($sp)");
	ld ("%f15","$FRAME-8*1($sp)");
}
	vstm ($a0,$d0,"$stdframe($sp)");
	lghi ("%r1",0);

LABEL (".Loop_tail_vx");
	llgc ("%r5","0(%r1,$inp)");
	llgc ("%r6","$stdframe(%r1,$sp)");
	xr ("%r6","%r5");
	stc ("%r6","0(%r1,$out)");
	la ("%r1","1(%r1)");
	brct ($len,".Loop_tail_vx");

&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la ($sp,"$FRAME($sp)");
	br ("%r14");
SIZE ("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");
}
################

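# Constant pool. Offsets as used above: 0x00 sigma, 0x10/0x20/0x30 counter
# increments +1/+2/+3, 0x40 big-endian byte-swap permutation, 0x50 per-lane
# counters 0..3 (4x path), 0x60.. sigma with each word "smashed" across a
# full vector (4x path).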
ALIGN (32);
LABEL (".Lsigma");
LONG (0x61707865,0x3320646e,0x79622d32,0x6b206574); # endian-neutral sigma
LONG (1,0,0,0);
LONG (2,0,0,0);
LONG (3,0,0,0);
LONG (0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c); # byte swap

LONG (0,1,2,3);
LONG (0x61707865,0x61707865,0x61707865,0x61707865); # smashed sigma
LONG (0x3320646e,0x3320646e,0x3320646e,0x3320646e);
LONG (0x79622d32,0x79622d32,0x79622d32,0x79622d32);
LONG (0x6b206574,0x6b206574,0x6b206574,0x6b206574);

ASCIZ ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
ALIGN (4);

PERLASM_END();