#! /usr/bin/env perl
# Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2015
#
# ChaCha20 for s390x.
#
# 3 times faster than compiler-generated code.

#
# August 2018
#
# Add vx code path: 4x"vertical".
#
# Copyright IBM Corp. 2018
# Author: Patrick Steuer <patrick.steuer@de.ibm.com>

#
# February 2019
#
# Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
# 4x"vertical" submission [on z13] and >3x faster than scalar code.
# But to contain the overheads, inputs not longer than 256 bytes are
# handled by a transliteration of the VSX code path from the
# chacha-ppc module, which is also 4x"vertical".

use strict;
use FindBin qw($Bin);
use lib "$Bin/../..";
use perlasm::s390x qw(:DEFAULT :VX :EI AUTOLOAD LABEL INCLUDE);

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
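# Typical invocation (assumed here from the usual perlasm convention;
# the exact flavour names are an assumption, the only thing the code
# below actually tests for is a "31"/"32" substring):
#
#	perl chacha-s390x.pl linux64 chacha-s390x.S	# 64-bit zSeries ABI
#	perl chacha-s390x.pl linux32 chacha-s390x.S	# 31-bit S/390 ABI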

my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
	$z=0;	# S/390 ABI
	$SIZE_T=4;
} else {
	$z=1;	# zSeries ABI
	$SIZE_T=8;
}

my $sp="%r15";
my $stdframe=16*$SIZE_T+4*8;

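# For orientation, a plain-Perl model of the ChaCha quarter-round that
# every code path below implements (a reader's sketch only; it is not
# used by the build):
#
#	sub quarter_round {		# all arithmetic modulo 2^32
#	my ($a,$b,$c,$d)=@_;
#	$a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<<16)|($d>>16))&0xffffffff;
#	$c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<<12)|($b>>20))&0xffffffff;
#	$a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<<8)|($d>>24))&0xffffffff;
#	$c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<<7)|($b>>25))&0xffffffff;
#	return ($a,$b,$c,$d);
#	}
#
# ROUND() below runs four such quarter-rounds, two at a time, with the
# add/xor/rotate steps interleaved to hide instruction latency.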
sub ROUND {
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("$_",@t);

	# Consider the order in which the variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# The 'a', 'b' and 'd' words are permanently allocated in registers,
	# @x[0..7,12..15], while the 'c' words are maintained in memory. If
	# you observe the 'c' column, you'll notice that a pair of 'c's is
	# invariant across the round boundary. This means that we only have
	# to reload them once per round, in the middle. This is why you'll
	# see 'c' stores and loads in the middle, but none at the beginning
	# or end.

	alr (@x[$a0],@x[$b0]); # Q1
	alr (@x[$a1],@x[$b1]); # Q2
	xr (@x[$d0],@x[$a0]);
	xr (@x[$d1],@x[$a1]);
	rll (@x[$d0],@x[$d0],16);
	rll (@x[$d1],@x[$d1],16);

	alr ($xc,@x[$d0]);
	alr ($xc_,@x[$d1]);
	xr (@x[$b0],$xc);
	xr (@x[$b1],$xc_);
	rll (@x[$b0],@x[$b0],12);
	rll (@x[$b1],@x[$b1],12);

	alr (@x[$a0],@x[$b0]);
	alr (@x[$a1],@x[$b1]);
	xr (@x[$d0],@x[$a0]);
	xr (@x[$d1],@x[$a1]);
	rll (@x[$d0],@x[$d0],8);
	rll (@x[$d1],@x[$d1],8);

	alr ($xc,@x[$d0]);
	alr ($xc_,@x[$d1]);
	xr (@x[$b0],$xc);
	xr (@x[$b1],$xc_);
	rll (@x[$b0],@x[$b0],7);
	rll (@x[$b1],@x[$b1],7);

	stm ($xc,$xc_,"$stdframe+4*8+4*$c0($sp)"); # store current pair of 'c's
	lm ($xc,$xc_,"$stdframe+4*8+4*$c2($sp)"); # load next pair of 'c's

	alr (@x[$a2],@x[$b2]); # Q3
	alr (@x[$a3],@x[$b3]); # Q4
	xr (@x[$d2],@x[$a2]);
	xr (@x[$d3],@x[$a3]);
	rll (@x[$d2],@x[$d2],16);
	rll (@x[$d3],@x[$d3],16);

	alr ($xc,@x[$d2]);
	alr ($xc_,@x[$d3]);
	xr (@x[$b2],$xc);
	xr (@x[$b3],$xc_);
	rll (@x[$b2],@x[$b2],12);
	rll (@x[$b3],@x[$b3],12);

	alr (@x[$a2],@x[$b2]);
	alr (@x[$a3],@x[$b3]);
	xr (@x[$d2],@x[$a2]);
	xr (@x[$d3],@x[$a3]);
	rll (@x[$d2],@x[$d2],8);
	rll (@x[$d3],@x[$d3],8);

	alr ($xc,@x[$d2]);
	alr ($xc_,@x[$d3]);
	xr (@x[$b2],$xc);
	xr (@x[$b3],$xc_);
	rll (@x[$b2],@x[$b2],7);
	rll (@x[$b3],@x[$b3],7);
}

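# In the 4x"vertical" VX layout each of the 16 vector registers holds one
# of the 16 ChaCha state words, replicated across the four 32-bit lanes,
# i.e. lane n of every register belongs to block n. One VX_lane_ROUND()
# call therefore advances four blocks by one round at once.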
sub VX_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("%v$_",(0..15));

	vaf (@x[$a0],@x[$a0],@x[$b0]); # Q1
	vx (@x[$d0],@x[$d0],@x[$a0]);
	verllf (@x[$d0],@x[$d0],16);
	vaf (@x[$a1],@x[$a1],@x[$b1]); # Q2
	vx (@x[$d1],@x[$d1],@x[$a1]);
	verllf (@x[$d1],@x[$d1],16);
	vaf (@x[$a2],@x[$a2],@x[$b2]); # Q3
	vx (@x[$d2],@x[$d2],@x[$a2]);
	verllf (@x[$d2],@x[$d2],16);
	vaf (@x[$a3],@x[$a3],@x[$b3]); # Q4
	vx (@x[$d3],@x[$d3],@x[$a3]);
	verllf (@x[$d3],@x[$d3],16);

	vaf (@x[$c0],@x[$c0],@x[$d0]);
	vx (@x[$b0],@x[$b0],@x[$c0]);
	verllf (@x[$b0],@x[$b0],12);
	vaf (@x[$c1],@x[$c1],@x[$d1]);
	vx (@x[$b1],@x[$b1],@x[$c1]);
	verllf (@x[$b1],@x[$b1],12);
	vaf (@x[$c2],@x[$c2],@x[$d2]);
	vx (@x[$b2],@x[$b2],@x[$c2]);
	verllf (@x[$b2],@x[$b2],12);
	vaf (@x[$c3],@x[$c3],@x[$d3]);
	vx (@x[$b3],@x[$b3],@x[$c3]);
	verllf (@x[$b3],@x[$b3],12);

	vaf (@x[$a0],@x[$a0],@x[$b0]);
	vx (@x[$d0],@x[$d0],@x[$a0]);
	verllf (@x[$d0],@x[$d0],8);
	vaf (@x[$a1],@x[$a1],@x[$b1]);
	vx (@x[$d1],@x[$d1],@x[$a1]);
	verllf (@x[$d1],@x[$d1],8);
	vaf (@x[$a2],@x[$a2],@x[$b2]);
	vx (@x[$d2],@x[$d2],@x[$a2]);
	verllf (@x[$d2],@x[$d2],8);
	vaf (@x[$a3],@x[$a3],@x[$b3]);
	vx (@x[$d3],@x[$d3],@x[$a3]);
	verllf (@x[$d3],@x[$d3],8);

	vaf (@x[$c0],@x[$c0],@x[$d0]);
	vx (@x[$b0],@x[$b0],@x[$c0]);
	verllf (@x[$b0],@x[$b0],7);
	vaf (@x[$c1],@x[$c1],@x[$d1]);
	vx (@x[$b1],@x[$b1],@x[$c1]);
	verllf (@x[$b1],@x[$b1],7);
	vaf (@x[$c2],@x[$c2],@x[$d2]);
	vx (@x[$b2],@x[$b2],@x[$c2]);
	verllf (@x[$b2],@x[$b2],7);
	vaf (@x[$c3],@x[$c3],@x[$d3]);
	vx (@x[$b3],@x[$b3],@x[$c3]);
	verllf (@x[$b3],@x[$b3],7);
}

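# The 6x"horizontal" layout below is different: each block keeps its four
# state rows in four distinct registers (a,b,c,d), and six blocks are
# processed side by side. Between the "column" and "diagonal" rounds the
# b, c and d rows are rotated in-register with vsldb; the $odd flag
# selects whether we are diagonalizing the state or undoing it.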
sub VX_ROUND {
my @a=@_[0..5];
my @b=@_[6..11];
my @c=@_[12..17];
my @d=@_[18..23];
my $odd=@_[24];

	vaf (@a[$_],@a[$_],@b[$_]) for (0..5);
	vx (@d[$_],@d[$_],@a[$_]) for (0..5);
	verllf (@d[$_],@d[$_],16) for (0..5);

	vaf (@c[$_],@c[$_],@d[$_]) for (0..5);
	vx (@b[$_],@b[$_],@c[$_]) for (0..5);
	verllf (@b[$_],@b[$_],12) for (0..5);

	vaf (@a[$_],@a[$_],@b[$_]) for (0..5);
	vx (@d[$_],@d[$_],@a[$_]) for (0..5);
	verllf (@d[$_],@d[$_],8) for (0..5);

	vaf (@c[$_],@c[$_],@d[$_]) for (0..5);
	vx (@b[$_],@b[$_],@c[$_]) for (0..5);
	verllf (@b[$_],@b[$_],7) for (0..5);

	vsldb (@c[$_],@c[$_],@c[$_],8) for (0..5);
	vsldb (@b[$_],@b[$_],@b[$_],$odd?12:4) for (0..5);
	vsldb (@d[$_],@d[$_],@d[$_],$odd?4:12) for (0..5);
}

PERLASM_BEGIN($output);

INCLUDE ("s390x_arch.h");
TEXT ();

################
# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
#                     const unsigned int key[8], const unsigned int counter[4])
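# Arguments arrive in %r2..%r6 as per the s390x calling convention (hence
# the register map below). 'len' is in bytes; counter[0] is the 32-bit
# block counter and counter[1..3] the nonce, following the contract of
# OpenSSL's other ChaCha20_ctr32 implementations.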
my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
{
my $frame=$stdframe+4*20;
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));

GLOBL ("ChaCha20_ctr32");
TYPE ("ChaCha20_ctr32","\@function");
ALIGN (32);
LABEL ("ChaCha20_ctr32");
	larl ("%r1","OPENSSL_s390xcap_P");

	lghi ("%r0",64);
&{$z? \&ltgr:\&ltr} ($len,$len); # len==0?
	bzr ("%r14");
	lg ("%r1","S390X_STFLE+16(%r1)");
&{$z? \&clgr:\&clr} ($len,"%r0");
	jle (".Lshort");

	tmhh ("%r1",0x4000); # check for vx bit
	jnz (".LChaCha20_ctr32_vx");

LABEL (".Lshort");
&{$z? \&aghi:\&ahi} ($len,-64);
&{$z? \&lghi:\&lhi} ("%r1",-$frame);
&{$z? \&stmg:\&stm} ("%r6","%r15","6*$SIZE_T($sp)");
&{$z? \&slgr:\&slr} ($out,$inp); # difference
	la ($len,"0($inp,$len)"); # end of input minus 64
	larl ("%r7",".Lsigma");
	lgr ("%r0",$sp);
	la ($sp,"0(%r1,$sp)");
&{$z? \&stg:\&st} ("%r0","0($sp)");

	lmg ("%r8","%r11","0($key)"); # load key
	lmg ("%r12","%r13","0($counter)"); # load counter
	lmg ("%r6","%r7","0(%r7)"); # load sigma constant

	la ("%r14","0($inp)");
&{$z? \&stg:\&st} ($out,"$frame+3*$SIZE_T($sp)");
&{$z? \&stg:\&st} ($len,"$frame+4*$SIZE_T($sp)");
	stmg ("%r6","%r13","$stdframe($sp)"); # copy key schedule to stack
	srlg (@x[12],"%r12",32); # 32-bit counter value
	j (".Loop_outer");
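	# Stack layout: the 16-word key schedule (sigma, key, counter) sits
	# at $stdframe($sp); x[8..11], which don't fit in registers, are
	# spilled to the words right above it, at $stdframe+4*8+4*index.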

ALIGN (16);
LABEL (".Loop_outer");
	lm (@x[0],@x[7],"$stdframe+4*0($sp)"); # load x[0]-x[7]
	lm (@t[0],@t[1],"$stdframe+4*10($sp)"); # load x[10]-x[11]
	lm (@x[13],@x[15],"$stdframe+4*13($sp)"); # load x[13]-x[15]
	stm (@t[0],@t[1],"$stdframe+4*8+4*10($sp)"); # offload x[10]-x[11]
	lm (@t[0],@t[1],"$stdframe+4*8($sp)"); # load x[8]-x[9]
	st (@x[12],"$stdframe+4*12($sp)"); # save counter
&{$z? \&stg:\&st} ("%r14","$frame+2*$SIZE_T($sp)"); # save input pointer
	lhi ("%r14",10);
	j (".Loop");

ALIGN (4);
LABEL (".Loop");
	ROUND (0, 4, 8,12);
	ROUND (0, 5,10,15);
	brct ("%r14",".Loop");
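	# Ten iterations of the even/odd ROUND pair above make up the
	# 20 rounds of ChaCha20.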

&{$z? \&lg:\&l} ("%r14","$frame+2*$SIZE_T($sp)"); # pull input pointer
	stm (@t[0],@t[1],"$stdframe+4*8+4*8($sp)"); # offload x[8]-x[9]
&{$z? \&lmg:\&lm} (@t[0],@t[1],"$frame+3*$SIZE_T($sp)");

	al (@x[0],"$stdframe+4*0($sp)"); # accumulate key schedule
	al (@x[1],"$stdframe+4*1($sp)");
	al (@x[2],"$stdframe+4*2($sp)");
	al (@x[3],"$stdframe+4*3($sp)");
	al (@x[4],"$stdframe+4*4($sp)");
	al (@x[5],"$stdframe+4*5($sp)");
	al (@x[6],"$stdframe+4*6($sp)");
	al (@x[7],"$stdframe+4*7($sp)");
	lrvr (@x[0],@x[0]);
	lrvr (@x[1],@x[1]);
	lrvr (@x[2],@x[2]);
	lrvr (@x[3],@x[3]);
	lrvr (@x[4],@x[4]);
	lrvr (@x[5],@x[5]);
	lrvr (@x[6],@x[6]);
	lrvr (@x[7],@x[7]);
	al (@x[12],"$stdframe+4*12($sp)");
	al (@x[13],"$stdframe+4*13($sp)");
	al (@x[14],"$stdframe+4*14($sp)");
	al (@x[15],"$stdframe+4*15($sp)");
	lrvr (@x[12],@x[12]);
	lrvr (@x[13],@x[13]);
	lrvr (@x[14],@x[14]);
	lrvr (@x[15],@x[15]);
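	# lrvr byte-swaps each word: s390x is big-endian, while the ChaCha20
	# keystream is defined in terms of little-endian words.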

	la (@t[0],"0(@t[0],%r14)"); # reconstruct output pointer
&{$z? \&clgr:\&clr} ("%r14",@t[1]);
	jh (".Ltail");

	x (@x[0],"4*0(%r14)"); # xor with input
	x (@x[1],"4*1(%r14)");
	st (@x[0],"4*0(@t[0])"); # store output
	x (@x[2],"4*2(%r14)");
	st (@x[1],"4*1(@t[0])");
	x (@x[3],"4*3(%r14)");
	st (@x[2],"4*2(@t[0])");
	x (@x[4],"4*4(%r14)");
	st (@x[3],"4*3(@t[0])");
	lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); # load x[8]-x[11]
	x (@x[5],"4*5(%r14)");
	st (@x[4],"4*4(@t[0])");
	x (@x[6],"4*6(%r14)");
	al (@x[0],"$stdframe+4*8($sp)");
	st (@x[5],"4*5(@t[0])");
	x (@x[7],"4*7(%r14)");
	al (@x[1],"$stdframe+4*9($sp)");
	st (@x[6],"4*6(@t[0])");
	x (@x[12],"4*12(%r14)");
	al (@x[2],"$stdframe+4*10($sp)");
	st (@x[7],"4*7(@t[0])");
	x (@x[13],"4*13(%r14)");
	al (@x[3],"$stdframe+4*11($sp)");
	st (@x[12],"4*12(@t[0])");
	x (@x[14],"4*14(%r14)");
	st (@x[13],"4*13(@t[0])");
	x (@x[15],"4*15(%r14)");
	st (@x[14],"4*14(@t[0])");
	lrvr (@x[0],@x[0]);
	st (@x[15],"4*15(@t[0])");
	lrvr (@x[1],@x[1]);
	lrvr (@x[2],@x[2]);
	lrvr (@x[3],@x[3]);
	lhi (@x[12],1);
	x (@x[0],"4*8(%r14)");
	al (@x[12],"$stdframe+4*12($sp)"); # increment counter
	x (@x[1],"4*9(%r14)");
	st (@x[0],"4*8(@t[0])");
	x (@x[2],"4*10(%r14)");
	st (@x[1],"4*9(@t[0])");
	x (@x[3],"4*11(%r14)");
	st (@x[2],"4*10(@t[0])");
	st (@x[3],"4*11(@t[0])");

&{$z? \&clgr:\&clr} ("%r14",@t[1]); # done yet?
	la ("%r14","64(%r14)");
	jl (".Loop_outer");

LABEL (".Ldone");
	xgr ("%r0","%r0");
	xgr ("%r1","%r1");
	xgr ("%r2","%r2");
	xgr ("%r3","%r3");
	stmg ("%r0","%r3","$stdframe+4*4($sp)"); # wipe key copy
	stmg ("%r0","%r3","$stdframe+4*12($sp)");

&{$z? \&lmg:\&lm} ("%r6","%r15","$frame+6*$SIZE_T($sp)");
	br ("%r14");

ALIGN (16);
LABEL (".Ltail");
	la (@t[1],"64($t[1])");
	stm (@x[0],@x[7],"$stdframe+4*0($sp)");
&{$z? \&slgr:\&slr} (@t[1],"%r14");
	lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
&{$z? \&lghi:\&lhi} (@x[6],0);
	stm (@x[12],@x[15],"$stdframe+4*12($sp)");
	al (@x[0],"$stdframe+4*8($sp)");
	al (@x[1],"$stdframe+4*9($sp)");
	al (@x[2],"$stdframe+4*10($sp)");
	al (@x[3],"$stdframe+4*11($sp)");
	lrvr (@x[0],@x[0]);
	lrvr (@x[1],@x[1]);
	lrvr (@x[2],@x[2]);
	lrvr (@x[3],@x[3]);
	stm (@x[0],@x[3],"$stdframe+4*8($sp)");

LABEL (".Loop_tail");
	llgc (@x[4],"0(@x[6],%r14)");
	llgc (@x[5],"$stdframe(@x[6],$sp)");
	xr (@x[5],@x[4]);
	stc (@x[5],"0(@x[6],@t[0])");
	la (@x[6],"1(@x[6])");
	brct (@t[1],".Loop_tail");

	j (".Ldone");
SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32");
}

########################################################################
# The 4x"vertical" layout minimizes the instruction count, but leaves
# the pipeline underutilized [because of the vector instructions' high
# latency]. On the other hand, the minimum amount of data it takes the
# 6x path to fill its pipeline is higher, so short inputs would
# effectively be processed slower there. Hence this code path, which
# targets lengths of 256 bytes or less.
#
{
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15));
my @K=map("%v$_",(16..19));
my $CTR="%v26";
my ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30));
my $beperm="%v31";
my ($x00,$x10,$x20,$x30)=(0,map("r$_",(8..10)));
my $FRAME=$stdframe+4*16;

ALIGN (32);
LABEL ("ChaCha20_ctr32_4x");
LABEL (".LChaCha20_ctr32_4x");
&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
if (!$z) {
	std ("%f4","16*$SIZE_T+2*8($sp)");
	std ("%f6","16*$SIZE_T+3*8($sp)");
}
&{$z? \&lghi:\&lhi} ("%r1",-$FRAME);
	lgr ("%r0",$sp);
	la ($sp,"0(%r1,$sp)");
&{$z? \&stg:\&st} ("%r0","0($sp)"); # back-chain
if ($z) {
	std ("%f8","$stdframe+8*0($sp)");
	std ("%f9","$stdframe+8*1($sp)");
	std ("%f10","$stdframe+8*2($sp)");
	std ("%f11","$stdframe+8*3($sp)");
	std ("%f12","$stdframe+8*4($sp)");
	std ("%f13","$stdframe+8*5($sp)");
	std ("%f14","$stdframe+8*6($sp)");
	std ("%f15","$stdframe+8*7($sp)");
}
	larl ("%r7",".Lsigma");
	lhi ("%r0",10);
	lhi ("%r1",0);

	vl (@K[0],"0(%r7)"); # load sigma
	vl (@K[1],"0($key)"); # load key
	vl (@K[2],"16($key)");
	vl (@K[3],"0($counter)"); # load counter

	vl ($beperm,"0x40(%r7)");
	vl ($xt1,"0x50(%r7)");
	vrepf ($CTR,@K[3],0);
	vlvgf (@K[3],"%r1",0); # clear @K[3].word[0]
	vaf ($CTR,$CTR,$xt1);
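	# $CTR lanes now hold counter+0..counter+3, one lane per block.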

#LABEL (".Loop_outer_4x");
	vlm ($xa0,$xa3,"0x60(%r7)"); # load [smashed] sigma

	vrepf ($xb0,@K[1],0); # smash the key
	vrepf ($xb1,@K[1],1);
	vrepf ($xb2,@K[1],2);
	vrepf ($xb3,@K[1],3);

	vrepf ($xc0,@K[2],0);
	vrepf ($xc1,@K[2],1);
	vrepf ($xc2,@K[2],2);
	vrepf ($xc3,@K[2],3);

	vlr ($xd0,$CTR);
	vrepf ($xd1,@K[3],1);
	vrepf ($xd2,@K[3],2);
	vrepf ($xd3,@K[3],3);

LABEL (".Loop_4x");
	VX_lane_ROUND(0, 4, 8,12);
	VX_lane_ROUND(0, 5,10,15);
	brct ("%r0",".Loop_4x");

	vaf ($xd0,$xd0,$CTR);

	vmrhf ($xt0,$xa0,$xa1); # transpose data
	vmrhf ($xt1,$xa2,$xa3);
	vmrlf ($xt2,$xa0,$xa1);
	vmrlf ($xt3,$xa2,$xa3);
	vpdi ($xa0,$xt0,$xt1,0b0000);
	vpdi ($xa1,$xt0,$xt1,0b0101);
	vpdi ($xa2,$xt2,$xt3,0b0000);
	vpdi ($xa3,$xt2,$xt3,0b0101);

	vmrhf ($xt0,$xb0,$xb1);
	vmrhf ($xt1,$xb2,$xb3);
	vmrlf ($xt2,$xb0,$xb1);
	vmrlf ($xt3,$xb2,$xb3);
	vpdi ($xb0,$xt0,$xt1,0b0000);
	vpdi ($xb1,$xt0,$xt1,0b0101);
	vpdi ($xb2,$xt2,$xt3,0b0000);
	vpdi ($xb3,$xt2,$xt3,0b0101);

	vmrhf ($xt0,$xc0,$xc1);
	vmrhf ($xt1,$xc2,$xc3);
	vmrlf ($xt2,$xc0,$xc1);
	vmrlf ($xt3,$xc2,$xc3);
	vpdi ($xc0,$xt0,$xt1,0b0000);
	vpdi ($xc1,$xt0,$xt1,0b0101);
	vpdi ($xc2,$xt2,$xt3,0b0000);
	vpdi ($xc3,$xt2,$xt3,0b0101);

	vmrhf ($xt0,$xd0,$xd1);
	vmrhf ($xt1,$xd2,$xd3);
	vmrlf ($xt2,$xd0,$xd1);
	vmrlf ($xt3,$xd2,$xd3);
	vpdi ($xd0,$xt0,$xt1,0b0000);
	vpdi ($xd1,$xt0,$xt1,0b0101);
	vpdi ($xd2,$xt2,$xt3,0b0000);
	vpdi ($xd3,$xt2,$xt3,0b0101);
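	# Each vmrhf/vmrlf+vpdi group above is a 4x4 word transpose: it
	# turns the lane-sliced registers back into contiguous 16-byte rows
	# of each block, so whole 64-byte blocks can be XORed with the input.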

	#vrepif ($xt0,4);
	#vaf ($CTR,$CTR,$xt0); # next counter value

	vaf ($xa0,$xa0,@K[0]);
	vaf ($xb0,$xb0,@K[1]);
	vaf ($xc0,$xc0,@K[2]);
	vaf ($xd0,$xd0,@K[3]);

	vperm ($xa0,$xa0,$xa0,$beperm);
	vperm ($xb0,$xb0,$xb0,$beperm);
	vperm ($xc0,$xc0,$xc0,$beperm);
	vperm ($xd0,$xd0,$xd0,$beperm);

	#&{$z? \&clgfi:\&clfi} ($len,0x40);
	#jl (".Ltail_4x");

	vlm ($xt0,$xt3,"0($inp)");

	vx ($xt0,$xt0,$xa0);
	vx ($xt1,$xt1,$xb0);
	vx ($xt2,$xt2,$xc0);
	vx ($xt3,$xt3,$xd0);

	vstm ($xt0,$xt3,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
	#je (".Ldone_4x");

	vaf ($xa0,$xa1,@K[0]);
	vaf ($xb0,$xb1,@K[1]);
	vaf ($xc0,$xc1,@K[2]);
	vaf ($xd0,$xd1,@K[3]);

	vperm ($xa0,$xa0,$xa0,$beperm);
	vperm ($xb0,$xb0,$xb0,$beperm);
	vperm ($xc0,$xc0,$xc0,$beperm);
	vperm ($xd0,$xd0,$xd0,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_4x");

	vlm ($xt0,$xt3,"0($inp)");

	vx ($xt0,$xt0,$xa0);
	vx ($xt1,$xt1,$xb0);
	vx ($xt2,$xt2,$xc0);
	vx ($xt3,$xt3,$xd0);

	vstm ($xt0,$xt3,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
	je (".Ldone_4x");

	vaf ($xa0,$xa2,@K[0]);
	vaf ($xb0,$xb2,@K[1]);
	vaf ($xc0,$xc2,@K[2]);
	vaf ($xd0,$xd2,@K[3]);

	vperm ($xa0,$xa0,$xa0,$beperm);
	vperm ($xb0,$xb0,$xb0,$beperm);
	vperm ($xc0,$xc0,$xc0,$beperm);
	vperm ($xd0,$xd0,$xd0,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_4x");

	vlm ($xt0,$xt3,"0($inp)");

	vx ($xt0,$xt0,$xa0);
	vx ($xt1,$xt1,$xb0);
	vx ($xt2,$xt2,$xc0);
	vx ($xt3,$xt3,$xd0);

	vstm ($xt0,$xt3,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
	je (".Ldone_4x");

	vaf ($xa0,$xa3,@K[0]);
	vaf ($xb0,$xb3,@K[1]);
	vaf ($xc0,$xc3,@K[2]);
	vaf ($xd0,$xd3,@K[3]);

	vperm ($xa0,$xa0,$xa0,$beperm);
	vperm ($xb0,$xb0,$xb0,$beperm);
	vperm ($xc0,$xc0,$xc0,$beperm);
	vperm ($xd0,$xd0,$xd0,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_4x");

	vlm ($xt0,$xt3,"0($inp)");

	vx ($xt0,$xt0,$xa0);
	vx ($xt1,$xt1,$xb0);
	vx ($xt2,$xt2,$xc0);
	vx ($xt3,$xt3,$xd0);

	vstm ($xt0,$xt3,"0($out)");

	#la ($inp,"0x40($inp)");
	#la ($out,"0x40($out)");
	#lhi ("%r0",10);
	#&{$z? \&aghi:\&ahi} ($len,-0x40);
	#jne (".Loop_outer_4x");

LABEL (".Ldone_4x");
if (!$z) {
	ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld ("%f8","$stdframe+8*0($sp)");
	ld ("%f9","$stdframe+8*1($sp)");
	ld ("%f10","$stdframe+8*2($sp)");
	ld ("%f11","$stdframe+8*3($sp)");
	ld ("%f12","$stdframe+8*4($sp)");
	ld ("%f13","$stdframe+8*5($sp)");
	ld ("%f14","$stdframe+8*6($sp)");
	ld ("%f15","$stdframe+8*7($sp)");
}
&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la ($sp,"$FRAME($sp)");
	br ("%r14");

ALIGN (16);
LABEL (".Ltail_4x");
if (!$z) {
	vlr ($xt0,$xb0);
	ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");

	vst ($xa0,"$stdframe+0x00($sp)");
	vst ($xt0,"$stdframe+0x10($sp)");
	vst ($xc0,"$stdframe+0x20($sp)");
	vst ($xd0,"$stdframe+0x30($sp)");
} else {
	vlr ($xt0,$xc0);
	ld ("%f8","$stdframe+8*0($sp)");
	ld ("%f9","$stdframe+8*1($sp)");
	ld ("%f10","$stdframe+8*2($sp)");
	ld ("%f11","$stdframe+8*3($sp)");
	vlr ($xt1,$xd0);
	ld ("%f12","$stdframe+8*4($sp)");
	ld ("%f13","$stdframe+8*5($sp)");
	ld ("%f14","$stdframe+8*6($sp)");
	ld ("%f15","$stdframe+8*7($sp)");

	vst ($xa0,"$stdframe+0x00($sp)");
	vst ($xb0,"$stdframe+0x10($sp)");
	vst ($xt0,"$stdframe+0x20($sp)");
	vst ($xt1,"$stdframe+0x30($sp)");
}
	lghi ("%r1",0);

LABEL (".Loop_tail_4x");
	llgc ("%r5","0(%r1,$inp)");
	llgc ("%r6","$stdframe(%r1,$sp)");
	xr ("%r6","%r5");
	stc ("%r6","0(%r1,$out)");
	la ("%r1","1(%r1)");
	brct ($len,".Loop_tail_4x");

&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la ($sp,"$FRAME($sp)");
	br ("%r14");
SIZE ("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x");
}

########################################################################
# The 6x"horizontal" layout is the optimal fit for the platform in its
# current shape, more specifically for the given vector instruction
# latency. The computational part of an 8x"vertical" layout would be
# faster, but it would consume all the vector registers, and dealing
# with that would diminish the return...
#
{
my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
    $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
    $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23));
my @K=map("%v$_",(27,24..26));
my ($t0,$t1,$t2,$t3)=map("%v$_",27..30);
my $beperm="%v31";
my $FRAME=$stdframe+4*16;

GLOBL ("ChaCha20_ctr32_vx");
ALIGN (32);
LABEL ("ChaCha20_ctr32_vx");
LABEL (".LChaCha20_ctr32_vx");
&{$z? \&clgfi:\&clfi} ($len,256);
	jle (".LChaCha20_ctr32_4x");
&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
if (!$z) {
	std ("%f4","16*$SIZE_T+2*8($sp)");
	std ("%f6","16*$SIZE_T+3*8($sp)");
}
&{$z? \&lghi:\&lhi} ("%r1",-$FRAME);
	lgr ("%r0",$sp);
	la ($sp,"0(%r1,$sp)");
&{$z? \&stg:\&st} ("%r0","0($sp)"); # back-chain
if ($z) {
	std ("%f8","$FRAME-8*8($sp)");
	std ("%f9","$FRAME-8*7($sp)");
	std ("%f10","$FRAME-8*6($sp)");
	std ("%f11","$FRAME-8*5($sp)");
	std ("%f12","$FRAME-8*4($sp)");
	std ("%f13","$FRAME-8*3($sp)");
	std ("%f14","$FRAME-8*2($sp)");
	std ("%f15","$FRAME-8*1($sp)");
}
	larl ("%r7",".Lsigma");
	lhi ("%r0",10);

	vlm (@K[1],@K[2],"0($key)"); # load key
	vl (@K[3],"0($counter)"); # load counter

	vlm (@K[0],"$beperm","0(%r7)"); # load sigma, increments, ...

LABEL (".Loop_outer_vx");
	vlr ($a0,@K[0]);
	vlr ($b0,@K[1]);
	vlr ($a1,@K[0]);
	vlr ($b1,@K[1]);
	vlr ($a2,@K[0]);
	vlr ($b2,@K[1]);
	vlr ($a3,@K[0]);
	vlr ($b3,@K[1]);
	vlr ($a4,@K[0]);
	vlr ($b4,@K[1]);
	vlr ($a5,@K[0]);
	vlr ($b5,@K[1]);

	vlr ($d0,@K[3]);
	vaf ($d1,@K[3],$t1); # K[3]+1
	vaf ($d2,@K[3],$t2); # K[3]+2
	vaf ($d3,@K[3],$t3); # K[3]+3
	vaf ($d4,$d2,$t2); # K[3]+4
	vaf ($d5,$d2,$t3); # K[3]+5

	vlr ($c0,@K[2]);
	vlr ($c1,@K[2]);
	vlr ($c2,@K[2]);
	vlr ($c3,@K[2]);
	vlr ($c4,@K[2]);
	vlr ($c5,@K[2]);

	vlr ($t1,$d1);
	vlr ($t2,$d2);
	vlr ($t3,$d3);

ALIGN (4);
LABEL (".Loop_vx");

	VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
		 $b0,$b1,$b2,$b3,$b4,$b5,
		 $c0,$c1,$c2,$c3,$c4,$c5,
		 $d0,$d1,$d2,$d3,$d4,$d5,
		 0);

	VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
		 $b0,$b1,$b2,$b3,$b4,$b5,
		 $c0,$c1,$c2,$c3,$c4,$c5,
		 $d0,$d1,$d2,$d3,$d4,$d5,
		 1);

	brct ("%r0",".Loop_vx");

	vaf ($a0,$a0,@K[0]);
	vaf ($b0,$b0,@K[1]);
	vaf ($c0,$c0,@K[2]);
	vaf ($d0,$d0,@K[3]);
	vaf ($a1,$a1,@K[0]);
	vaf ($d1,$d1,$t1); # +K[3]+1

	vperm ($a0,$a0,$a0,$beperm);
	vperm ($b0,$b0,$b0,$beperm);
	vperm ($c0,$c0,$c0,$beperm);
	vperm ($d0,$d0,$d0,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_vx");

	vaf ($d2,$d2,$t2); # +K[3]+2
	vaf ($d3,$d3,$t3); # +K[3]+3
	vlm ($t0,$t3,"0($inp)");

	vx ($a0,$a0,$t0);
	vx ($b0,$b0,$t1);
	vx ($c0,$c0,$t2);
	vx ($d0,$d0,$t3);

	vlm (@K[0],$t3,"0(%r7)"); # re-load sigma and increments

	vstm ($a0,$d0,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
	je (".Ldone_vx");

	vaf ($b1,$b1,@K[1]);
	vaf ($c1,$c1,@K[2]);

	vperm ($a0,$a1,$a1,$beperm);
	vperm ($b0,$b1,$b1,$beperm);
	vperm ($c0,$c1,$c1,$beperm);
	vperm ($d0,$d1,$d1,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_vx");

	vlm ($a1,$d1,"0($inp)");

	vx ($a0,$a0,$a1);
	vx ($b0,$b0,$b1);
	vx ($c0,$c0,$c1);
	vx ($d0,$d0,$d1);

	vstm ($a0,$d0,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
	je (".Ldone_vx");

	vaf ($a2,$a2,@K[0]);
	vaf ($b2,$b2,@K[1]);
	vaf ($c2,$c2,@K[2]);

	vperm ($a0,$a2,$a2,$beperm);
	vperm ($b0,$b2,$b2,$beperm);
	vperm ($c0,$c2,$c2,$beperm);
	vperm ($d0,$d2,$d2,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_vx");

	vlm ($a1,$d1,"0($inp)");

	vx ($a0,$a0,$a1);
	vx ($b0,$b0,$b1);
	vx ($c0,$c0,$c1);
	vx ($d0,$d0,$d1);

	vstm ($a0,$d0,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
	je (".Ldone_vx");

	vaf ($a3,$a3,@K[0]);
	vaf ($b3,$b3,@K[1]);
	vaf ($c3,$c3,@K[2]);
	vaf ($d2,@K[3],$t3); # K[3]+3

	vperm ($a0,$a3,$a3,$beperm);
	vperm ($b0,$b3,$b3,$beperm);
	vperm ($c0,$c3,$c3,$beperm);
	vperm ($d0,$d3,$d3,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_vx");

	vaf ($d3,$d2,$t1); # K[3]+4
	vlm ($a1,$d1,"0($inp)");

	vx ($a0,$a0,$a1);
	vx ($b0,$b0,$b1);
	vx ($c0,$c0,$c1);
	vx ($d0,$d0,$d1);

	vstm ($a0,$d0,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
	je (".Ldone_vx");

	vaf ($a4,$a4,@K[0]);
	vaf ($b4,$b4,@K[1]);
	vaf ($c4,$c4,@K[2]);
	vaf ($d4,$d4,$d3); # +K[3]+4
	vaf ($d3,$d3,$t1); # K[3]+5
	vaf (@K[3],$d2,$t3); # K[3]+=6

	vperm ($a0,$a4,$a4,$beperm);
	vperm ($b0,$b4,$b4,$beperm);
	vperm ($c0,$c4,$c4,$beperm);
	vperm ($d0,$d4,$d4,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_vx");

	vlm ($a1,$d1,"0($inp)");

	vx ($a0,$a0,$a1);
	vx ($b0,$b0,$b1);
	vx ($c0,$c0,$c1);
	vx ($d0,$d0,$d1);

	vstm ($a0,$d0,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
	je (".Ldone_vx");

	vaf ($a5,$a5,@K[0]);
	vaf ($b5,$b5,@K[1]);
	vaf ($c5,$c5,@K[2]);
	vaf ($d5,$d5,$d3); # +K[3]+5

	vperm ($a0,$a5,$a5,$beperm);
	vperm ($b0,$b5,$b5,$beperm);
	vperm ($c0,$c5,$c5,$beperm);
	vperm ($d0,$d5,$d5,$beperm);

&{$z? \&clgfi:\&clfi} ($len,0x40);
	jl (".Ltail_vx");

	vlm ($a1,$d1,"0($inp)");

	vx ($a0,$a0,$a1);
	vx ($b0,$b0,$b1);
	vx ($c0,$c0,$c1);
	vx ($d0,$d0,$d1);

	vstm ($a0,$d0,"0($out)");

	la ($inp,"0x40($inp)");
	la ($out,"0x40($out)");
	lhi ("%r0",10);
&{$z? \&aghi:\&ahi} ($len,-0x40);
	jne (".Loop_outer_vx");

LABEL (".Ldone_vx");
if (!$z) {
	ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld ("%f8","$FRAME-8*8($sp)");
	ld ("%f9","$FRAME-8*7($sp)");
	ld ("%f10","$FRAME-8*6($sp)");
	ld ("%f11","$FRAME-8*5($sp)");
	ld ("%f12","$FRAME-8*4($sp)");
	ld ("%f13","$FRAME-8*3($sp)");
	ld ("%f14","$FRAME-8*2($sp)");
	ld ("%f15","$FRAME-8*1($sp)");
}
&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la ($sp,"$FRAME($sp)");
	br ("%r14");

ALIGN (16);
LABEL (".Ltail_vx");
if (!$z) {
	ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld ("%f8","$FRAME-8*8($sp)");
	ld ("%f9","$FRAME-8*7($sp)");
	ld ("%f10","$FRAME-8*6($sp)");
	ld ("%f11","$FRAME-8*5($sp)");
	ld ("%f12","$FRAME-8*4($sp)");
	ld ("%f13","$FRAME-8*3($sp)");
	ld ("%f14","$FRAME-8*2($sp)");
	ld ("%f15","$FRAME-8*1($sp)");
}
	vstm ($a0,$d0,"$stdframe($sp)");
	lghi ("%r1",0);

LABEL (".Loop_tail_vx");
	llgc ("%r5","0(%r1,$inp)");
	llgc ("%r6","$stdframe(%r1,$sp)");
	xr ("%r6","%r5");
	stc ("%r6","0(%r1,$out)");
	la ("%r1","1(%r1)");
	brct ($len,".Loop_tail_vx");

&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la ($sp,"$FRAME($sp)");
	br ("%r14");
SIZE ("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");
}
################

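# Constant pool. Offsets as used above: 0x00 sigma, 0x10/0x20/0x30 counter
# increments +1/+2/+3, 0x40 big-endian byte-swap permutation, 0x50 per-lane
# counters 0..3 (4x path), 0x60.. sigma with each word "smashed" across a
# full vector (4x path).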
ALIGN (32);
LABEL (".Lsigma");
LONG (0x61707865,0x3320646e,0x79622d32,0x6b206574); # endian-neutral sigma
LONG (1,0,0,0);
LONG (2,0,0,0);
LONG (3,0,0,0);
LONG (0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c); # byte swap

LONG (0,1,2,3);
LONG (0x61707865,0x61707865,0x61707865,0x61707865); # smashed sigma
LONG (0x3320646e,0x3320646e,0x3320646e,0x3320646e);
LONG (0x79622d32,0x79622d32,0x79622d32,0x79622d32);
LONG (0x6b206574,0x6b206574,0x6b206574,0x6b206574);

ASCIZ ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
ALIGN (4);

PERLASM_END();