source: vbox/trunk/src/libs/libpng-1.2.8/pnggccrd.c@13371

Last change on this file since 13371 was 6393, checked in by vboxsync, 17 years ago

export libpng and zlib so that Windows and OS/2 build cleanly.

1/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
2 *
3 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
4 *
5 * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
6 * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
7 * for Intel's performance analysis of the MMX vs. non-MMX code.
8 *
9 * libpng version 1.2.8 - December 3, 2004
10 * For conditions of distribution and use, see copyright notice in png.h
11 * Copyright (c) 1998-2004 Glenn Randers-Pehrson
12 * Copyright (c) 1998, Intel Corporation
13 *
14 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
15 * Interface to libpng contributed by Gilles Vollant, 1999.
16 * GNU C port by Greg Roelofs, 1999-2001.
17 *
18 * Lines 2350-4300 converted in place with intel2gas 1.3.1:
19 *
20 * intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
21 *
22 * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
23 *
24 * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
25 * is required to assemble the newer MMX instructions such as movq.
26 * For djgpp, see
27 *
28 * ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
29 *
30 * (or a later version in the same directory). For Linux, check your
31 * distribution's web site(s) or try these links:
32 *
33 * http://rufus.w3.org/linux/RPM/binutils.html
34 * http://www.debian.org/Packages/stable/devel/binutils.html
35 * ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
36 * binutils.tgz
37 *
38 * For other platforms, see the main GNU site:
39 *
40 * ftp://ftp.gnu.org/pub/gnu/binutils/
41 *
42 * Version 2.5.2l.15 is definitely too old...
43 */
44
45/*
46 * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47 * =====================================
48 *
49 * 19991006:
50 * - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
51 *
52 * 19991007:
53 * - additional optimizations (possible or definite):
54 * x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55 * - write MMX code for 48-bit case (pixel_bytes == 6)
56 * - figure out what's up with 24-bit case (pixel_bytes == 3):
57 * why subtract 8 from width_mmx in the pass 4/5 case?
58 * (only width_mmx case) (near line 1606)
59 * x [DONE] replace pixel_bytes within each block with the true
60 * constant value (or are compilers smart enough to do that?)
61 * - rewrite all MMX interlacing code so it's aligned with
62 * the *beginning* of the row buffer, not the end. This
63 * would not only allow one to eliminate half of the memory
64 * writes for odd passes (that is, pass == odd), it may also
65 * eliminate some unaligned-data-access exceptions (assuming
66 * there's a penalty for not aligning 64-bit accesses on
67 * 64-bit boundaries). The only catch is that the "leftover"
68 * pixel(s) at the end of the row would have to be saved,
69 * but there are enough unused MMX registers in every case,
70 * so this is not a problem. A further benefit is that the
71 * post-MMX cleanup code (C code) in at least some of the
72 * cases could be done within the assembler block.
73 * x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74 * inconsistent, and don't match the MMX Programmer's Reference
75 * Manual conventions anyway. They should be changed to
76 * "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77 * was lowest in memory (e.g., corresponding to a left pixel)
78 * and b7 is the byte that was highest (e.g., a right pixel).
79 *
80 * 19991016:
81 * - Brennan's Guide notwithstanding, gcc under Linux does *not*
82 * want globals prefixed by underscores when referencing them--
83 * i.e., if the variable is const4, then refer to it as const4,
84 * not _const4. This seems to be a djgpp-specific requirement.
85 * Also, such variables apparently *must* be declared outside
86 * of functions; neither static nor automatic variables work if
87 * defined within the scope of a single function, but both
88 * static and truly global (multi-module) variables work fine.
89 *
90 * 19991023:
91 * - fixed png_combine_row() non-MMX replication bug (odd passes only?)
92 * - switched from string-concatenation-with-macros to cleaner method of
93 * renaming global variables for djgpp--i.e., always use prefixes in
94 * inlined assembler code (== strings) and conditionally rename the
95 * variables, not the other way around. Hence _const4, _mask8_0, etc.
96 *
97 * 19991024:
98 * - fixed mmxsupport()/png_do_read_interlace() first-row bug
99 * This one was severely weird: even though mmxsupport() doesn't touch
100 * ebx (where "row" pointer was stored), it nevertheless managed to zero
101 * the register (even in static/non-fPIC code--see below), which in turn
102 * caused png_do_read_interlace() to return prematurely on the first row of
103 * interlaced images (i.e., without expanding the interlaced pixels).
104 * Inspection of the generated assembly code didn't turn up any clues,
105 * although it did point at a minor optimization (i.e., get rid of
106 * mmx_supported_local variable and just use eax). Possibly the CPUID
107 * instruction is more destructive than it looks? (Not yet checked.)
108 * - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
109 * listings... Apparently register spillage has to do with ebx, since
110 * it's used to index the global offset table. Commenting it out of the
111 * input-reg lists in png_combine_row() eliminated compiler barfage, so
112 * ifdef'd with __PIC__ macro: if defined, use a global for unmask
113 *
114 * 19991107:
115 * - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
116 * "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
117 *
118 * 19991120:
119 * - made "diff" variable (now "_dif") global to simplify conversion of
120 * filtering routines (running out of regs, sigh). "diff" is still used
121 * in interlacing routines, however.
122 * - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
123 * macro determines which is used); original not yet tested.
124 *
125 * 20000213:
126 * - when compiling with gcc, be sure to use -fomit-frame-pointer
127 *
128 * 20000319:
129 * - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
130 * pass == 4 or 5, that caused visible corruption of interlaced images
131 *
132 * 20000623:
133 * - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
134 * many of the form "forbidden register 0 (ax) was spilled for class AREG."
135 * This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
136 * Chuck Wilson supplied a patch involving dummy output registers. See
137 * http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
138 * for the original (anonymous) SourceForge bug report.
139 *
140 * 20000706:
141 * - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
142 * pnggccrd.c: In function `png_combine_row':
143 * pnggccrd.c:525: more than 10 operands in `asm'
144 * pnggccrd.c:669: more than 10 operands in `asm'
145 * pnggccrd.c:828: more than 10 operands in `asm'
146 * pnggccrd.c:994: more than 10 operands in `asm'
147 * pnggccrd.c:1177: more than 10 operands in `asm'
148 * They are all the same problem and can be worked around by using the
149 * global _unmask variable unconditionally, not just in the -fPIC case.
150 * Reportedly earlier versions of gcc also have the problem with more than
151 * 10 operands; they just don't report it. Much strangeness ensues, etc.
152 *
153 * 20000729:
154 * - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
155 * MMX routine); began converting png_read_filter_row_mmx_sub()
156 * - to finish remaining sections:
157 * - clean up indentation and comments
158 * - preload local variables
159 * - add output and input regs (order of former determines numerical
160 * mapping of latter)
161 * - avoid all usage of ebx (including bx, bh, bl) register [20000823]
162 * - remove "$" from addressing of Shift and Mask variables [20000823]
163 *
164 * 20000731:
165 * - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
166 *
167 * 20000822:
168 * - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
169 * shared-library (-fPIC) version! Code works just fine as part of static
170 * library. Damn damn damn damn damn, should have tested that sooner.
171 * ebx is getting clobbered again (explicitly this time); need to save it
172 * on stack or rewrite asm code to avoid using it altogether. Blargh!
173 *
174 * 20000823:
175 * - first section was trickiest; all remaining sections have ebx -> edx now.
176 * (-fPIC works again.) Also added missing underscores to various Shift*
177 * and *Mask* globals and got rid of leading "$" signs.
178 *
179 * 20000826:
180 * - added visual separators to help navigate microscopic printed copies
181 * (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
182 * on png_read_filter_row_mmx_avg()
183 *
184 * 20000828:
185 * - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
186 * What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
187 * cleaned up/shortened in either routine, but functionality is complete
188 * and seems to be working fine.
189 *
190 * 20000829:
191 * - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
192 * as an input reg (with dummy output variables, etc.), then it *cannot*
193 * also appear in the clobber list or gcc 2.95.2 will barf. The solution
194 * is simple enough...
195 *
196 * 20000914:
197 * - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
198 * correctly (but 48-bit RGB just fine)
199 *
200 * 20000916:
201 * - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
202 * - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
203 * - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
204 * - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
205 *
206 * 20010101:
207 * - added new png_init_mmx_flags() function (here only because it needs to
208 * call mmxsupport(), which should probably become global png_mmxsupport());
209 * modified other MMX routines to run conditionally (png_ptr->asm_flags)
210 *
211 * 20010103:
212 * - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
213 * and made it public; moved png_init_mmx_flags() to png.c as internal func
214 *
215 * 20010104:
216 * - removed dependency on png_read_filter_row_c() (C code already duplicated
217 * within MMX version of png_read_filter_row()) so no longer necessary to
218 * compile it into pngrutil.o
219 *
220 * 20010310:
221 * - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
222 *
223 * 20020304:
224 * - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
225 *
226 * 20040724:
227 * - more tinkering with clobber list at lines 4529 and 5033, to get
228 * it to compile on gcc-3.4.
229 *
230 * STILL TO DO:
231 * - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
232 * - write MMX code for 48-bit case (pixel_bytes == 6)
233 * - figure out what's up with 24-bit case (pixel_bytes == 3):
234 * why subtract 8 from width_mmx in the pass 4/5 case?
235 * (only width_mmx case) (near line 1606)
236 * - rewrite all MMX interlacing code so it's aligned with beginning
237 * of the row buffer, not the end (see 19991007 for details)
238 * x pick one version of mmxsupport() and get rid of the other
239 * - add error messages to any remaining bogus default cases
240 * - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
241 * x add support for runtime enable/disable/query of various MMX routines
242 */
243
244#define PNG_INTERNAL
245#include "png.h"
246
247#if defined(PNG_USE_PNGGCCRD)
248
249int PNGAPI png_mmx_support(void);
250
251#ifdef PNG_USE_LOCAL_ARRAYS
252static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
253static const int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
254static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
255#endif
256
257#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
258/* djgpp, Win32, and Cygwin add their own underscores to global variables,
259 * so define them here without the leading underscore: */
260#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
261# define _mmx_supported mmx_supported
262# define _const4 const4
263# define _const6 const6
264# define _mask8_0 mask8_0
265# define _mask16_1 mask16_1
266# define _mask16_0 mask16_0
267# define _mask24_2 mask24_2
268# define _mask24_1 mask24_1
269# define _mask24_0 mask24_0
270# define _mask32_3 mask32_3
271# define _mask32_2 mask32_2
272# define _mask32_1 mask32_1
273# define _mask32_0 mask32_0
274# define _mask48_5 mask48_5
275# define _mask48_4 mask48_4
276# define _mask48_3 mask48_3
277# define _mask48_2 mask48_2
278# define _mask48_1 mask48_1
279# define _mask48_0 mask48_0
280# define _LBCarryMask LBCarryMask
281# define _HBClearMask HBClearMask
282# define _ActiveMask ActiveMask
283# define _ActiveMask2 ActiveMask2
284# define _ActiveMaskEnd ActiveMaskEnd
285# define _ShiftBpp ShiftBpp
286# define _ShiftRem ShiftRem
287#ifdef PNG_THREAD_UNSAFE_OK
288# define _unmask unmask
289# define _FullLength FullLength
290# define _MMXLength MMXLength
291# define _dif dif
292# define _patemp patemp
293# define _pbtemp pbtemp
294# define _pctemp pctemp
295#endif
296#endif
297
298
299/* These constants are used in the inlined MMX assembly code.
300 Ignore gcc's "At top level: defined but not used" warnings. */
301
302/* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
303 * since that case uses the %ebx register for indexing the Global Offset Table
304 * and there were no other registers available. But gcc 2.95 and later emit
305 * "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
306 * in the non-PIC case, so we'll just use the global unconditionally now.
307 */
308#ifdef PNG_THREAD_UNSAFE_OK
309static int _unmask;
310#endif
311
312static unsigned long long _mask8_0 = 0x0102040810204080LL;
313
314static unsigned long long _mask16_1 = 0x0101020204040808LL;
315static unsigned long long _mask16_0 = 0x1010202040408080LL;
316
317static unsigned long long _mask24_2 = 0x0101010202020404LL;
318static unsigned long long _mask24_1 = 0x0408080810101020LL;
319static unsigned long long _mask24_0 = 0x2020404040808080LL;
320
321static unsigned long long _mask32_3 = 0x0101010102020202LL;
322static unsigned long long _mask32_2 = 0x0404040408080808LL;
323static unsigned long long _mask32_1 = 0x1010101020202020LL;
324static unsigned long long _mask32_0 = 0x4040404080808080LL;
325
326static unsigned long long _mask48_5 = 0x0101010101010202LL;
327static unsigned long long _mask48_4 = 0x0202020204040404LL;
328static unsigned long long _mask48_3 = 0x0404080808080808LL;
329static unsigned long long _mask48_2 = 0x1010101010102020LL;
330static unsigned long long _mask48_1 = 0x2020202040404040LL;
331static unsigned long long _mask48_0 = 0x4040808080808080LL;
332
333static unsigned long long _const4 = 0x0000000000FFFFFFLL;
334//static unsigned long long _const5 = 0x000000FFFFFF0000LL; // NOT USED
335static unsigned long long _const6 = 0x00000000000000FFLL;
336
337// These are used in the row-filter routines and should/would be local
338// variables if not for gcc addressing limitations.
339// WARNING: Their presence probably defeats the thread safety of libpng.
340
341#ifdef PNG_THREAD_UNSAFE_OK
342static png_uint_32 _FullLength;
343static png_uint_32 _MMXLength;
344static int _dif;
345static int _patemp; // temp variables for Paeth routine
346static int _pbtemp;
347static int _pctemp;
348#endif
349
350void /* PRIVATE */
351png_squelch_warnings(void)
352{
353#ifdef PNG_THREAD_UNSAFE_OK
354 _dif = _dif;
355 _patemp = _patemp;
356 _pbtemp = _pbtemp;
357 _pctemp = _pctemp;
358 _MMXLength = _MMXLength;
359#endif
360 _const4 = _const4;
361 _const6 = _const6;
362 _mask8_0 = _mask8_0;
363 _mask16_1 = _mask16_1;
364 _mask16_0 = _mask16_0;
365 _mask24_2 = _mask24_2;
366 _mask24_1 = _mask24_1;
367 _mask24_0 = _mask24_0;
368 _mask32_3 = _mask32_3;
369 _mask32_2 = _mask32_2;
370 _mask32_1 = _mask32_1;
371 _mask32_0 = _mask32_0;
372 _mask48_5 = _mask48_5;
373 _mask48_4 = _mask48_4;
374 _mask48_3 = _mask48_3;
375 _mask48_2 = _mask48_2;
376 _mask48_1 = _mask48_1;
377 _mask48_0 = _mask48_0;
378}
379#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
380
381
382static int _mmx_supported = 2;
383
384/*===========================================================================*/
385/* */
386/* P N G _ C O M B I N E _ R O W */
387/* */
388/*===========================================================================*/
389
390#if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
391
392#define BPP2 2
393#define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
394#define BPP4 4
395#define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
396#define BPP8 8
397
398/* Combines the row recently read in with the previous row.
399 This routine takes care of alpha and transparency if requested.
400 This routine also handles the two methods of progressive display
401 of interlaced images, depending on the mask value.
402 The mask value describes which pixels are to be combined with
403 the row. The pattern always repeats every 8 pixels, so just 8
404 bits are needed. A one indicates the pixel is to be combined; a
405 zero indicates the pixel is to be skipped. This is in addition
406 to any alpha or transparency value associated with the pixel.
407 If you want all pixels to be combined, pass 0xff (255) in mask. */
408
409/* Use this routine for the x86 platform - it uses a faster MMX routine
410 if the machine supports MMX. */
411
412void /* PRIVATE */
413png_combine_row(png_structp png_ptr, png_bytep row, int mask)
414{
415 png_debug(1, "in png_combine_row (pnggccrd.c)\n");
416
417#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
418 if (_mmx_supported == 2) {
419#if !defined(PNG_1_0_X)
420 /* this should have happened in png_init_mmx_flags() already */
421 png_warning(png_ptr, "asm_flags may not have been initialized");
422#endif
423 png_mmx_support();
424 }
425#endif
426
427 if (mask == 0xff)
428 {
429 png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
430 png_memcpy(row, png_ptr->row_buf + 1,
431 (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
432 }
433 else /* (png_combine_row() is never called with mask == 0) */
434 {
435 switch (png_ptr->row_info.pixel_depth)
436 {
437 case 1: /* png_ptr->row_info.pixel_depth */
438 {
439 png_bytep sp;
440 png_bytep dp;
441 int s_inc, s_start, s_end;
442 int m;
443 int shift;
444 png_uint_32 i;
445
446 sp = png_ptr->row_buf + 1;
447 dp = row;
448 m = 0x80;
449#if defined(PNG_READ_PACKSWAP_SUPPORTED)
450 if (png_ptr->transformations & PNG_PACKSWAP)
451 {
452 s_start = 0;
453 s_end = 7;
454 s_inc = 1;
455 }
456 else
457#endif
458 {
459 s_start = 7;
460 s_end = 0;
461 s_inc = -1;
462 }
463
464 shift = s_start;
465
466 for (i = 0; i < png_ptr->width; i++)
467 {
468 if (m & mask)
469 {
470 int value;
471
472 value = (*sp >> shift) & 0x1;
473 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
474 *dp |= (png_byte)(value << shift);
475 }
476
477 if (shift == s_end)
478 {
479 shift = s_start;
480 sp++;
481 dp++;
482 }
483 else
484 shift += s_inc;
485
486 if (m == 1)
487 m = 0x80;
488 else
489 m >>= 1;
490 }
491 break;
492 }
493
494 case 2: /* png_ptr->row_info.pixel_depth */
495 {
496 png_bytep sp;
497 png_bytep dp;
498 int s_start, s_end, s_inc;
499 int m;
500 int shift;
501 png_uint_32 i;
502 int value;
503
504 sp = png_ptr->row_buf + 1;
505 dp = row;
506 m = 0x80;
507#if defined(PNG_READ_PACKSWAP_SUPPORTED)
508 if (png_ptr->transformations & PNG_PACKSWAP)
509 {
510 s_start = 0;
511 s_end = 6;
512 s_inc = 2;
513 }
514 else
515#endif
516 {
517 s_start = 6;
518 s_end = 0;
519 s_inc = -2;
520 }
521
522 shift = s_start;
523
524 for (i = 0; i < png_ptr->width; i++)
525 {
526 if (m & mask)
527 {
528 value = (*sp >> shift) & 0x3;
529 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
530 *dp |= (png_byte)(value << shift);
531 }
532
533 if (shift == s_end)
534 {
535 shift = s_start;
536 sp++;
537 dp++;
538 }
539 else
540 shift += s_inc;
541 if (m == 1)
542 m = 0x80;
543 else
544 m >>= 1;
545 }
546 break;
547 }
548
549 case 4: /* png_ptr->row_info.pixel_depth */
550 {
551 png_bytep sp;
552 png_bytep dp;
553 int s_start, s_end, s_inc;
554 int m;
555 int shift;
556 png_uint_32 i;
557 int value;
558
559 sp = png_ptr->row_buf + 1;
560 dp = row;
561 m = 0x80;
562#if defined(PNG_READ_PACKSWAP_SUPPORTED)
563 if (png_ptr->transformations & PNG_PACKSWAP)
564 {
565 s_start = 0;
566 s_end = 4;
567 s_inc = 4;
568 }
569 else
570#endif
571 {
572 s_start = 4;
573 s_end = 0;
574 s_inc = -4;
575 }
576 shift = s_start;
577
578 for (i = 0; i < png_ptr->width; i++)
579 {
580 if (m & mask)
581 {
582 value = (*sp >> shift) & 0xf;
583 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
584 *dp |= (png_byte)(value << shift);
585 }
586
587 if (shift == s_end)
588 {
589 shift = s_start;
590 sp++;
591 dp++;
592 }
593 else
594 shift += s_inc;
595 if (m == 1)
596 m = 0x80;
597 else
598 m >>= 1;
599 }
600 break;
601 }
602
603 case 8: /* png_ptr->row_info.pixel_depth */
604 {
605 png_bytep srcptr;
606 png_bytep dstptr;
607
608#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
609#if !defined(PNG_1_0_X)
610 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
611 /* && _mmx_supported */ )
612#else
613 if (_mmx_supported)
614#endif
615 {
616 png_uint_32 len;
617 int diff;
618 int dummy_value_a; // fix 'forbidden register spilled' error
619 int dummy_value_d;
620 int dummy_value_c;
621 int dummy_value_S;
622 int dummy_value_D;
623 _unmask = ~mask; // global variable for -fPIC version
624 srcptr = png_ptr->row_buf + 1;
625 dstptr = row;
626 len = png_ptr->width &~7; // reduce to multiple of 8
627 diff = (int) (png_ptr->width & 7); // amount lost
628
629 __asm__ __volatile__ (
630 "movd _unmask, %%mm7 \n\t" // load bit pattern
631 "psubb %%mm6, %%mm6 \n\t" // zero mm6
632 "punpcklbw %%mm7, %%mm7 \n\t"
633 "punpcklwd %%mm7, %%mm7 \n\t"
634 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
635
636 "movq _mask8_0, %%mm0 \n\t"
637 "pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
638 "pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
639
640// preload "movl len, %%ecx \n\t" // load length of line
641// preload "movl srcptr, %%esi \n\t" // load source
642// preload "movl dstptr, %%edi \n\t" // load dest
643
644 "cmpl $0, %%ecx \n\t" // len == 0 ?
645 "je mainloop8end \n\t"
646
647 "mainloop8: \n\t"
648 "movq (%%esi), %%mm4 \n\t" // *srcptr
649 "pand %%mm0, %%mm4 \n\t"
650 "movq %%mm0, %%mm6 \n\t"
651 "pandn (%%edi), %%mm6 \n\t" // *dstptr
652 "por %%mm6, %%mm4 \n\t"
653 "movq %%mm4, (%%edi) \n\t"
654 "addl $8, %%esi \n\t" // inc by 8 bytes processed
655 "addl $8, %%edi \n\t"
656 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
657 "ja mainloop8 \n\t"
658
659 "mainloop8end: \n\t"
660// preload "movl diff, %%ecx \n\t" // (diff is in eax)
661 "movl %%eax, %%ecx \n\t"
662 "cmpl $0, %%ecx \n\t"
663 "jz end8 \n\t"
664// preload "movl mask, %%edx \n\t"
665 "sall $24, %%edx \n\t" // make low byte, high byte
666
667 "secondloop8: \n\t"
668 "sall %%edx \n\t" // move high bit to CF
669 "jnc skip8 \n\t" // if CF = 0
670 "movb (%%esi), %%al \n\t"
671 "movb %%al, (%%edi) \n\t"
672
673 "skip8: \n\t"
674 "incl %%esi \n\t"
675 "incl %%edi \n\t"
676 "decl %%ecx \n\t"
677 "jnz secondloop8 \n\t"
678
679 "end8: \n\t"
680 "EMMS \n\t" // DONE
681
682 : "=a" (dummy_value_a), // output regs (dummy)
683 "=d" (dummy_value_d),
684 "=c" (dummy_value_c),
685 "=S" (dummy_value_S),
686 "=D" (dummy_value_D)
687
688 : "3" (srcptr), // esi // input regs
689 "4" (dstptr), // edi
690 "0" (diff), // eax
691// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
692 "2" (len), // ecx
693 "1" (mask) // edx
694
695#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
696 : "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
697#endif
698 );
699 }
700 else /* MMX not supported: use modified C routine */
701#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
702 {
703 register png_uint_32 i;
704 png_uint_32 initial_val = png_pass_start[png_ptr->pass];
705 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
706 register int stride = png_pass_inc[png_ptr->pass];
707 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
708 register int rep_bytes = png_pass_width[png_ptr->pass];
709 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
710 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
711 int diff = (int) (png_ptr->width & 7); /* amount lost */
712 register png_uint_32 final_val = len; /* GRR bugfix */
713
714 srcptr = png_ptr->row_buf + 1 + initial_val;
715 dstptr = row + initial_val;
716
717 for (i = initial_val; i < final_val; i += stride)
718 {
719 png_memcpy(dstptr, srcptr, rep_bytes);
720 srcptr += stride;
721 dstptr += stride;
722 }
723 if (diff) /* number of leftover pixels: 3 for pngtest */
724 {
725 final_val+=diff /* *BPP1 */ ;
726 for (; i < final_val; i += stride)
727 {
728 if (rep_bytes > (int)(final_val-i))
729 rep_bytes = (int)(final_val-i);
730 png_memcpy(dstptr, srcptr, rep_bytes);
731 srcptr += stride;
732 dstptr += stride;
733 }
734 }
735
736 } /* end of else (_mmx_supported) */
737
738 break;
739 } /* end 8 bpp */
740
741 case 16: /* png_ptr->row_info.pixel_depth */
742 {
743 png_bytep srcptr;
744 png_bytep dstptr;
745
746#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
747#if !defined(PNG_1_0_X)
748 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
749 /* && _mmx_supported */ )
750#else
751 if (_mmx_supported)
752#endif
753 {
754 png_uint_32 len;
755 int diff;
756 int dummy_value_a; // fix 'forbidden register spilled' error
757 int dummy_value_d;
758 int dummy_value_c;
759 int dummy_value_S;
760 int dummy_value_D;
761 _unmask = ~mask; // global variable for -fPIC version
762 srcptr = png_ptr->row_buf + 1;
763 dstptr = row;
764 len = png_ptr->width &~7; // reduce to multiple of 8
765 diff = (int) (png_ptr->width & 7); // amount lost
766
767 __asm__ __volatile__ (
768 "movd _unmask, %%mm7 \n\t" // load bit pattern
769 "psubb %%mm6, %%mm6 \n\t" // zero mm6
770 "punpcklbw %%mm7, %%mm7 \n\t"
771 "punpcklwd %%mm7, %%mm7 \n\t"
772 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
773
774 "movq _mask16_0, %%mm0 \n\t"
775 "movq _mask16_1, %%mm1 \n\t"
776
777 "pand %%mm7, %%mm0 \n\t"
778 "pand %%mm7, %%mm1 \n\t"
779
780 "pcmpeqb %%mm6, %%mm0 \n\t"
781 "pcmpeqb %%mm6, %%mm1 \n\t"
782
783// preload "movl len, %%ecx \n\t" // load length of line
784// preload "movl srcptr, %%esi \n\t" // load source
785// preload "movl dstptr, %%edi \n\t" // load dest
786
787 "cmpl $0, %%ecx \n\t"
788 "jz mainloop16end \n\t"
789
790 "mainloop16: \n\t"
791 "movq (%%esi), %%mm4 \n\t"
792 "pand %%mm0, %%mm4 \n\t"
793 "movq %%mm0, %%mm6 \n\t"
794 "movq (%%edi), %%mm7 \n\t"
795 "pandn %%mm7, %%mm6 \n\t"
796 "por %%mm6, %%mm4 \n\t"
797 "movq %%mm4, (%%edi) \n\t"
798
799 "movq 8(%%esi), %%mm5 \n\t"
800 "pand %%mm1, %%mm5 \n\t"
801 "movq %%mm1, %%mm7 \n\t"
802 "movq 8(%%edi), %%mm6 \n\t"
803 "pandn %%mm6, %%mm7 \n\t"
804 "por %%mm7, %%mm5 \n\t"
805 "movq %%mm5, 8(%%edi) \n\t"
806
807 "addl $16, %%esi \n\t" // inc by 16 bytes processed
808 "addl $16, %%edi \n\t"
809 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
810 "ja mainloop16 \n\t"
811
812 "mainloop16end: \n\t"
813// preload "movl diff, %%ecx \n\t" // (diff is in eax)
814 "movl %%eax, %%ecx \n\t"
815 "cmpl $0, %%ecx \n\t"
816 "jz end16 \n\t"
817// preload "movl mask, %%edx \n\t"
818 "sall $24, %%edx \n\t" // make low byte, high byte
819
820 "secondloop16: \n\t"
821 "sall %%edx \n\t" // move high bit to CF
822 "jnc skip16 \n\t" // if CF = 0
823 "movw (%%esi), %%ax \n\t"
824 "movw %%ax, (%%edi) \n\t"
825
826 "skip16: \n\t"
827 "addl $2, %%esi \n\t"
828 "addl $2, %%edi \n\t"
829 "decl %%ecx \n\t"
830 "jnz secondloop16 \n\t"
831
832 "end16: \n\t"
833 "EMMS \n\t" // DONE
834
835 : "=a" (dummy_value_a), // output regs (dummy)
836 "=c" (dummy_value_c),
837 "=d" (dummy_value_d),
838 "=S" (dummy_value_S),
839 "=D" (dummy_value_D)
840
841 : "0" (diff), // eax // input regs
842// was (unmask) " " RESERVED // ebx // Global Offset Table idx
843 "1" (len), // ecx
844 "2" (mask), // edx
845 "3" (srcptr), // esi
846 "4" (dstptr) // edi
847
848#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
849 : "%mm0", "%mm1", "%mm4" // clobber list
850 , "%mm5", "%mm6", "%mm7"
851#endif
852 );
853 }
854 else /* MMX not supported: use modified C routine */
855#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
856 {
857 register png_uint_32 i;
858 png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
859 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
860 register int stride = BPP2 * png_pass_inc[png_ptr->pass];
861 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
862 register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
863 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
864 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
865 int diff = (int) (png_ptr->width & 7); /* amount lost */
866 register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
867
868 srcptr = png_ptr->row_buf + 1 + initial_val;
869 dstptr = row + initial_val;
870
871 for (i = initial_val; i < final_val; i += stride)
872 {
873 png_memcpy(dstptr, srcptr, rep_bytes);
874 srcptr += stride;
875 dstptr += stride;
876 }
877 if (diff) /* number of leftover pixels: 3 for pngtest */
878 {
879 final_val+=diff*BPP2;
880 for (; i < final_val; i += stride)
881 {
882 if (rep_bytes > (int)(final_val-i))
883 rep_bytes = (int)(final_val-i);
884 png_memcpy(dstptr, srcptr, rep_bytes);
885 srcptr += stride;
886 dstptr += stride;
887 }
888 }
889 } /* end of else (_mmx_supported) */
890
891 break;
892 } /* end 16 bpp */
893
894 case 24: /* png_ptr->row_info.pixel_depth */
895 {
896 png_bytep srcptr;
897 png_bytep dstptr;
898
899#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
900#if !defined(PNG_1_0_X)
901 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
902 /* && _mmx_supported */ )
903#else
904 if (_mmx_supported)
905#endif
906 {
907 png_uint_32 len;
908 int diff;
909 int dummy_value_a; // fix 'forbidden register spilled' error
910 int dummy_value_d;
911 int dummy_value_c;
912 int dummy_value_S;
913 int dummy_value_D;
914 _unmask = ~mask; // global variable for -fPIC version
915 srcptr = png_ptr->row_buf + 1;
916 dstptr = row;
917 len = png_ptr->width &~7; // reduce to multiple of 8
918 diff = (int) (png_ptr->width & 7); // amount lost
919
920 __asm__ __volatile__ (
921 "movd _unmask, %%mm7 \n\t" // load bit pattern
922 "psubb %%mm6, %%mm6 \n\t" // zero mm6
923 "punpcklbw %%mm7, %%mm7 \n\t"
924 "punpcklwd %%mm7, %%mm7 \n\t"
925 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
926
927 "movq _mask24_0, %%mm0 \n\t"
928 "movq _mask24_1, %%mm1 \n\t"
929 "movq _mask24_2, %%mm2 \n\t"
930
931 "pand %%mm7, %%mm0 \n\t"
932 "pand %%mm7, %%mm1 \n\t"
933 "pand %%mm7, %%mm2 \n\t"
934
935 "pcmpeqb %%mm6, %%mm0 \n\t"
936 "pcmpeqb %%mm6, %%mm1 \n\t"
937 "pcmpeqb %%mm6, %%mm2 \n\t"
938
939// preload "movl len, %%ecx \n\t" // load length of line
940// preload "movl srcptr, %%esi \n\t" // load source
941// preload "movl dstptr, %%edi \n\t" // load dest
942
943 "cmpl $0, %%ecx \n\t"
944 "jz mainloop24end \n\t"
945
946 "mainloop24: \n\t"
947 "movq (%%esi), %%mm4 \n\t"
948 "pand %%mm0, %%mm4 \n\t"
949 "movq %%mm0, %%mm6 \n\t"
950 "movq (%%edi), %%mm7 \n\t"
951 "pandn %%mm7, %%mm6 \n\t"
952 "por %%mm6, %%mm4 \n\t"
953 "movq %%mm4, (%%edi) \n\t"
954
955 "movq 8(%%esi), %%mm5 \n\t"
956 "pand %%mm1, %%mm5 \n\t"
957 "movq %%mm1, %%mm7 \n\t"
958 "movq 8(%%edi), %%mm6 \n\t"
959 "pandn %%mm6, %%mm7 \n\t"
960 "por %%mm7, %%mm5 \n\t"
961 "movq %%mm5, 8(%%edi) \n\t"
962
963 "movq 16(%%esi), %%mm6 \n\t"
964 "pand %%mm2, %%mm6 \n\t"
965 "movq %%mm2, %%mm4 \n\t"
966 "movq 16(%%edi), %%mm7 \n\t"
967 "pandn %%mm7, %%mm4 \n\t"
968 "por %%mm4, %%mm6 \n\t"
969 "movq %%mm6, 16(%%edi) \n\t"
970
971 "addl $24, %%esi \n\t" // inc by 24 bytes processed
972 "addl $24, %%edi \n\t"
973 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
974
975 "ja mainloop24 \n\t"
976
977 "mainloop24end: \n\t"
978// preload "movl diff, %%ecx \n\t" // (diff is in eax)
979 "movl %%eax, %%ecx \n\t"
980 "cmpl $0, %%ecx \n\t"
981 "jz end24 \n\t"
982// preload "movl mask, %%edx \n\t"
 983 "sall $24, %%edx \n\t" // shift low byte into high byte
984
985 "secondloop24: \n\t"
986 "sall %%edx \n\t" // move high bit to CF
987 "jnc skip24 \n\t" // if CF = 0
988 "movw (%%esi), %%ax \n\t"
989 "movw %%ax, (%%edi) \n\t"
990 "xorl %%eax, %%eax \n\t"
991 "movb 2(%%esi), %%al \n\t"
992 "movb %%al, 2(%%edi) \n\t"
993
994 "skip24: \n\t"
995 "addl $3, %%esi \n\t"
996 "addl $3, %%edi \n\t"
997 "decl %%ecx \n\t"
998 "jnz secondloop24 \n\t"
999
1000 "end24: \n\t"
1001 "EMMS \n\t" // DONE
1002
1003 : "=a" (dummy_value_a), // output regs (dummy)
1004 "=d" (dummy_value_d),
1005 "=c" (dummy_value_c),
1006 "=S" (dummy_value_S),
1007 "=D" (dummy_value_D)
1008
1009 : "3" (srcptr), // esi // input regs
1010 "4" (dstptr), // edi
1011 "0" (diff), // eax
1012// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1013 "2" (len), // ecx
1014 "1" (mask) // edx
1015
1016#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1017 : "%mm0", "%mm1", "%mm2" // clobber list
1018 , "%mm4", "%mm5", "%mm6", "%mm7"
1019#endif
1020 );
1021 }
1022 else /* MMX not supported - use modified C routine */
1023#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1024 {
1025 register png_uint_32 i;
1026 png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
1027 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1028 register int stride = BPP3 * png_pass_inc[png_ptr->pass];
1029 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1030 register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
1031 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1032 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1033 int diff = (int) (png_ptr->width & 7); /* amount lost */
1034 register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */
1035
1036 srcptr = png_ptr->row_buf + 1 + initial_val;
1037 dstptr = row + initial_val;
1038
1039 for (i = initial_val; i < final_val; i += stride)
1040 {
1041 png_memcpy(dstptr, srcptr, rep_bytes);
1042 srcptr += stride;
1043 dstptr += stride;
1044 }
1045 if (diff) /* number of leftover pixels: 3 for pngtest */
1046 {
1047 final_val+=diff*BPP3;
1048 for (; i < final_val; i += stride)
1049 {
1050 if (rep_bytes > (int)(final_val-i))
1051 rep_bytes = (int)(final_val-i);
1052 png_memcpy(dstptr, srcptr, rep_bytes);
1053 srcptr += stride;
1054 dstptr += stride;
1055 }
1056 }
1057 } /* end of else (_mmx_supported) */
1058
1059 break;
1060 } /* end 24 bpp */
1061
1062 case 32: /* png_ptr->row_info.pixel_depth */
1063 {
1064 png_bytep srcptr;
1065 png_bytep dstptr;
1066
1067#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1068#if !defined(PNG_1_0_X)
1069 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1070 /* && _mmx_supported */ )
1071#else
1072 if (_mmx_supported)
1073#endif
1074 {
1075 png_uint_32 len;
1076 int diff;
1077 int dummy_value_a; // fix 'forbidden register spilled' error
1078 int dummy_value_d;
1079 int dummy_value_c;
1080 int dummy_value_S;
1081 int dummy_value_D;
1082 _unmask = ~mask; // global variable for -fPIC version
1083 srcptr = png_ptr->row_buf + 1;
1084 dstptr = row;
1085 len = png_ptr->width &~7; // reduce to multiple of 8
1086 diff = (int) (png_ptr->width & 7); // amount lost //
1087
1088 __asm__ __volatile__ (
1089 "movd _unmask, %%mm7 \n\t" // load bit pattern
1090 "psubb %%mm6, %%mm6 \n\t" // zero mm6
1091 "punpcklbw %%mm7, %%mm7 \n\t"
1092 "punpcklwd %%mm7, %%mm7 \n\t"
1093 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1094
1095 "movq _mask32_0, %%mm0 \n\t"
1096 "movq _mask32_1, %%mm1 \n\t"
1097 "movq _mask32_2, %%mm2 \n\t"
1098 "movq _mask32_3, %%mm3 \n\t"
1099
1100 "pand %%mm7, %%mm0 \n\t"
1101 "pand %%mm7, %%mm1 \n\t"
1102 "pand %%mm7, %%mm2 \n\t"
1103 "pand %%mm7, %%mm3 \n\t"
1104
1105 "pcmpeqb %%mm6, %%mm0 \n\t"
1106 "pcmpeqb %%mm6, %%mm1 \n\t"
1107 "pcmpeqb %%mm6, %%mm2 \n\t"
1108 "pcmpeqb %%mm6, %%mm3 \n\t"
1109
1110// preload "movl len, %%ecx \n\t" // load length of line
1111// preload "movl srcptr, %%esi \n\t" // load source
1112// preload "movl dstptr, %%edi \n\t" // load dest
1113
1114 "cmpl $0, %%ecx \n\t" // lcr
1115 "jz mainloop32end \n\t"
1116
1117 "mainloop32: \n\t"
1118 "movq (%%esi), %%mm4 \n\t"
1119 "pand %%mm0, %%mm4 \n\t"
1120 "movq %%mm0, %%mm6 \n\t"
1121 "movq (%%edi), %%mm7 \n\t"
1122 "pandn %%mm7, %%mm6 \n\t"
1123 "por %%mm6, %%mm4 \n\t"
1124 "movq %%mm4, (%%edi) \n\t"
1125
1126 "movq 8(%%esi), %%mm5 \n\t"
1127 "pand %%mm1, %%mm5 \n\t"
1128 "movq %%mm1, %%mm7 \n\t"
1129 "movq 8(%%edi), %%mm6 \n\t"
1130 "pandn %%mm6, %%mm7 \n\t"
1131 "por %%mm7, %%mm5 \n\t"
1132 "movq %%mm5, 8(%%edi) \n\t"
1133
1134 "movq 16(%%esi), %%mm6 \n\t"
1135 "pand %%mm2, %%mm6 \n\t"
1136 "movq %%mm2, %%mm4 \n\t"
1137 "movq 16(%%edi), %%mm7 \n\t"
1138 "pandn %%mm7, %%mm4 \n\t"
1139 "por %%mm4, %%mm6 \n\t"
1140 "movq %%mm6, 16(%%edi) \n\t"
1141
1142 "movq 24(%%esi), %%mm7 \n\t"
1143 "pand %%mm3, %%mm7 \n\t"
1144 "movq %%mm3, %%mm5 \n\t"
1145 "movq 24(%%edi), %%mm4 \n\t"
1146 "pandn %%mm4, %%mm5 \n\t"
1147 "por %%mm5, %%mm7 \n\t"
1148 "movq %%mm7, 24(%%edi) \n\t"
1149
1150 "addl $32, %%esi \n\t" // inc by 32 bytes processed
1151 "addl $32, %%edi \n\t"
1152 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
1153 "ja mainloop32 \n\t"
1154
1155 "mainloop32end: \n\t"
1156// preload "movl diff, %%ecx \n\t" // (diff is in eax)
1157 "movl %%eax, %%ecx \n\t"
1158 "cmpl $0, %%ecx \n\t"
1159 "jz end32 \n\t"
1160// preload "movl mask, %%edx \n\t"
1161 "sall $24, %%edx \n\t" // low byte => high byte
1162
1163 "secondloop32: \n\t"
1164 "sall %%edx \n\t" // move high bit to CF
1165 "jnc skip32 \n\t" // if CF = 0
1166 "movl (%%esi), %%eax \n\t"
1167 "movl %%eax, (%%edi) \n\t"
1168
1169 "skip32: \n\t"
1170 "addl $4, %%esi \n\t"
1171 "addl $4, %%edi \n\t"
1172 "decl %%ecx \n\t"
1173 "jnz secondloop32 \n\t"
1174
1175 "end32: \n\t"
1176 "EMMS \n\t" // DONE
1177
1178 : "=a" (dummy_value_a), // output regs (dummy)
1179 "=d" (dummy_value_d),
1180 "=c" (dummy_value_c),
1181 "=S" (dummy_value_S),
1182 "=D" (dummy_value_D)
1183
1184 : "3" (srcptr), // esi // input regs
1185 "4" (dstptr), // edi
1186 "0" (diff), // eax
1187// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1188 "2" (len), // ecx
1189 "1" (mask) // edx
1190
1191#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1192 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1193 , "%mm4", "%mm5", "%mm6", "%mm7"
1194#endif
1195 );
1196 }
1197 else /* MMX not supported - use modified C routine */
1198#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1199 {
1200 register png_uint_32 i;
1201 png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
1202 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1203 register int stride = BPP4 * png_pass_inc[png_ptr->pass];
1204 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1205 register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
1206 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1207 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1208 int diff = (int) (png_ptr->width & 7); /* amount lost */
1209 register png_uint_32 final_val = BPP4 * len; /* GRR bugfix */
1210
1211 srcptr = png_ptr->row_buf + 1 + initial_val;
1212 dstptr = row + initial_val;
1213
1214 for (i = initial_val; i < final_val; i += stride)
1215 {
1216 png_memcpy(dstptr, srcptr, rep_bytes);
1217 srcptr += stride;
1218 dstptr += stride;
1219 }
1220 if (diff) /* number of leftover pixels: 3 for pngtest */
1221 {
1222 final_val+=diff*BPP4;
1223 for (; i < final_val; i += stride)
1224 {
1225 if (rep_bytes > (int)(final_val-i))
1226 rep_bytes = (int)(final_val-i);
1227 png_memcpy(dstptr, srcptr, rep_bytes);
1228 srcptr += stride;
1229 dstptr += stride;
1230 }
1231 }
1232 } /* end of else (_mmx_supported) */
1233
1234 break;
1235 } /* end 32 bpp */
1236
1237 case 48: /* png_ptr->row_info.pixel_depth */
1238 {
1239 png_bytep srcptr;
1240 png_bytep dstptr;
1241
1242#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1243#if !defined(PNG_1_0_X)
1244 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1245 /* && _mmx_supported */ )
1246#else
1247 if (_mmx_supported)
1248#endif
1249 {
1250 png_uint_32 len;
1251 int diff;
1252 int dummy_value_a; // fix 'forbidden register spilled' error
1253 int dummy_value_d;
1254 int dummy_value_c;
1255 int dummy_value_S;
1256 int dummy_value_D;
1257 _unmask = ~mask; // global variable for -fPIC version
1258 srcptr = png_ptr->row_buf + 1;
1259 dstptr = row;
1260 len = png_ptr->width &~7; // reduce to multiple of 8
1261 diff = (int) (png_ptr->width & 7); // amount lost //
1262
1263 __asm__ __volatile__ (
1264 "movd _unmask, %%mm7 \n\t" // load bit pattern
1265 "psubb %%mm6, %%mm6 \n\t" // zero mm6
1266 "punpcklbw %%mm7, %%mm7 \n\t"
1267 "punpcklwd %%mm7, %%mm7 \n\t"
1268 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1269
1270 "movq _mask48_0, %%mm0 \n\t"
1271 "movq _mask48_1, %%mm1 \n\t"
1272 "movq _mask48_2, %%mm2 \n\t"
1273 "movq _mask48_3, %%mm3 \n\t"
1274 "movq _mask48_4, %%mm4 \n\t"
1275 "movq _mask48_5, %%mm5 \n\t"
1276
1277 "pand %%mm7, %%mm0 \n\t"
1278 "pand %%mm7, %%mm1 \n\t"
1279 "pand %%mm7, %%mm2 \n\t"
1280 "pand %%mm7, %%mm3 \n\t"
1281 "pand %%mm7, %%mm4 \n\t"
1282 "pand %%mm7, %%mm5 \n\t"
1283
1284 "pcmpeqb %%mm6, %%mm0 \n\t"
1285 "pcmpeqb %%mm6, %%mm1 \n\t"
1286 "pcmpeqb %%mm6, %%mm2 \n\t"
1287 "pcmpeqb %%mm6, %%mm3 \n\t"
1288 "pcmpeqb %%mm6, %%mm4 \n\t"
1289 "pcmpeqb %%mm6, %%mm5 \n\t"
1290
1291// preload "movl len, %%ecx \n\t" // load length of line
1292// preload "movl srcptr, %%esi \n\t" // load source
1293// preload "movl dstptr, %%edi \n\t" // load dest
1294
1295 "cmpl $0, %%ecx \n\t"
1296 "jz mainloop48end \n\t"
1297
1298 "mainloop48: \n\t"
1299 "movq (%%esi), %%mm7 \n\t"
1300 "pand %%mm0, %%mm7 \n\t"
1301 "movq %%mm0, %%mm6 \n\t"
1302 "pandn (%%edi), %%mm6 \n\t"
1303 "por %%mm6, %%mm7 \n\t"
1304 "movq %%mm7, (%%edi) \n\t"
1305
1306 "movq 8(%%esi), %%mm6 \n\t"
1307 "pand %%mm1, %%mm6 \n\t"
1308 "movq %%mm1, %%mm7 \n\t"
1309 "pandn 8(%%edi), %%mm7 \n\t"
1310 "por %%mm7, %%mm6 \n\t"
1311 "movq %%mm6, 8(%%edi) \n\t"
1312
1313 "movq 16(%%esi), %%mm6 \n\t"
1314 "pand %%mm2, %%mm6 \n\t"
1315 "movq %%mm2, %%mm7 \n\t"
1316 "pandn 16(%%edi), %%mm7 \n\t"
1317 "por %%mm7, %%mm6 \n\t"
1318 "movq %%mm6, 16(%%edi) \n\t"
1319
1320 "movq 24(%%esi), %%mm7 \n\t"
1321 "pand %%mm3, %%mm7 \n\t"
1322 "movq %%mm3, %%mm6 \n\t"
1323 "pandn 24(%%edi), %%mm6 \n\t"
1324 "por %%mm6, %%mm7 \n\t"
1325 "movq %%mm7, 24(%%edi) \n\t"
1326
1327 "movq 32(%%esi), %%mm6 \n\t"
1328 "pand %%mm4, %%mm6 \n\t"
1329 "movq %%mm4, %%mm7 \n\t"
1330 "pandn 32(%%edi), %%mm7 \n\t"
1331 "por %%mm7, %%mm6 \n\t"
1332 "movq %%mm6, 32(%%edi) \n\t"
1333
1334 "movq 40(%%esi), %%mm7 \n\t"
1335 "pand %%mm5, %%mm7 \n\t"
1336 "movq %%mm5, %%mm6 \n\t"
1337 "pandn 40(%%edi), %%mm6 \n\t"
1338 "por %%mm6, %%mm7 \n\t"
1339 "movq %%mm7, 40(%%edi) \n\t"
1340
1341 "addl $48, %%esi \n\t" // inc by 48 bytes processed
1342 "addl $48, %%edi \n\t"
1343 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
1344
1345 "ja mainloop48 \n\t"
1346
1347 "mainloop48end: \n\t"
1348// preload "movl diff, %%ecx \n\t" // (diff is in eax)
1349 "movl %%eax, %%ecx \n\t"
1350 "cmpl $0, %%ecx \n\t"
1351 "jz end48 \n\t"
1352// preload "movl mask, %%edx \n\t"
1353 "sall $24, %%edx \n\t" // shift low byte into high byte
1354
1355 "secondloop48: \n\t"
1356 "sall %%edx \n\t" // move high bit to CF
1357 "jnc skip48 \n\t" // if CF = 0
1358 "movl (%%esi), %%eax \n\t"
1359 "movl %%eax, (%%edi) \n\t"
1360
1361 "skip48: \n\t"
1362 "addl $4, %%esi \n\t"
1363 "addl $4, %%edi \n\t"
1364 "decl %%ecx \n\t"
1365 "jnz secondloop48 \n\t"
1366
1367 "end48: \n\t"
1368 "EMMS \n\t" // DONE
1369
1370 : "=a" (dummy_value_a), // output regs (dummy)
1371 "=d" (dummy_value_d),
1372 "=c" (dummy_value_c),
1373 "=S" (dummy_value_S),
1374 "=D" (dummy_value_D)
1375
1376 : "3" (srcptr), // esi // input regs
1377 "4" (dstptr), // edi
1378 "0" (diff), // eax
1379// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1380 "2" (len), // ecx
1381 "1" (mask) // edx
1382
1383#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1384 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1385 , "%mm4", "%mm5", "%mm6", "%mm7"
1386#endif
1387 );
1388 }
1389 else /* MMX not supported - use modified C routine */
1390#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1391 {
1392 register png_uint_32 i;
1393 png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
1394 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1395 register int stride = BPP6 * png_pass_inc[png_ptr->pass];
1396 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1397 register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
1398 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1399 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1400 int diff = (int) (png_ptr->width & 7); /* amount lost */
1401 register png_uint_32 final_val = BPP6 * len; /* GRR bugfix */
1402
1403 srcptr = png_ptr->row_buf + 1 + initial_val;
1404 dstptr = row + initial_val;
1405
1406 for (i = initial_val; i < final_val; i += stride)
1407 {
1408 png_memcpy(dstptr, srcptr, rep_bytes);
1409 srcptr += stride;
1410 dstptr += stride;
1411 }
1412 if (diff) /* number of leftover pixels: 3 for pngtest */
1413 {
1414 final_val+=diff*BPP6;
1415 for (; i < final_val; i += stride)
1416 {
1417 if (rep_bytes > (int)(final_val-i))
1418 rep_bytes = (int)(final_val-i);
1419 png_memcpy(dstptr, srcptr, rep_bytes);
1420 srcptr += stride;
1421 dstptr += stride;
1422 }
1423 }
1424 } /* end of else (_mmx_supported) */
1425
1426 break;
1427 } /* end 48 bpp */
1428
1429 case 64: /* png_ptr->row_info.pixel_depth */
1430 {
1431 png_bytep srcptr;
1432 png_bytep dstptr;
1433 register png_uint_32 i;
1434 png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
1435 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1436 register int stride = BPP8 * png_pass_inc[png_ptr->pass];
1437 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1438 register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
1439 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1440 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1441 int diff = (int) (png_ptr->width & 7); /* amount lost */
1442 register png_uint_32 final_val = BPP8 * len; /* GRR bugfix */
1443
1444 srcptr = png_ptr->row_buf + 1 + initial_val;
1445 dstptr = row + initial_val;
1446
1447 for (i = initial_val; i < final_val; i += stride)
1448 {
1449 png_memcpy(dstptr, srcptr, rep_bytes);
1450 srcptr += stride;
1451 dstptr += stride;
1452 }
1453 if (diff) /* number of leftover pixels: 3 for pngtest */
1454 {
1455 final_val+=diff*BPP8;
1456 for (; i < final_val; i += stride)
1457 {
1458 if (rep_bytes > (int)(final_val-i))
1459 rep_bytes = (int)(final_val-i);
1460 png_memcpy(dstptr, srcptr, rep_bytes);
1461 srcptr += stride;
1462 dstptr += stride;
1463 }
1464 }
1465
1466 break;
1467 } /* end 64 bpp */
1468
1469 default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1470 {
1471 /* this should never happen */
1472 png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
1473 break;
1474 }
1475 } /* end switch (png_ptr->row_info.pixel_depth) */
1476
1477 } /* end if (non-trivial mask) */
1478
1479} /* end png_combine_row() */
1480
1481#endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
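/* For reference (this comment is not part of the original source): the MMX
 * blocks above implement, for every byte of the row, the masked select
 *
 *     dst = (src & mask) | (dst & ~mask)
 *
 * where each expanded mask byte is 0xFF (take the new pixel from src) or
 * 0x00 (keep the existing dst byte); the pand/pandn/por triples compute
 * exactly this.  A scalar C sketch of the same idea, with illustrative
 * names only:
 *
 *     static void select_bytes(unsigned char *dst, const unsigned char *src,
 *                              const unsigned char *bytemask, int n)
 *     {
 *         int k;
 *         for (k = 0; k < n; k++)
 *             dst[k] = (unsigned char)((src[k] &  bytemask[k]) |
 *                                      (dst[k] & ~bytemask[k]));
 *     }
 */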
1482
1483
1484
1485
1486/*===========================================================================*/
1487/* */
1488/* P N G _ D O _ R E A D _ I N T E R L A C E */
1489/* */
1490/*===========================================================================*/
1491
1492#if defined(PNG_READ_INTERLACING_SUPPORTED)
1493#if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1494
1495/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1496 * has taken place. [GRR: what other steps come before and/or after?]
1497 */
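/* For reference (this comment is not part of the original source): for pixel
 * depths of 8 bits or more, the expansion below amounts to replicating each
 * source pixel png_pass_inc[pass] times, working right-to-left so the row
 * can be expanded in place.  An illustrative C sketch (pixel_bytes = bytes
 * per pixel); the sub-byte depths are handled by the shift loops instead:
 *
 *     for (x = (int)width - 1; x >= 0; x--)
 *         for (j = 0; j < png_pass_inc[pass]; j++)
 *             png_memcpy(row + (x * png_pass_inc[pass] + j) * pixel_bytes,
 *                        row + x * pixel_bytes, pixel_bytes);
 */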
1498
1499void /* PRIVATE */
1500png_do_read_interlace(png_structp png_ptr)
1501{
1502 png_row_infop row_info = &(png_ptr->row_info);
1503 png_bytep row = png_ptr->row_buf + 1;
1504 int pass = png_ptr->pass;
1505#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1506 png_uint_32 transformations = png_ptr->transformations;
1507#endif
1508
1509 png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1510
1511#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1512 if (_mmx_supported == 2) {
1513#if !defined(PNG_1_0_X)
1514 /* this should have happened in png_init_mmx_flags() already */
1515 png_warning(png_ptr, "asm_flags may not have been initialized");
1516#endif
1517 png_mmx_support();
1518 }
1519#endif
1520
1521 if (row != NULL && row_info != NULL)
1522 {
1523 png_uint_32 final_width;
1524
1525 final_width = row_info->width * png_pass_inc[pass];
1526
1527 switch (row_info->pixel_depth)
1528 {
1529 case 1:
1530 {
1531 png_bytep sp, dp;
1532 int sshift, dshift;
1533 int s_start, s_end, s_inc;
1534 png_byte v;
1535 png_uint_32 i;
1536 int j;
1537
1538 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1539 dp = row + (png_size_t)((final_width - 1) >> 3);
1540#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1541 if (transformations & PNG_PACKSWAP)
1542 {
1543 sshift = (int)((row_info->width + 7) & 7);
1544 dshift = (int)((final_width + 7) & 7);
1545 s_start = 7;
1546 s_end = 0;
1547 s_inc = -1;
1548 }
1549 else
1550#endif
1551 {
1552 sshift = 7 - (int)((row_info->width + 7) & 7);
1553 dshift = 7 - (int)((final_width + 7) & 7);
1554 s_start = 0;
1555 s_end = 7;
1556 s_inc = 1;
1557 }
1558
1559 for (i = row_info->width; i; i--)
1560 {
1561 v = (png_byte)((*sp >> sshift) & 0x1);
1562 for (j = 0; j < png_pass_inc[pass]; j++)
1563 {
1564 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff); /* zero bit dshift */
1565 *dp |= (png_byte)(v << dshift);
1566 if (dshift == s_end)
1567 {
1568 dshift = s_start;
1569 dp--;
1570 }
1571 else
1572 dshift += s_inc;
1573 }
1574 if (sshift == s_end)
1575 {
1576 sshift = s_start;
1577 sp--;
1578 }
1579 else
1580 sshift += s_inc;
1581 }
1582 break;
1583 }
1584
1585 case 2:
1586 {
1587 png_bytep sp, dp;
1588 int sshift, dshift;
1589 int s_start, s_end, s_inc;
1590 png_uint_32 i;
1591
1592 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1593 dp = row + (png_size_t)((final_width - 1) >> 2);
1594#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1595 if (transformations & PNG_PACKSWAP)
1596 {
1597 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1598 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1599 s_start = 6;
1600 s_end = 0;
1601 s_inc = -2;
1602 }
1603 else
1604#endif
1605 {
1606 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1607 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1608 s_start = 0;
1609 s_end = 6;
1610 s_inc = 2;
1611 }
1612
1613 for (i = row_info->width; i; i--)
1614 {
1615 png_byte v;
1616 int j;
1617
1618 v = (png_byte)((*sp >> sshift) & 0x3);
1619 for (j = 0; j < png_pass_inc[pass]; j++)
1620 {
1621 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff); /* zero 2-bit group at dshift */
1622 *dp |= (png_byte)(v << dshift);
1623 if (dshift == s_end)
1624 {
1625 dshift = s_start;
1626 dp--;
1627 }
1628 else
1629 dshift += s_inc;
1630 }
1631 if (sshift == s_end)
1632 {
1633 sshift = s_start;
1634 sp--;
1635 }
1636 else
1637 sshift += s_inc;
1638 }
1639 break;
1640 }
1641
1642 case 4:
1643 {
1644 png_bytep sp, dp;
1645 int sshift, dshift;
1646 int s_start, s_end, s_inc;
1647 png_uint_32 i;
1648
1649 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1650 dp = row + (png_size_t)((final_width - 1) >> 1);
1651#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1652 if (transformations & PNG_PACKSWAP)
1653 {
1654 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1655 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1656 s_start = 4;
1657 s_end = 0;
1658 s_inc = -4;
1659 }
1660 else
1661#endif
1662 {
1663 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1664 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1665 s_start = 0;
1666 s_end = 4;
1667 s_inc = 4;
1668 }
1669
1670 for (i = row_info->width; i; i--)
1671 {
1672 png_byte v;
1673 int j;
1674
1675 v = (png_byte)((*sp >> sshift) & 0xf);
1676 for (j = 0; j < png_pass_inc[pass]; j++)
1677 {
1678 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff); /* zero nibble at dshift */
1679 *dp |= (png_byte)(v << dshift);
1680 if (dshift == s_end)
1681 {
1682 dshift = s_start;
1683 dp--;
1684 }
1685 else
1686 dshift += s_inc;
1687 }
1688 if (sshift == s_end)
1689 {
1690 sshift = s_start;
1691 sp--;
1692 }
1693 else
1694 sshift += s_inc;
1695 }
1696 break;
1697 }
1698
1699 /*====================================================================*/
1700
1701 default: /* 8-bit or larger (this is where the routine is modified) */
1702 {
1703#if 0
1704// static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1705// static unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1706// unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1707// unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1708#endif
1709 png_bytep sptr, dp;
1710 png_uint_32 i;
1711 png_size_t pixel_bytes;
1712 int width = (int)row_info->width;
1713
1714 pixel_bytes = (row_info->pixel_depth >> 3);
1715
1716 /* point sptr at the last pixel in the pre-expanded row: */
1717 sptr = row + (width - 1) * pixel_bytes;
1718
1719 /* point dp at the last pixel position in the expanded row: */
1720 dp = row + (final_width - 1) * pixel_bytes;
1721
1722 /* New code by Nirav Chhatrapati - Intel Corporation */
1723
1724#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1725#if !defined(PNG_1_0_X)
1726 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1727 /* && _mmx_supported */ )
1728#else
1729 if (_mmx_supported)
1730#endif
1731 {
1732 //--------------------------------------------------------------
1733 if (pixel_bytes == 3)
1734 {
1735 if (((pass == 0) || (pass == 1)) && width)
1736 {
1737 int dummy_value_c; // fix 'forbidden register spilled'
1738 int dummy_value_S;
1739 int dummy_value_D;
1740
1741 __asm__ __volatile__ (
1742 "subl $21, %%edi \n\t"
1743 // (png_pass_inc[pass] - 1)*pixel_bytes
1744
1745 ".loop3_pass0: \n\t"
1746 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1747 "pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
1748 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1749 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1750 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1751 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1752 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1753 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1754 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1755 "movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
1756 "psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
1757 "movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
1758 "punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
1759 "movq %%mm4, 16(%%edi) \n\t"
1760 "psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
1761 "movq %%mm3, 8(%%edi) \n\t"
1762 "punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
1763 "subl $3, %%esi \n\t"
1764 "movq %%mm0, (%%edi) \n\t"
1765 "subl $24, %%edi \n\t"
1766 "decl %%ecx \n\t"
1767 "jnz .loop3_pass0 \n\t"
1768 "EMMS \n\t" // DONE
1769
1770 : "=c" (dummy_value_c), // output regs (dummy)
1771 "=S" (dummy_value_S),
1772 "=D" (dummy_value_D)
1773
1774 : "1" (sptr), // esi // input regs
1775 "2" (dp), // edi
1776 "0" (width), // ecx
1777 "rim" (_const4) // %1(?) (0x0000000000FFFFFFLL)
1778
1779#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1780 : "%mm0", "%mm1", "%mm2" // clobber list
1781 , "%mm3", "%mm4"
1782#endif
1783 );
1784 }
1785 else if (((pass == 2) || (pass == 3)) && width)
1786 {
1787 int dummy_value_c; // fix 'forbidden register spilled'
1788 int dummy_value_S;
1789 int dummy_value_D;
1790
1791 __asm__ __volatile__ (
1792 "subl $9, %%edi \n\t"
1793 // (png_pass_inc[pass] - 1)*pixel_bytes
1794
1795 ".loop3_pass2: \n\t"
1796 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1797 "pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
1798 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1799 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1800 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1801 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1802 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1803 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1804 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1805 "movq %%mm0, 4(%%edi) \n\t"
1806 "psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
1807 "subl $3, %%esi \n\t"
1808 "movd %%mm0, (%%edi) \n\t"
1809 "subl $12, %%edi \n\t"
1810 "decl %%ecx \n\t"
1811 "jnz .loop3_pass2 \n\t"
1812 "EMMS \n\t" // DONE
1813
1814 : "=c" (dummy_value_c), // output regs (dummy)
1815 "=S" (dummy_value_S),
1816 "=D" (dummy_value_D)
1817
1818 : "1" (sptr), // esi // input regs
1819 "2" (dp), // edi
1820 "0" (width), // ecx
1821 "rim" (_const4) // (0x0000000000FFFFFFLL)
1822
1823#if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1824 : "%mm0", "%mm1", "%mm2" // clobber list
1825#endif
1826 );
1827 }
1828 else if (width) /* && ((pass == 4) || (pass == 5)) */
1829 {
1830 int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh?
1831 if (width_mmx < 0)
1832 width_mmx = 0;
1833 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1834 if (width_mmx)
1835 {
1836 // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1837 // sptr points at last pixel in pre-expanded row
1838 // dp points at last pixel position in expanded row
1839 int dummy_value_c; // fix 'forbidden register spilled'
1840 int dummy_value_S;
1841 int dummy_value_D;
1842
1843 __asm__ __volatile__ (
1844 "subl $3, %%esi \n\t"
1845 "subl $9, %%edi \n\t"
1846 // (png_pass_inc[pass] + 1)*pixel_bytes
1847
1848 ".loop3_pass4: \n\t"
1849 "movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
1850 "movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
1851 "movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
1852 "psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
1853 "pand _const4, %%mm1 \n\t" // z z z z z 2 1 0
1854 "psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
1855 "por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
1856 "movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
1857 "psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
1858 "movq %%mm0, (%%edi) \n\t"
1859 "psrlq $16, %%mm3 \n\t" // z z z z z x x 5
1860 "pand _const6, %%mm3 \n\t" // z z z z z z z 5
1861 "por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
1862 "subl $6, %%esi \n\t"
1863 "movd %%mm2, 8(%%edi) \n\t"
1864 "subl $12, %%edi \n\t"
1865 "subl $2, %%ecx \n\t"
1866 "jnz .loop3_pass4 \n\t"
1867 "EMMS \n\t" // DONE
1868
1869 : "=c" (dummy_value_c), // output regs (dummy)
1870 "=S" (dummy_value_S),
1871 "=D" (dummy_value_D)
1872
1873 : "1" (sptr), // esi // input regs
1874 "2" (dp), // edi
1875 "0" (width_mmx), // ecx
1876 "rim" (_const4), // 0x0000000000FFFFFFLL
1877 "rim" (_const6) // 0x00000000000000FFLL
1878
1879#if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1880 : "%mm0", "%mm1" // clobber list
1881 , "%mm2", "%mm3"
1882#endif
1883 );
1884 }
1885
1886 sptr -= width_mmx*3;
1887 dp -= width_mmx*6;
1888 for (i = width; i; i--)
1889 {
1890 png_byte v[8];
1891 int j;
1892
1893 png_memcpy(v, sptr, 3);
1894 for (j = 0; j < png_pass_inc[pass]; j++)
1895 {
1896 png_memcpy(dp, v, 3);
1897 dp -= 3;
1898 }
1899 sptr -= 3;
1900 }
1901 }
1902 } /* end of pixel_bytes == 3 */
1903
1904 //--------------------------------------------------------------
1905 else if (pixel_bytes == 1)
1906 {
1907 if (((pass == 0) || (pass == 1)) && width)
1908 {
1909 int width_mmx = ((width >> 2) << 2);
1910 width -= width_mmx; // 0-3 pixels => 0-3 bytes
1911 if (width_mmx)
1912 {
1913 int dummy_value_c; // fix 'forbidden register spilled'
1914 int dummy_value_S;
1915 int dummy_value_D;
1916
1917 __asm__ __volatile__ (
1918 "subl $3, %%esi \n\t"
1919 "subl $31, %%edi \n\t"
1920
1921 ".loop1_pass0: \n\t"
1922 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1923 "movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
1924 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1925 "movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
1926 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1927 "movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
1928 "punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
1929 "punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
1930 "movq %%mm0, (%%edi) \n\t"
1931 "punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
1932 "movq %%mm3, 8(%%edi) \n\t"
1933 "movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
1934 "punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
1935 "punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
1936 "movq %%mm2, 16(%%edi) \n\t"
1937 "subl $4, %%esi \n\t"
1938 "movq %%mm4, 24(%%edi) \n\t"
1939 "subl $32, %%edi \n\t"
1940 "subl $4, %%ecx \n\t"
1941 "jnz .loop1_pass0 \n\t"
1942 "EMMS \n\t" // DONE
1943
1944 : "=c" (dummy_value_c), // output regs (dummy)
1945 "=S" (dummy_value_S),
1946 "=D" (dummy_value_D)
1947
1948 : "1" (sptr), // esi // input regs
1949 "2" (dp), // edi
1950 "0" (width_mmx) // ecx
1951
1952#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1953 : "%mm0", "%mm1", "%mm2" // clobber list
1954 , "%mm3", "%mm4"
1955#endif
1956 );
1957 }
1958
1959 sptr -= width_mmx;
1960 dp -= width_mmx*8;
1961 for (i = width; i; i--)
1962 {
1963 int j;
1964
1965 /* I simplified this part in version 1.0.4e
1966 * here and in several other instances where
1967 * pixel_bytes == 1 -- GR-P
1968 *
1969 * Original code:
1970 *
1971 * png_byte v[8];
1972 * png_memcpy(v, sptr, pixel_bytes);
1973 * for (j = 0; j < png_pass_inc[pass]; j++)
1974 * {
1975 * png_memcpy(dp, v, pixel_bytes);
1976 * dp -= pixel_bytes;
1977 * }
1978 * sptr -= pixel_bytes;
1979 *
1980 * Replacement code is in the next three lines:
1981 */
1982
1983 for (j = 0; j < png_pass_inc[pass]; j++)
1984 {
1985 *dp-- = *sptr;
1986 }
1987 --sptr;
1988 }
1989 }
1990 else if (((pass == 2) || (pass == 3)) && width)
1991 {
1992 int width_mmx = ((width >> 2) << 2);
1993 width -= width_mmx; // 0-3 pixels => 0-3 bytes
1994 if (width_mmx)
1995 {
1996 int dummy_value_c; // fix 'forbidden register spilled'
1997 int dummy_value_S;
1998 int dummy_value_D;
1999
2000 __asm__ __volatile__ (
2001 "subl $3, %%esi \n\t"
2002 "subl $15, %%edi \n\t"
2003
2004 ".loop1_pass2: \n\t"
2005 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2006 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2007 "movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
2008 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
2009 "punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
2010 "movq %%mm0, (%%edi) \n\t"
2011 "subl $4, %%esi \n\t"
2012 "movq %%mm1, 8(%%edi) \n\t"
2013 "subl $16, %%edi \n\t"
2014 "subl $4, %%ecx \n\t"
2015 "jnz .loop1_pass2 \n\t"
2016 "EMMS \n\t" // DONE
2017
2018 : "=c" (dummy_value_c), // output regs (dummy)
2019 "=S" (dummy_value_S),
2020 "=D" (dummy_value_D)
2021
2022 : "1" (sptr), // esi // input regs
2023 "2" (dp), // edi
2024 "0" (width_mmx) // ecx
2025
2026#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2027 : "%mm0", "%mm1" // clobber list
2028#endif
2029 );
2030 }
2031
2032 sptr -= width_mmx;
2033 dp -= width_mmx*4;
2034 for (i = width; i; i--)
2035 {
2036 int j;
2037
2038 for (j = 0; j < png_pass_inc[pass]; j++)
2039 {
2040 *dp-- = *sptr;
2041 }
2042 --sptr;
2043 }
2044 }
2045 else if (width) /* && ((pass == 4) || (pass == 5)) */
2046 {
2047 int width_mmx = ((width >> 3) << 3);
2048 width -= width_mmx; // 0-7 pixels => 0-7 bytes
2049 if (width_mmx)
2050 {
2051 int dummy_value_c; // fix 'forbidden register spilled'
2052 int dummy_value_S;
2053 int dummy_value_D;
2054
2055 __asm__ __volatile__ (
2056 "subl $7, %%esi \n\t"
2057 "subl $15, %%edi \n\t"
2058
2059 ".loop1_pass4: \n\t"
2060 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2061 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2062 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2063 "punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
2064 "movq %%mm1, 8(%%edi) \n\t"
2065 "subl $8, %%esi \n\t"
2066 "movq %%mm0, (%%edi) \n\t"
2067 "subl $16, %%edi \n\t"
2068 "subl $8, %%ecx \n\t"
2069 "jnz .loop1_pass4 \n\t"
2070 "EMMS \n\t" // DONE
2071
2072 : "=c" (dummy_value_c), // output regs (dummy)
2073 "=S" (dummy_value_S),
2074 "=D" (dummy_value_D)
2075
2076 : "1" (sptr), // esi // input regs
2077 "2" (dp), // edi
2078 "0" (width_mmx) // ecx
2079
2080#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2081 : "%mm0", "%mm1" // clobber list
2082#endif
2083 );
2084 }
2085
2086 sptr -= width_mmx;
2087 dp -= width_mmx*2;
2088 for (i = width; i; i--)
2089 {
2090 int j;
2091
2092 for (j = 0; j < png_pass_inc[pass]; j++)
2093 {
2094 *dp-- = *sptr;
2095 }
2096 --sptr;
2097 }
2098 }
2099 } /* end of pixel_bytes == 1 */
2100
2101 //--------------------------------------------------------------
2102 else if (pixel_bytes == 2)
2103 {
2104 if (((pass == 0) || (pass == 1)) && width)
2105 {
2106 int width_mmx = ((width >> 1) << 1);
2107 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2108 if (width_mmx)
2109 {
2110 int dummy_value_c; // fix 'forbidden register spilled'
2111 int dummy_value_S;
2112 int dummy_value_D;
2113
2114 __asm__ __volatile__ (
2115 "subl $2, %%esi \n\t"
2116 "subl $30, %%edi \n\t"
2117
2118 ".loop2_pass0: \n\t"
2119 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2120 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2121 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2122 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2123 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2124 "movq %%mm0, (%%edi) \n\t"
2125 "movq %%mm0, 8(%%edi) \n\t"
2126 "movq %%mm1, 16(%%edi) \n\t"
2127 "subl $4, %%esi \n\t"
2128 "movq %%mm1, 24(%%edi) \n\t"
2129 "subl $32, %%edi \n\t"
2130 "subl $2, %%ecx \n\t"
2131 "jnz .loop2_pass0 \n\t"
2132 "EMMS \n\t" // DONE
2133
2134 : "=c" (dummy_value_c), // output regs (dummy)
2135 "=S" (dummy_value_S),
2136 "=D" (dummy_value_D)
2137
2138 : "1" (sptr), // esi // input regs
2139 "2" (dp), // edi
2140 "0" (width_mmx) // ecx
2141
2142#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2143 : "%mm0", "%mm1" // clobber list
2144#endif
2145 );
2146 }
2147
2148 sptr -= (width_mmx*2 - 2); // sign fixed
2149 dp -= (width_mmx*16 - 2); // sign fixed
2150 for (i = width; i; i--)
2151 {
2152 png_byte v[8];
2153 int j;
2154 sptr -= 2;
2155 png_memcpy(v, sptr, 2);
2156 for (j = 0; j < png_pass_inc[pass]; j++)
2157 {
2158 dp -= 2;
2159 png_memcpy(dp, v, 2);
2160 }
2161 }
2162 }
2163 else if (((pass == 2) || (pass == 3)) && width)
2164 {
2165 int width_mmx = ((width >> 1) << 1);
2166 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2167 if (width_mmx)
2168 {
2169 int dummy_value_c; // fix 'forbidden register spilled'
2170 int dummy_value_S;
2171 int dummy_value_D;
2172
2173 __asm__ __volatile__ (
2174 "subl $2, %%esi \n\t"
2175 "subl $14, %%edi \n\t"
2176
2177 ".loop2_pass2: \n\t"
2178 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2179 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2180 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2181 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2182 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2183 "movq %%mm0, (%%edi) \n\t"
2184 "subl $4, %%esi \n\t"
2185 "movq %%mm1, 8(%%edi) \n\t"
2186 "subl $16, %%edi \n\t"
2187 "subl $2, %%ecx \n\t"
2188 "jnz .loop2_pass2 \n\t"
2189 "EMMS \n\t" // DONE
2190
2191 : "=c" (dummy_value_c), // output regs (dummy)
2192 "=S" (dummy_value_S),
2193 "=D" (dummy_value_D)
2194
2195 : "1" (sptr), // esi // input regs
2196 "2" (dp), // edi
2197 "0" (width_mmx) // ecx
2198
2199#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2200 : "%mm0", "%mm1" // clobber list
2201#endif
2202 );
2203 }
2204
2205 sptr -= (width_mmx*2 - 2); // sign fixed
2206 dp -= (width_mmx*8 - 2); // sign fixed
2207 for (i = width; i; i--)
2208 {
2209 png_byte v[8];
2210 int j;
2211 sptr -= 2;
2212 png_memcpy(v, sptr, 2);
2213 for (j = 0; j < png_pass_inc[pass]; j++)
2214 {
2215 dp -= 2;
2216 png_memcpy(dp, v, 2);
2217 }
2218 }
2219 }
2220 else if (width) // pass == 4 or 5
2221 {
2222 int width_mmx = ((width >> 1) << 1);
2223 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2224 if (width_mmx)
2225 {
2226 int dummy_value_c; // fix 'forbidden register spilled'
2227 int dummy_value_S;
2228 int dummy_value_D;
2229
2230 __asm__ __volatile__ (
2231 "subl $2, %%esi \n\t"
2232 "subl $6, %%edi \n\t"
2233
2234 ".loop2_pass4: \n\t"
2235 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2236 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2237 "subl $4, %%esi \n\t"
2238 "movq %%mm0, (%%edi) \n\t"
2239 "subl $8, %%edi \n\t"
2240 "subl $2, %%ecx \n\t"
2241 "jnz .loop2_pass4 \n\t"
2242 "EMMS \n\t" // DONE
2243
2244 : "=c" (dummy_value_c), // output regs (dummy)
2245 "=S" (dummy_value_S),
2246 "=D" (dummy_value_D)
2247
2248 : "1" (sptr), // esi // input regs
2249 "2" (dp), // edi
2250 "0" (width_mmx) // ecx
2251
2252#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2253 : "%mm0" // clobber list
2254#endif
2255 );
2256 }
2257
2258 sptr -= (width_mmx*2 - 2); // sign fixed
2259 dp -= (width_mmx*4 - 2); // sign fixed
2260 for (i = width; i; i--)
2261 {
2262 png_byte v[8];
2263 int j;
2264 sptr -= 2;
2265 png_memcpy(v, sptr, 2);
2266 for (j = 0; j < png_pass_inc[pass]; j++)
2267 {
2268 dp -= 2;
2269 png_memcpy(dp, v, 2);
2270 }
2271 }
2272 }
2273 } /* end of pixel_bytes == 2 */
2274
2275 //--------------------------------------------------------------
2276 else if (pixel_bytes == 4)
2277 {
2278 if (((pass == 0) || (pass == 1)) && width)
2279 {
2280 int width_mmx = ((width >> 1) << 1);
2281 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2282 if (width_mmx)
2283 {
2284 int dummy_value_c; // fix 'forbidden register spilled'
2285 int dummy_value_S;
2286 int dummy_value_D;
2287
2288 __asm__ __volatile__ (
2289 "subl $4, %%esi \n\t"
2290 "subl $60, %%edi \n\t"
2291
2292 ".loop4_pass0: \n\t"
2293 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2294 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2295 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2296 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2297 "movq %%mm0, (%%edi) \n\t"
2298 "movq %%mm0, 8(%%edi) \n\t"
2299 "movq %%mm0, 16(%%edi) \n\t"
2300 "movq %%mm0, 24(%%edi) \n\t"
2301 "movq %%mm1, 32(%%edi) \n\t"
2302 "movq %%mm1, 40(%%edi) \n\t"
2303 "movq %%mm1, 48(%%edi) \n\t"
2304 "subl $8, %%esi \n\t"
2305 "movq %%mm1, 56(%%edi) \n\t"
2306 "subl $64, %%edi \n\t"
2307 "subl $2, %%ecx \n\t"
2308 "jnz .loop4_pass0 \n\t"
2309 "EMMS \n\t" // DONE
2310
2311 : "=c" (dummy_value_c), // output regs (dummy)
2312 "=S" (dummy_value_S),
2313 "=D" (dummy_value_D)
2314
2315 : "1" (sptr), // esi // input regs
2316 "2" (dp), // edi
2317 "0" (width_mmx) // ecx
2318
2319#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2320 : "%mm0", "%mm1" // clobber list
2321#endif
2322 );
2323 }
2324
2325 sptr -= (width_mmx*4 - 4); // sign fixed
2326 dp -= (width_mmx*32 - 4); // sign fixed
2327 for (i = width; i; i--)
2328 {
2329 png_byte v[8];
2330 int j;
2331 sptr -= 4;
2332 png_memcpy(v, sptr, 4);
2333 for (j = 0; j < png_pass_inc[pass]; j++)
2334 {
2335 dp -= 4;
2336 png_memcpy(dp, v, 4);
2337 }
2338 }
2339 }
2340 else if (((pass == 2) || (pass == 3)) && width)
2341 {
2342 int width_mmx = ((width >> 1) << 1);
2343 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2344 if (width_mmx)
2345 {
2346 int dummy_value_c; // fix 'forbidden register spilled'
2347 int dummy_value_S;
2348 int dummy_value_D;
2349
2350 __asm__ __volatile__ (
2351 "subl $4, %%esi \n\t"
2352 "subl $28, %%edi \n\t"
2353
2354 ".loop4_pass2: \n\t"
2355 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2356 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2357 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2358 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2359 "movq %%mm0, (%%edi) \n\t"
2360 "movq %%mm0, 8(%%edi) \n\t"
2361 "movq %%mm1, 16(%%edi) \n\t"
2362 "movq %%mm1, 24(%%edi) \n\t"
2363 "subl $8, %%esi \n\t"
2364 "subl $32, %%edi \n\t"
2365 "subl $2, %%ecx \n\t"
2366 "jnz .loop4_pass2 \n\t"
2367 "EMMS \n\t" // DONE
2368
2369 : "=c" (dummy_value_c), // output regs (dummy)
2370 "=S" (dummy_value_S),
2371 "=D" (dummy_value_D)
2372
2373 : "1" (sptr), // esi // input regs
2374 "2" (dp), // edi
2375 "0" (width_mmx) // ecx
2376
2377#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2378 : "%mm0", "%mm1" // clobber list
2379#endif
2380 );
2381 }
2382
2383 sptr -= (width_mmx*4 - 4); // sign fixed
2384 dp -= (width_mmx*16 - 4); // sign fixed
2385 for (i = width; i; i--)
2386 {
2387 png_byte v[8];
2388 int j;
2389 sptr -= 4;
2390 png_memcpy(v, sptr, 4);
2391 for (j = 0; j < png_pass_inc[pass]; j++)
2392 {
2393 dp -= 4;
2394 png_memcpy(dp, v, 4);
2395 }
2396 }
2397 }
2398 else if (width) // pass == 4 or 5
2399 {
2400 int width_mmx = ((width >> 1) << 1);
2401 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2402 if (width_mmx)
2403 {
2404 int dummy_value_c; // fix 'forbidden register spilled'
2405 int dummy_value_S;
2406 int dummy_value_D;
2407
2408 __asm__ __volatile__ (
2409 "subl $4, %%esi \n\t"
2410 "subl $12, %%edi \n\t"
2411
2412 ".loop4_pass4: \n\t"
2413 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2414 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2415 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2416 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2417 "movq %%mm0, (%%edi) \n\t"
2418 "subl $8, %%esi \n\t"
2419 "movq %%mm1, 8(%%edi) \n\t"
2420 "subl $16, %%edi \n\t"
2421 "subl $2, %%ecx \n\t"
2422 "jnz .loop4_pass4 \n\t"
2423 "EMMS \n\t" // DONE
2424
2425 : "=c" (dummy_value_c), // output regs (dummy)
2426 "=S" (dummy_value_S),
2427 "=D" (dummy_value_D)
2428
2429 : "1" (sptr), // esi // input regs
2430 "2" (dp), // edi
2431 "0" (width_mmx) // ecx
2432
2433#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2434 : "%mm0", "%mm1" // clobber list
2435#endif
2436 );
2437 }
2438
2439 sptr -= (width_mmx*4 - 4); // sign fixed
2440 dp -= (width_mmx*8 - 4); // sign fixed
2441 for (i = width; i; i--)
2442 {
2443 png_byte v[8];
2444 int j;
2445 sptr -= 4;
2446 png_memcpy(v, sptr, 4);
2447 for (j = 0; j < png_pass_inc[pass]; j++)
2448 {
2449 dp -= 4;
2450 png_memcpy(dp, v, 4);
2451 }
2452 }
2453 }
2454 } /* end of pixel_bytes == 4 */
2455
2456 //--------------------------------------------------------------
2457 else if (pixel_bytes == 8)
2458 {
2459// GRR TEST: should work, but needs testing (special 64-bit version of rpng2?)
2460 // GRR NOTE: no need to combine passes here!
2461 if (((pass == 0) || (pass == 1)) && width)
2462 {
2463 int dummy_value_c; // fix 'forbidden register spilled'
2464 int dummy_value_S;
2465 int dummy_value_D;
2466
2467 // source is 8-byte RRGGBBAA
2468 // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2469 __asm__ __volatile__ (
2470 "subl $56, %%edi \n\t" // start of last block
2471
2472 ".loop8_pass0: \n\t"
2473 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2474 "movq %%mm0, (%%edi) \n\t"
2475 "movq %%mm0, 8(%%edi) \n\t"
2476 "movq %%mm0, 16(%%edi) \n\t"
2477 "movq %%mm0, 24(%%edi) \n\t"
2478 "movq %%mm0, 32(%%edi) \n\t"
2479 "movq %%mm0, 40(%%edi) \n\t"
2480 "movq %%mm0, 48(%%edi) \n\t"
2481 "subl $8, %%esi \n\t"
2482 "movq %%mm0, 56(%%edi) \n\t"
2483 "subl $64, %%edi \n\t"
2484 "decl %%ecx \n\t"
2485 "jnz .loop8_pass0 \n\t"
2486 "EMMS \n\t" // DONE
2487
2488 : "=c" (dummy_value_c), // output regs (dummy)
2489 "=S" (dummy_value_S),
2490 "=D" (dummy_value_D)
2491
2492 : "1" (sptr), // esi // input regs
2493 "2" (dp), // edi
2494 "0" (width) // ecx
2495
2496#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2497 : "%mm0" // clobber list
2498#endif
2499 );
2500 }
2501 else if (((pass == 2) || (pass == 3)) && width)
2502 {
2503 // source is 8-byte RRGGBBAA
2504 // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2505 // (recall that expansion is _in place_: sptr and dp
2506 // both point at locations within same row buffer)
2507 {
2508 int dummy_value_c; // fix 'forbidden register spilled'
2509 int dummy_value_S;
2510 int dummy_value_D;
2511
2512 __asm__ __volatile__ (
2513 "subl $24, %%edi \n\t" // start of last block
2514
2515 ".loop8_pass2: \n\t"
2516 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2517 "movq %%mm0, (%%edi) \n\t"
2518 "movq %%mm0, 8(%%edi) \n\t"
2519 "movq %%mm0, 16(%%edi) \n\t"
2520 "subl $8, %%esi \n\t"
2521 "movq %%mm0, 24(%%edi) \n\t"
2522 "subl $32, %%edi \n\t"
2523 "decl %%ecx \n\t"
2524 "jnz .loop8_pass2 \n\t"
2525 "EMMS \n\t" // DONE
2526
2527 : "=c" (dummy_value_c), // output regs (dummy)
2528 "=S" (dummy_value_S),
2529 "=D" (dummy_value_D)
2530
2531 : "1" (sptr), // esi // input regs
2532 "2" (dp), // edi
2533 "0" (width) // ecx
2534
2535#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2536 : "%mm0" // clobber list
2537#endif
2538 );
2539 }
2540 }
2541 else if (width) // pass == 4 or 5
2542 {
2543 // source is 8-byte RRGGBBAA
2544 // dest is 16-byte RRGGBBAA RRGGBBAA
2545 {
2546 int dummy_value_c; // fix 'forbidden register spilled'
2547 int dummy_value_S;
2548 int dummy_value_D;
2549
2550 __asm__ __volatile__ (
2551 "subl $8, %%edi \n\t" // start of last block
2552
2553 ".loop8_pass4: \n\t"
2554 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2555 "movq %%mm0, (%%edi) \n\t"
2556 "subl $8, %%esi \n\t"
2557 "movq %%mm0, 8(%%edi) \n\t"
2558 "subl $16, %%edi \n\t"
2559 "decl %%ecx \n\t"
2560 "jnz .loop8_pass4 \n\t"
2561 "EMMS \n\t" // DONE
2562
2563 : "=c" (dummy_value_c), // output regs (dummy)
2564 "=S" (dummy_value_S),
2565 "=D" (dummy_value_D)
2566
2567 : "1" (sptr), // esi // input regs
2568 "2" (dp), // edi
2569 "0" (width) // ecx
2570
2571#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2572 : "%mm0" // clobber list
2573#endif
2574 );
2575 }
2576 }
2577
2578 } /* end of pixel_bytes == 8 */
2579
2580 //--------------------------------------------------------------
2581 else if (pixel_bytes == 6)
2582 {
2583 for (i = width; i; i--)
2584 {
2585 png_byte v[8];
2586 int j;
2587 png_memcpy(v, sptr, 6);
2588 for (j = 0; j < png_pass_inc[pass]; j++)
2589 {
2590 png_memcpy(dp, v, 6);
2591 dp -= 6;
2592 }
2593 sptr -= 6;
2594 }
2595 } /* end of pixel_bytes == 6 */
2596
2597 //--------------------------------------------------------------
2598 else
2599 {
2600 for (i = width; i; i--)
2601 {
2602 png_byte v[8];
2603 int j;
2604 png_memcpy(v, sptr, pixel_bytes);
2605 for (j = 0; j < png_pass_inc[pass]; j++)
2606 {
2607 png_memcpy(dp, v, pixel_bytes);
2608 dp -= pixel_bytes;
2609 }
2610 sptr -= pixel_bytes;
2611 }
2612 }
2613 } // end of _mmx_supported ========================================
2614
2615 else /* MMX not supported: use modified C code - takes advantage
2616 * of inlining of png_memcpy for a constant */
2617 /* GRR 19991007: does it? or should pixel_bytes in each
2618 * block be replaced with immediate value (e.g., 1)? */
2619 /* GRR 19991017: replaced with constants in each case */
2620#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
2621 {
2622 if (pixel_bytes == 1)
2623 {
2624 for (i = width; i; i--)
2625 {
2626 int j;
2627 for (j = 0; j < png_pass_inc[pass]; j++)
2628 {
2629 *dp-- = *sptr;
2630 }
2631 --sptr;
2632 }
2633 }
2634 else if (pixel_bytes == 3)
2635 {
2636 for (i = width; i; i--)
2637 {
2638 png_byte v[8];
2639 int j;
2640 png_memcpy(v, sptr, 3);
2641 for (j = 0; j < png_pass_inc[pass]; j++)
2642 {
2643 png_memcpy(dp, v, 3);
2644 dp -= 3;
2645 }
2646 sptr -= 3;
2647 }
2648 }
2649 else if (pixel_bytes == 2)
2650 {
2651 for (i = width; i; i--)
2652 {
2653 png_byte v[8];
2654 int j;
2655 png_memcpy(v, sptr, 2);
2656 for (j = 0; j < png_pass_inc[pass]; j++)
2657 {
2658 png_memcpy(dp, v, 2);
2659 dp -= 2;
2660 }
2661 sptr -= 2;
2662 }
2663 }
2664 else if (pixel_bytes == 4)
2665 {
2666 for (i = width; i; i--)
2667 {
2668 png_byte v[8];
2669 int j;
2670 png_memcpy(v, sptr, 4);
2671 for (j = 0; j < png_pass_inc[pass]; j++)
2672 {
2673#ifdef PNG_DEBUG
2674 if (dp < row || dp+3 > row+png_ptr->row_buf_size)
2675 {
2676 printf("dp out of bounds: row=%p, dp=%p, rp=%p\n",
2677 (void *)row, (void *)dp, (void *)(row+png_ptr->row_buf_size));
2678 printf("row_buf=%lu\n", (unsigned long)png_ptr->row_buf_size);
2679 }
2680#endif
2681 png_memcpy(dp, v, 4);
2682 dp -= 4;
2683 }
2684 sptr -= 4;
2685 }
2686 }
2687 else if (pixel_bytes == 6)
2688 {
2689 for (i = width; i; i--)
2690 {
2691 png_byte v[8];
2692 int j;
2693 png_memcpy(v, sptr, 6);
2694 for (j = 0; j < png_pass_inc[pass]; j++)
2695 {
2696 png_memcpy(dp, v, 6);
2697 dp -= 6;
2698 }
2699 sptr -= 6;
2700 }
2701 }
2702 else if (pixel_bytes == 8)
2703 {
2704 for (i = width; i; i--)
2705 {
2706 png_byte v[8];
2707 int j;
2708 png_memcpy(v, sptr, 8);
2709 for (j = 0; j < png_pass_inc[pass]; j++)
2710 {
2711 png_memcpy(dp, v, 8);
2712 dp -= 8;
2713 }
2714 sptr -= 8;
2715 }
2716 }
2717 else /* GRR: should never be reached */
2718 {
2719 for (i = width; i; i--)
2720 {
2721 png_byte v[8];
2722 int j;
2723 png_memcpy(v, sptr, pixel_bytes);
2724 for (j = 0; j < png_pass_inc[pass]; j++)
2725 {
2726 png_memcpy(dp, v, pixel_bytes);
2727 dp -= pixel_bytes;
2728 }
2729 sptr -= pixel_bytes;
2730 }
2731 }
2732
2733 } /* end if (MMX not supported) */
2734 break;
2735 }
2736 } /* end switch (row_info->pixel_depth) */
2737
2738 row_info->width = final_width;
2739
2740 row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
2741 }
2742
2743} /* end png_do_read_interlace() */
2744
2745#endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2746#endif /* PNG_READ_INTERLACING_SUPPORTED */
2747
2748
2749
2750#if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
2751#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
2752
2753// These variables are used in the functions below. They are declared
2754// globally here to ensure alignment on 8-byte boundaries.
2755
2756union uAll {
2757 long long use;
2758 double align;
2759} _LBCarryMask = {0x0101010101010101LL},
2760 _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
2761 _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
2762
2763#ifdef PNG_THREAD_UNSAFE_OK
2764//===========================================================================//
2765// //
2766// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G //
2767// //
2768//===========================================================================//
2769
2770// Optimized code for PNG Average filter decoder
2771
2772static void /* PRIVATE */
2773png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2774 png_bytep prev_row)
2775{
2776 int bpp;
2777 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
2778 int dummy_value_S;
2779 int dummy_value_D;
2780
2781 bpp = (row_info->pixel_depth + 7) >> 3; // get # bytes per pixel
2782 _FullLength = row_info->rowbytes; // # of bytes to filter
2783
2784 __asm__ __volatile__ (
2785 // initialize address pointers and offset
2786#ifdef __PIC__
2787 "pushl %%ebx \n\t" // save index to Global Offset Table
2788#endif
2789//pre "movl row, %%edi \n\t" // edi: Avg(x)
2790 "xorl %%ebx, %%ebx \n\t" // ebx: x
2791 "movl %%edi, %%edx \n\t"
2792//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
2793//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
2794 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
2795
2796 "xorl %%eax,%%eax \n\t"
2797
2798 // Compute the Raw value for the first bpp bytes
2799 // Raw(x) = Avg(x) + (Prior(x)/2)
2800 "avg_rlp: \n\t"
2801 "movb (%%esi,%%ebx,),%%al \n\t" // load al with Prior(x)
2802 "incl %%ebx \n\t"
2803 "shrb %%al \n\t" // divide by 2
2804 "addb -1(%%edi,%%ebx,),%%al \n\t" // add Avg(x); -1 to offset inc ebx
2805//pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
2806 "cmpl %%ecx, %%ebx \n\t"
2807 "movb %%al,-1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2808 "jb avg_rlp \n\t" // mov does not affect flags
2809
2810 // get # of bytes to alignment
2811 "movl %%edi, _dif \n\t" // take start of row
2812 "addl %%ebx, _dif \n\t" // add bpp
2813 "addl $0xf, _dif \n\t" // add 7+8 to incr past alignment bdry
2814 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
2815 "subl %%edi, _dif \n\t" // subtract from start => value ebx at
2816 "jz avg_go \n\t" // alignment
2817
2818 // fix alignment
2819 // Compute the Raw value for the bytes up to the alignment boundary
2820 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2821 "xorl %%ecx, %%ecx \n\t"
2822
2823 "avg_lp1: \n\t"
2824 "xorl %%eax, %%eax \n\t"
2825 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
2826 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
2827 "addw %%cx, %%ax \n\t"
2828 "incl %%ebx \n\t"
2829 "shrw %%ax \n\t" // divide by 2
2830 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2831 "cmpl _dif, %%ebx \n\t" // check if at alignment boundary
2832 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2833 "jb avg_lp1 \n\t" // repeat until at alignment boundary
2834
2835 "avg_go: \n\t"
2836 "movl _FullLength, %%eax \n\t"
2837 "movl %%eax, %%ecx \n\t"
2838 "subl %%ebx, %%eax \n\t" // subtract alignment fix
2839 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
2840 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
2841 "movl %%ecx, _MMXLength \n\t"
2842#ifdef __PIC__
2843 "popl %%ebx \n\t" // restore index to Global Offset Table
2844#endif
2845
2846 : "=c" (dummy_value_c), // output regs (dummy)
2847 "=S" (dummy_value_S),
2848 "=D" (dummy_value_D)
2849
2850 : "0" (bpp), // ecx // input regs
2851 "1" (prev_row), // esi
2852 "2" (row) // edi
2853
2854 : "%eax", "%edx" // clobber list
2855#ifndef __PIC__
2856 , "%ebx"
2857#endif
2858 // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
2859 // (seems to work fine without...)
2860 );
2861
2862 // now do the math for the rest of the row
2863 switch (bpp)
2864 {
2865 case 3:
2866 {
2867 _ActiveMask.use = 0x0000000000ffffffLL;
2868 _ShiftBpp.use = 24; // == 3 * 8
2869 _ShiftRem.use = 40; // == 64 - 24
2870
2871 __asm__ __volatile__ (
2872 // re-init address pointers and offset
2873 "movq _ActiveMask, %%mm7 \n\t"
2874 "movl _dif, %%ecx \n\t" // ecx: x = offset to
2875 "movq _LBCarryMask, %%mm5 \n\t" // alignment boundary
2876// preload "movl row, %%edi \n\t" // edi: Avg(x)
2877 "movq _HBClearMask, %%mm4 \n\t"
2878// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2879
2880 // prime the pump: load the first Raw(x-bpp) data set
2881 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2882 // (correct pos. in loop below)
2883 "avg_3lp: \n\t"
2884 "movq (%%edi,%%ecx,), %%mm0 \n\t" // load mm0 with Avg(x)
2885 "movq %%mm5, %%mm3 \n\t"
2886 "psrlq _ShiftRem, %%mm2 \n\t" // correct position Raw(x-bpp)
2887 // data
2888 "movq (%%esi,%%ecx,), %%mm1 \n\t" // load mm1 with Prior(x)
2889 "movq %%mm7, %%mm6 \n\t"
2890 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2891 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2892 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
2893 // byte
2894 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
2895 // each byte
2896 // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2897 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2898 // LBCarrys
2899 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2900 // where both
2901 // lsb's were == 1 (only valid for active group)
2902 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2903 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2904 // byte
2905 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2906 // for each byte
2907 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
2908 // bytes to add to Avg
2909 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2910 // Avg for each Active
2911 // byte
2912 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2913 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
2914 // bytes 3-5
2915 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2916 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2917 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2918 // LBCarrys
2919 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2920 // where both
2921 // lsb's were == 1 (only valid for active group)
2922 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2923 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2924 // byte
2925 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2926 // for each byte
2927 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
2928 // bytes to add to Avg
2929 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2930 // Avg for each Active
2931 // byte
2932
2933 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2934 "psllq _ShiftBpp, %%mm6 \n\t" // shift mm6 mask to cover last
2935 // two
2936 // bytes
2937 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2938 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2939 // Data only needs to be shifted once here to
2940 // get the correct x-bpp offset.
2941 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2942 // LBCarrys
2943 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2944 // where both
2945 // lsb's were == 1 (only valid for active group)
2946 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2947 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2948 // byte
2949 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2950 // for each byte
2951 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 3
2952 // bytes to add to Avg
2953 "addl $8, %%ecx \n\t"
2954 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2955 // Avg for each Active
2956 // byte
2957 // now ready to write back to memory
2958 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2959 // move updated Raw(x) to use as Raw(x-bpp) for next loop
2960 "cmpl _MMXLength, %%ecx \n\t"
2961 "movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2
2962 "jb avg_3lp \n\t"
2963
2964 : "=S" (dummy_value_S), // output regs (dummy)
2965 "=D" (dummy_value_D)
2966
2967 : "0" (prev_row), // esi // input regs
2968 "1" (row) // edi
2969
2970 : "%ecx" // clobber list
2971#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2972 , "%mm0", "%mm1", "%mm2", "%mm3"
2973 , "%mm4", "%mm5", "%mm6", "%mm7"
2974#endif
2975 );
2976 }
2977 break; // end 3 bpp
2978
2979 case 6:
2980 case 4:
2981 //case 7: // who wrote this? PNG doesn't support 5 or 7 bytes/pixel
2982 //case 5: // GRR BOGUS
2983 {
2984 _ActiveMask.use = 0xffffffffffffffffLL; // use shift below to clear
2985 // appropriate inactive bytes
2986 _ShiftBpp.use = bpp << 3;
2987 _ShiftRem.use = 64 - _ShiftBpp.use;
2988
2989 __asm__ __volatile__ (
2990 "movq _HBClearMask, %%mm4 \n\t"
2991
2992 // re-init address pointers and offset
2993 "movl _dif, %%ecx \n\t" // ecx: x = offset to
2994 // alignment boundary
2995
2996 // load _ActiveMask and clear all bytes except for 1st active group
2997 "movq _ActiveMask, %%mm7 \n\t"
2998// preload "movl row, %%edi \n\t" // edi: Avg(x)
2999 "psrlq _ShiftRem, %%mm7 \n\t"
3000// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3001 "movq %%mm7, %%mm6 \n\t"
3002 "movq _LBCarryMask, %%mm5 \n\t"
3003 "psllq _ShiftBpp, %%mm6 \n\t" // create mask for 2nd active
3004 // group
3005
3006 // prime the pump: load the first Raw(x-bpp) data set
3007 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3008 // (we correct pos. in loop below)
3009 "avg_4lp: \n\t"
3010 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3011 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
3012 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3013 // add (Prev_row/2) to average
3014 "movq %%mm5, %%mm3 \n\t"
3015 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3016 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3017 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3018 // byte
3019 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3020 // each byte
3021 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3022 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3023 // LBCarrys
3024 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3025 // where both
3026 // lsb's were == 1 (only valid for active group)
3027 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3028 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3029 // byte
3030 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3031 // for each byte
3032 "pand %%mm7, %%mm2 \n\t" // leave only Active Group 1
3033 // bytes to add to Avg
3034 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3035 // for each Active
3036 // byte
3037 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3038 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3039 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3040 "addl $8, %%ecx \n\t"
3041 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3042 // LBCarrys
3043 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3044 // where both
3045 // lsb's were == 1 (only valid for active group)
3046 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3047 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3048 // byte
3049 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3050 // for each byte
3051 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3052 // bytes to add to Avg
3053 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3054 // Avg for each Active
3055 // byte
3056 "cmpl _MMXLength, %%ecx \n\t"
3057 // now ready to write back to memory
3058 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3059 // prep Raw(x-bpp) for next loop
3060 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3061 "jb avg_4lp \n\t"
3062
3063 : "=S" (dummy_value_S), // output regs (dummy)
3064 "=D" (dummy_value_D)
3065
3066 : "0" (prev_row), // esi // input regs
3067 "1" (row) // edi
3068
3069 : "%ecx" // clobber list
3070#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3071 , "%mm0", "%mm1", "%mm2", "%mm3"
3072 , "%mm4", "%mm5", "%mm6", "%mm7"
3073#endif
3074 );
3075 }
3076 break; // end 4,6 bpp
3077
3078 case 2:
3079 {
3080 _ActiveMask.use = 0x000000000000ffffLL;
3081 _ShiftBpp.use = 16; // == 2 * 8
3082 _ShiftRem.use = 48; // == 64 - 16
3083
3084 __asm__ __volatile__ (
3085 // load _ActiveMask
3086 "movq _ActiveMask, %%mm7 \n\t"
3087 // re-init address pointers and offset
3088 "movl _dif, %%ecx \n\t" // ecx: x = offset to alignment
3089 // boundary
3090 "movq _LBCarryMask, %%mm5 \n\t"
3091// preload "movl row, %%edi \n\t" // edi: Avg(x)
3092 "movq _HBClearMask, %%mm4 \n\t"
3093// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3094
3095 // prime the pump: load the first Raw(x-bpp) data set
3096 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3097 // (we correct pos. in loop below)
3098 "avg_2lp: \n\t"
3099 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3100 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
3101 "movq (%%esi,%%ecx,), %%mm1 \n\t" // (GRR BUGFIX: was psllq)
3102 // add (Prev_row/2) to average
3103 "movq %%mm5, %%mm3 \n\t"
3104 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3105 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3106 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3107 // byte
3108 "movq %%mm7, %%mm6 \n\t"
3109 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3110 // each byte
3111
3112 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3113 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3114 // LBCarrys
3115 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3116 // where both
3117 // lsb's were == 1 (only valid
3118 // for active group)
3119 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3120 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3121 // byte
3122 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3123 // for each byte
3124 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
3125 // bytes to add to Avg
3126 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3127 // for each Active byte
3128
3129 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3130 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3131 // bytes 2 & 3
3132 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3133 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3134 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3135 // LBCarrys
3136 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3137 // where both
3138 // lsb's were == 1 (only valid
3139 // for active group)
3140 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3141 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3142 // byte
3143 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3144 // for each byte
3145 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3146 // bytes to add to Avg
3147 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3148 // Avg for each Active byte
3149
3150 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3151 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3152 // bytes 4 & 5
3153 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3154 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3155 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3156 // LBCarrys
3157 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3158 // where both lsb's were == 1
3159 // (only valid for active group)
3160 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3161 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3162 // byte
3163 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3164 // for each byte
3165 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 3
3166 // bytes to add to Avg
3167 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3168 // Avg for each Active byte
3169
3170 // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
3171 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3172 // bytes 6 & 7
3173 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3174 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3175 "addl $8, %%ecx \n\t"
3176 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3177 // LBCarrys
3178 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3179 // where both
3180 // lsb's were == 1 (only valid
3181 // for active group)
3182 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3183 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3184 // byte
3185 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3186 // for each byte
3187 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 4
3188 // bytes to add to Avg
3189 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3190 // Avg for each Active byte
3191
3192 "cmpl _MMXLength, %%ecx \n\t"
3193 // now ready to write back to memory
3194 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3195 // prep Raw(x-bpp) for next loop
3196 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3197 "jb avg_2lp \n\t"
3198
3199 : "=S" (dummy_value_S), // output regs (dummy)
3200 "=D" (dummy_value_D)
3201
3202 : "0" (prev_row), // esi // input regs
3203 "1" (row) // edi
3204
3205 : "%ecx" // clobber list
3206#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3207 , "%mm0", "%mm1", "%mm2", "%mm3"
3208 , "%mm4", "%mm5", "%mm6", "%mm7"
3209#endif
3210 );
3211 }
3212 break; // end 2 bpp
3213
3214 case 1:
3215 {
3216 __asm__ __volatile__ (
3217 // re-init address pointers and offset
3218#ifdef __PIC__
3219 "pushl %%ebx \n\t" // save Global Offset Table index
3220#endif
3221 "movl _dif, %%ebx \n\t" // ebx: x = offset to alignment
3222 // boundary
3223// preload "movl row, %%edi \n\t" // edi: Avg(x)
3224 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3225 "jnb avg_1end \n\t"
3226 // do Avg decode for remaining bytes
3227// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3228 "movl %%edi, %%edx \n\t"
3229// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3230 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3231 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
3232 // in loop below
3233 "avg_1lp: \n\t"
3234 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3235 "xorl %%eax, %%eax \n\t"
3236 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3237 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3238 "addw %%cx, %%ax \n\t"
3239 "incl %%ebx \n\t"
3240 "shrw %%ax \n\t" // divide by 2
3241 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
3242 // inc ebx
3243 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
3244 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
3245 // mov does not affect flags; -1 to offset inc ebx
3246 "jb avg_1lp \n\t"
3247
3248 "avg_1end: \n\t"
3249#ifdef __PIC__
3250 "popl %%ebx \n\t" // Global Offset Table index
3251#endif
3252
3253 : "=c" (dummy_value_c), // output regs (dummy)
3254 "=S" (dummy_value_S),
3255 "=D" (dummy_value_D)
3256
3257 : "0" (bpp), // ecx // input regs
3258 "1" (prev_row), // esi
3259 "2" (row) // edi
3260
3261 : "%eax", "%edx" // clobber list
3262#ifndef __PIC__
3263 , "%ebx"
3264#endif
3265 );
3266 }
3267 return; // end 1 bpp
3268
3269 case 8:
3270 {
3271 __asm__ __volatile__ (
3272 // re-init address pointers and offset
3273 "movl _dif, %%ecx \n\t" // ecx: x == offset to alignment
3274 "movq _LBCarryMask, %%mm5 \n\t" // boundary
3275// preload "movl row, %%edi \n\t" // edi: Avg(x)
3276 "movq _HBClearMask, %%mm4 \n\t"
3277// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3278
3279 // prime the pump: load the first Raw(x-bpp) data set
3280 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3281 // (NO NEED to correct pos. in loop below)
3282
3283 "avg_8lp: \n\t"
3284 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3285 "movq %%mm5, %%mm3 \n\t"
3286 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3287 "addl $8, %%ecx \n\t"
3288 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3289 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3290 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3291 // where both lsb's were == 1
3292 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3293 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7, each byte
3294 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg, each byte
3295 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7, each byte
3296 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each
3297 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3298 "cmpl _MMXLength, %%ecx \n\t"
3299 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3300 "movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
3301 "jb avg_8lp \n\t"
3302
3303 : "=S" (dummy_value_S), // output regs (dummy)
3304 "=D" (dummy_value_D)
3305
3306 : "0" (prev_row), // esi // input regs
3307 "1" (row) // edi
3308
3309 : "%ecx" // clobber list
3310#if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3311 , "%mm0", "%mm1", "%mm2"
3312 , "%mm3", "%mm4", "%mm5"
3313#endif
3314 );
3315 }
3316 break; // end 8 bpp
3317
3318 default: // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
3319 {
3320
3321#ifdef PNG_DEBUG
3322 // GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED
3323 png_debug(1,
3324 "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
3325#endif
3326
3327#if 0
3328 __asm__ __volatile__ (
3329 "movq _LBCarryMask, %%mm5 \n\t"
3330 // re-init address pointers and offset
3331 "movl _dif, %%ebx \n\t" // ebx: x = offset to
3332 // alignment boundary
3333 "movl row, %%edi \n\t" // edi: Avg(x)
3334 "movq _HBClearMask, %%mm4 \n\t"
3335 "movl %%edi, %%edx \n\t"
3336 "movl prev_row, %%esi \n\t" // esi: Prior(x)
3337 "subl bpp, %%edx \n\t" // edx: Raw(x-bpp)
3338 "avg_Alp: \n\t"
3339 "movq (%%edi,%%ebx,), %%mm0 \n\t"
3340 "movq %%mm5, %%mm3 \n\t"
3341 "movq (%%esi,%%ebx,), %%mm1 \n\t"
3342 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3343 "movq (%%edx,%%ebx,), %%mm2 \n\t"
3344 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3345 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3346 // where both lsb's were == 1
3347 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3348 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3349 // byte
3350 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg for each
3351 // byte
3352 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3353 // byte
3354 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3355 // each byte
3356 "addl $8, %%ebx \n\t"
3357 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3358 // byte
3359 "cmpl _MMXLength, %%ebx \n\t"
3360 "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3361 "jb avg_Alp \n\t"
3362
3363 : // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3364
3365 : // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3366
3367 : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3368 );
3369#endif /* 0 - NEVER REACHED */
3370 }
3371 break;
3372
3373 } // end switch (bpp)
3374
3375 __asm__ __volatile__ (
3376 // MMX acceleration complete; now do clean-up
3377 // check if any remaining bytes left to decode
3378#ifdef __PIC__
3379 "pushl %%ebx \n\t" // save index to Global Offset Table
3380#endif
3381 "movl _MMXLength, %%ebx \n\t" // ebx: x == offset bytes after MMX
3382//pre "movl row, %%edi \n\t" // edi: Avg(x)
3383 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3384 "jnb avg_end \n\t"
3385
3386 // do Avg decode for remaining bytes
3387//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
3388 "movl %%edi, %%edx \n\t"
3389//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3390 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3391 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
3392
3393 "avg_lp2: \n\t"
3394 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3395 "xorl %%eax, %%eax \n\t"
3396 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3397 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3398 "addw %%cx, %%ax \n\t"
3399 "incl %%ebx \n\t"
3400 "shrw %%ax \n\t" // divide by 2
3401 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3402 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
3403 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
3404 "jb avg_lp2 \n\t" // affect flags; -1 to offset inc ebx]
3405
3406 "avg_end: \n\t"
3407 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
3408#ifdef __PIC__
3409 "popl %%ebx \n\t" // restore index to Global Offset Table
3410#endif
3411
3412 : "=c" (dummy_value_c), // output regs (dummy)
3413 "=S" (dummy_value_S),
3414 "=D" (dummy_value_D)
3415
3416 : "0" (bpp), // ecx // input regs
3417 "1" (prev_row), // esi
3418 "2" (row) // edi
3419
3420 : "%eax", "%edx" // clobber list
3421#ifndef __PIC__
3422 , "%ebx"
3423#endif
3424 );
3425
3426} /* end png_read_filter_row_mmx_avg() */
3427#endif
3428
3429
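The cleanup loop above applies the scalar form of the Avg defilter, Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2), and the MMX loops compute the same result eight bytes at a time by halving each operand and adding back the carry lost from the low bits. A standalone C sketch of that byte-wise computation (the helper name is illustrative, not part of libpng's API):

```c
#include <stddef.h>

/* Scalar model of the Avg defilter accelerated above.
 * Hypothetical helper, not a libpng entry point. */
void png_avg_defilter_sketch(unsigned char *row,
                             const unsigned char *prev_row,
                             size_t rowbytes, int bpp)
{
    size_t x;

    /* For x < bpp there is no Raw(x-bpp), so it is taken as 0. */
    for (x = 0; x < (size_t)bpp && x < rowbytes; x++)
        row[x] = (unsigned char)(row[x] + (prev_row[x] >> 1));

    for (; x < rowbytes; x++) {
        unsigned char a = row[x - bpp];   /* Raw(x-bpp) */
        unsigned char b = prev_row[x];    /* Prior(x)   */
        /* (a + b) / 2 without a 9-bit intermediate: halve each byte,
         * then add the "LBCarry" (1 only when both low bits were 1),
         * mirroring the psrlq/pand/paddb sequence in the MMX loops. */
        row[x] = (unsigned char)(row[x] +
                 ((a >> 1) + (b >> 1) + (a & b & 1)));
    }
}
```

The two-pass structure matches the assembly: a short pass for the first bpp bytes, then the steady-state loop over the rest of the row.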
3430
3431#ifdef PNG_THREAD_UNSAFE_OK
3432//===========================================================================//
3433// //
3434// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H //
3435// //
3436//===========================================================================//
3437
3438// Optimized code for PNG Paeth filter decoder
3439
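Both the byte-at-a-time alignment loop (paeth_lp1 below) and the MMX blocks select among a = Raw(x-bpp), b = Prior(x), and c = Prior(x-bpp) by comparing the distances pa, pb, and pc, with ties preferring a, then b, as the PNG spec requires. A scalar sketch of that predictor (illustrative name, not a libpng API):

```c
/* Scalar Paeth predictor matching the pa/pb/pc selection used below.
 * Hypothetical helper, not part of libpng's public interface. */
int paeth_predictor_sketch(int a, int b, int c)
{
    int p  = a + b - c;                 /* initial estimate */
    int pa = p > a ? p - a : a - p;     /* pa = |p - a| = |b - c| */
    int pb = p > b ? p - b : b - p;     /* pb = |p - b| = |a - c| */
    int pc = p > c ? p - c : c - p;     /* pc = |p - c|           */

    if (pa <= pb && pa <= pc)
        return a;                       /* use Raw(x-bpp)   */
    if (pb <= pc)
        return b;                       /* use Prior(x)     */
    return c;                           /* use Prior(x-bpp) */
}
```

The decoder then adds this predicted value to the filtered byte, mod 256, exactly as the paeth_paeth label does with an addb.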
3440static void /* PRIVATE */
3441png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3442 png_bytep prev_row)
3443{
3444 int bpp;
3445 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
3446 int dummy_value_S;
3447 int dummy_value_D;
3448
3449 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3450 _FullLength = row_info->rowbytes; // # of bytes to filter
3451
3452 __asm__ __volatile__ (
3453#ifdef __PIC__
3454 "pushl %%ebx \n\t" // save index to Global Offset Table
3455#endif
3456 "xorl %%ebx, %%ebx \n\t" // ebx: x offset
3457//pre "movl row, %%edi \n\t"
3458 "xorl %%edx, %%edx \n\t" // edx: x-bpp offset
3459//pre "movl prev_row, %%esi \n\t"
3460 "xorl %%eax, %%eax \n\t"
3461
3462 // Compute the Raw value for the first bpp bytes
3463 // Note: the formula works out to be always
3464 // Paeth(x) = Raw(x) + Prior(x) where x < bpp
3465 "paeth_rlp: \n\t"
3466 "movb (%%edi,%%ebx,), %%al \n\t"
3467 "addb (%%esi,%%ebx,), %%al \n\t"
3468 "incl %%ebx \n\t"
3469//pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx)
3470 "cmpl %%ecx, %%ebx \n\t"
3471 "movb %%al, -1(%%edi,%%ebx,) \n\t"
3472 "jb paeth_rlp \n\t"
3473 // get # of bytes to alignment
3474 "movl %%edi, _dif \n\t" // take start of row
3475 "addl %%ebx, _dif \n\t" // add bpp
3476 "xorl %%ecx, %%ecx \n\t"
3477 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment
3478 // boundary
3479 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
3480 "subl %%edi, _dif \n\t" // subtract from start ==> value ebx
3481 // at alignment
3482 "jz paeth_go \n\t"
3483 // fix alignment
3484
3485 "paeth_lp1: \n\t"
3486 "xorl %%eax, %%eax \n\t"
3487 // pav = p - a = (a + b - c) - a = b - c
3488 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
3489 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3490 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3491 "movl %%eax, _patemp \n\t" // Save pav for later use
3492 "xorl %%eax, %%eax \n\t"
3493 // pbv = p - b = (a + b - c) - b = a - c
3494 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
3495 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3496 "movl %%eax, %%ecx \n\t"
3497 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3498 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
3499 // pc = abs(pcv)
3500 "testl $0x80000000, %%eax \n\t"
3501 "jz paeth_pca \n\t"
3502 "negl %%eax \n\t" // reverse sign of neg values
3503
3504 "paeth_pca: \n\t"
3505 "movl %%eax, _pctemp \n\t" // save pc for later use
3506 // pb = abs(pbv)
3507 "testl $0x80000000, %%ecx \n\t"
3508 "jz paeth_pba \n\t"
3509 "negl %%ecx \n\t" // reverse sign of neg values
3510
3511 "paeth_pba: \n\t"
3512 "movl %%ecx, _pbtemp \n\t" // save pb for later use
3513 // pa = abs(pav)
3514 "movl _patemp, %%eax \n\t"
3515 "testl $0x80000000, %%eax \n\t"
3516 "jz paeth_paa \n\t"
3517 "negl %%eax \n\t" // reverse sign of neg values
3518
3519 "paeth_paa: \n\t"
3520 "movl %%eax, _patemp \n\t" // save pa for later use
3521 // test if pa <= pb
3522 "cmpl %%ecx, %%eax \n\t"
3523 "jna paeth_abb \n\t"
3524 // pa > pb; now test if pb <= pc
3525 "cmpl _pctemp, %%ecx \n\t"
3526 "jna paeth_bbc \n\t"
3527 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3528 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3529 "jmp paeth_paeth \n\t"
3530
3531 "paeth_bbc: \n\t"
3532 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3533 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
3534 "jmp paeth_paeth \n\t"
3535
3536 "paeth_abb: \n\t"
3537 // pa <= pb; now test if pa <= pc
3538 "cmpl _pctemp, %%eax \n\t"
3539 "jna paeth_abc \n\t"
3540 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3541 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3542 "jmp paeth_paeth \n\t"
3543
3544 "paeth_abc: \n\t"
3545 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3546 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
3547
3548 "paeth_paeth: \n\t"
3549 "incl %%ebx \n\t"
3550 "incl %%edx \n\t"
3551 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3552 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3553 "cmpl _dif, %%ebx \n\t"
3554 "jb paeth_lp1 \n\t"
3555
3556 "paeth_go: \n\t"
3557 "movl _FullLength, %%ecx \n\t"
3558 "movl %%ecx, %%eax \n\t"
3559 "subl %%ebx, %%eax \n\t" // subtract alignment fix
3560 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
3561 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
3562 "movl %%ecx, _MMXLength \n\t"
3563#ifdef __PIC__
3564 "popl %%ebx \n\t" // restore index to Global Offset Table
3565#endif
3566
3567 : "=c" (dummy_value_c), // output regs (dummy)
3568 "=S" (dummy_value_S),
3569 "=D" (dummy_value_D)
3570
3571 : "0" (bpp), // ecx // input regs
3572 "1" (prev_row), // esi
3573 "2" (row) // edi
3574
3575 : "%eax", "%edx" // clobber list
3576#ifndef __PIC__
3577 , "%ebx"
3578#endif
3579 );
3580
3581 // now do the math for the rest of the row
3582 switch (bpp)
3583 {
3584 case 3:
3585 {
3586 _ActiveMask.use = 0x0000000000ffffffLL;
3587 _ActiveMaskEnd.use = 0xffff000000000000LL;
3588 _ShiftBpp.use = 24; // == bpp(3) * 8
3589 _ShiftRem.use = 40; // == 64 - 24
3590
3591 __asm__ __volatile__ (
3592 "movl _dif, %%ecx \n\t"
3593// preload "movl row, %%edi \n\t"
3594// preload "movl prev_row, %%esi \n\t"
3595 "pxor %%mm0, %%mm0 \n\t"
3596 // prime the pump: load the first Raw(x-bpp) data set
3597 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3598 "paeth_3lp: \n\t"
3599 "psrlq _ShiftRem, %%mm1 \n\t" // shift last 3 bytes to 1st
3600 // 3 bytes
3601 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3602 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3603 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
3604 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3605 "psrlq _ShiftRem, %%mm3 \n\t" // shift last 3 bytes to 1st
3606 // 3 bytes
3607 // pav = p - a = (a + b - c) - a = b - c
3608 "movq %%mm2, %%mm4 \n\t"
3609 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3610 // pbv = p - b = (a + b - c) - b = a - c
3611 "movq %%mm1, %%mm5 \n\t"
3612 "psubw %%mm3, %%mm4 \n\t"
3613 "pxor %%mm7, %%mm7 \n\t"
3614 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3615 "movq %%mm4, %%mm6 \n\t"
3616 "psubw %%mm3, %%mm5 \n\t"
3617
3618 // pa = abs(p-a) = abs(pav)
3619 // pb = abs(p-b) = abs(pbv)
3620 // pc = abs(p-c) = abs(pcv)
3621 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3622 "paddw %%mm5, %%mm6 \n\t"
3623 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3624 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3625 "psubw %%mm0, %%mm4 \n\t"
3626 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3627 "psubw %%mm0, %%mm4 \n\t"
3628 "psubw %%mm7, %%mm5 \n\t"
3629 "pxor %%mm0, %%mm0 \n\t"
3630 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3631 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3632 "psubw %%mm7, %%mm5 \n\t"
3633 "psubw %%mm0, %%mm6 \n\t"
3634 // test pa <= pb
3635 "movq %%mm4, %%mm7 \n\t"
3636 "psubw %%mm0, %%mm6 \n\t"
3637 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3638 "movq %%mm7, %%mm0 \n\t"
3639 // use mm7 mask to merge pa & pb
3640 "pand %%mm7, %%mm5 \n\t"
3641 // use mm0 mask copy to merge a & b
3642 "pand %%mm0, %%mm2 \n\t"
3643 "pandn %%mm4, %%mm7 \n\t"
3644 "pandn %%mm1, %%mm0 \n\t"
3645 "paddw %%mm5, %%mm7 \n\t"
3646 "paddw %%mm2, %%mm0 \n\t"
3647 // test ((pa <= pb)? pa:pb) <= pc
3648 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3649 "pxor %%mm1, %%mm1 \n\t"
3650 "pand %%mm7, %%mm3 \n\t"
3651 "pandn %%mm0, %%mm7 \n\t"
3652 "paddw %%mm3, %%mm7 \n\t"
3653 "pxor %%mm0, %%mm0 \n\t"
3654 "packuswb %%mm1, %%mm7 \n\t"
3655 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3656 "pand _ActiveMask, %%mm7 \n\t"
3657 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
3658 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3659 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3660 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3661 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as
3662 // Raw(x-bpp)
3663 // now do Paeth for 2nd set of bytes (3-5)
3664 "psrlq _ShiftBpp, %%mm2 \n\t" // load b=Prior(x) step 2
3665 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3666 "pxor %%mm7, %%mm7 \n\t"
3667 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3668 // pbv = p - b = (a + b - c) - b = a - c
3669 "movq %%mm1, %%mm5 \n\t"
3670 // pav = p - a = (a + b - c) - a = b - c
3671 "movq %%mm2, %%mm4 \n\t"
3672 "psubw %%mm3, %%mm5 \n\t"
3673 "psubw %%mm3, %%mm4 \n\t"
3674 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3675 // pav + pbv = pbv + pav
3676 "movq %%mm5, %%mm6 \n\t"
3677 "paddw %%mm4, %%mm6 \n\t"
3678
3679 // pa = abs(p-a) = abs(pav)
3680 // pb = abs(p-b) = abs(pbv)
3681 // pc = abs(p-c) = abs(pcv)
3682 "pcmpgtw %%mm5, %%mm0 \n\t" // create mask pbv bytes < 0
3683 "pcmpgtw %%mm4, %%mm7 \n\t" // create mask pav bytes < 0
3684 "pand %%mm5, %%mm0 \n\t" // only pbv bytes < 0 in mm0
3685 "pand %%mm4, %%mm7 \n\t" // only pav bytes < 0 in mm7
3686 "psubw %%mm0, %%mm5 \n\t"
3687 "psubw %%mm7, %%mm4 \n\t"
3688 "psubw %%mm0, %%mm5 \n\t"
3689 "psubw %%mm7, %%mm4 \n\t"
3690 "pxor %%mm0, %%mm0 \n\t"
3691 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3692 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3693 "psubw %%mm0, %%mm6 \n\t"
3694 // test pa <= pb
3695 "movq %%mm4, %%mm7 \n\t"
3696 "psubw %%mm0, %%mm6 \n\t"
3697 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3698 "movq %%mm7, %%mm0 \n\t"
3699 // use mm7 mask to merge pa & pb
3700 "pand %%mm7, %%mm5 \n\t"
3701 // use mm0 mask copy to merge a & b
3702 "pand %%mm0, %%mm2 \n\t"
3703 "pandn %%mm4, %%mm7 \n\t"
3704 "pandn %%mm1, %%mm0 \n\t"
3705 "paddw %%mm5, %%mm7 \n\t"
3706 "paddw %%mm2, %%mm0 \n\t"
3707 // test ((pa <= pb)? pa:pb) <= pc
3708 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3709 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3710 "pand %%mm7, %%mm3 \n\t"
3711 "pandn %%mm0, %%mm7 \n\t"
3712 "pxor %%mm1, %%mm1 \n\t"
3713 "paddw %%mm3, %%mm7 \n\t"
3714 "pxor %%mm0, %%mm0 \n\t"
3715 "packuswb %%mm1, %%mm7 \n\t"
3716 "movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
3717 "pand _ActiveMask, %%mm7 \n\t"
3718 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3719 "psllq _ShiftBpp, %%mm7 \n\t" // shift bytes to 2nd group of
3720 // 3 bytes
3721 // pav = p - a = (a + b - c) - a = b - c
3722 "movq %%mm2, %%mm4 \n\t"
3723 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3724 "psllq _ShiftBpp, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
3725 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3726 "movq %%mm7, %%mm1 \n\t"
3727 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3728 "psllq _ShiftBpp, %%mm1 \n\t" // shift bytes
3729 // now mm1 will be used as Raw(x-bpp)
3730 // now do Paeth for 3rd, and final, set of bytes (6-7)
3731 "pxor %%mm7, %%mm7 \n\t"
3732 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3733 "psubw %%mm3, %%mm4 \n\t"
3734 // pbv = p - b = (a + b - c) - b = a - c
3735 "movq %%mm1, %%mm5 \n\t"
3736 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3737 "movq %%mm4, %%mm6 \n\t"
3738 "psubw %%mm3, %%mm5 \n\t"
3739 "pxor %%mm0, %%mm0 \n\t"
3740 "paddw %%mm5, %%mm6 \n\t"
3741
3742 // pa = abs(p-a) = abs(pav)
3743 // pb = abs(p-b) = abs(pbv)
3744 // pc = abs(p-c) = abs(pcv)
3745 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3746 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3747 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3748 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3749 "psubw %%mm0, %%mm4 \n\t"
3750 "psubw %%mm7, %%mm5 \n\t"
3751 "psubw %%mm0, %%mm4 \n\t"
3752 "psubw %%mm7, %%mm5 \n\t"
3753 "pxor %%mm0, %%mm0 \n\t"
3754 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3755 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3756 "psubw %%mm0, %%mm6 \n\t"
3757 // test pa <= pb
3758 "movq %%mm4, %%mm7 \n\t"
3759 "psubw %%mm0, %%mm6 \n\t"
3760 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3761 "movq %%mm7, %%mm0 \n\t"
3762 // use mm0 mask copy to merge a & b
3763 "pand %%mm0, %%mm2 \n\t"
3764 // use mm7 mask to merge pa & pb
3765 "pand %%mm7, %%mm5 \n\t"
3766 "pandn %%mm1, %%mm0 \n\t"
3767 "pandn %%mm4, %%mm7 \n\t"
3768 "paddw %%mm2, %%mm0 \n\t"
3769 "paddw %%mm5, %%mm7 \n\t"
3770 // test ((pa <= pb)? pa:pb) <= pc
3771 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3772 "pand %%mm7, %%mm3 \n\t"
3773 "pandn %%mm0, %%mm7 \n\t"
3774 "paddw %%mm3, %%mm7 \n\t"
3775 "pxor %%mm1, %%mm1 \n\t"
3776 "packuswb %%mm7, %%mm1 \n\t"
3777 // step ecx to next set of 8 bytes and repeat loop til done
3778 "addl $8, %%ecx \n\t"
3779 "pand _ActiveMaskEnd, %%mm1 \n\t"
3780 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
3781 // Raw(x)
3782
3783 "cmpl _MMXLength, %%ecx \n\t"
3784 "pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
3785 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3786 // mm1 will be used as Raw(x-bpp) next loop
3787 // mm3 ready to be used as Prior(x-bpp) next loop
3788 "jb paeth_3lp \n\t"
3789
3790 : "=S" (dummy_value_S), // output regs (dummy)
3791 "=D" (dummy_value_D)
3792
3793 : "0" (prev_row), // esi // input regs
3794 "1" (row) // edi
3795
3796 : "%ecx" // clobber list
3797#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3798 , "%mm0", "%mm1", "%mm2", "%mm3"
3799 , "%mm4", "%mm5", "%mm6", "%mm7"
3800#endif
3801 );
3802 }
3803 break; // end 3 bpp
3804
3805 case 6:
3806 //case 7: // GRR BOGUS
3807 //case 5: // GRR BOGUS
3808 {
3809 _ActiveMask.use = 0x00000000ffffffffLL;
3810 _ActiveMask2.use = 0xffffffff00000000LL;
3811 _ShiftBpp.use = bpp << 3; // == bpp * 8
3812 _ShiftRem.use = 64 - _ShiftBpp.use;
3813
3814 __asm__ __volatile__ (
3815 "movl _dif, %%ecx \n\t"
3816// preload "movl row, %%edi \n\t"
3817// preload "movl prev_row, %%esi \n\t"
3818 // prime the pump: load the first Raw(x-bpp) data set
3819 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3820 "pxor %%mm0, %%mm0 \n\t"
3821
3822 "paeth_6lp: \n\t"
3823 // must shift to position Raw(x-bpp) data
3824 "psrlq _ShiftRem, %%mm1 \n\t"
3825 // do first set of 4 bytes
3826 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3827 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3828 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3829 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3830 // must shift to position Prior(x-bpp) data
3831 "psrlq _ShiftRem, %%mm3 \n\t"
3832 // pav = p - a = (a + b - c) - a = b - c
3833 "movq %%mm2, %%mm4 \n\t"
3834 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3835 // pbv = p - b = (a + b - c) - b = a - c
3836 "movq %%mm1, %%mm5 \n\t"
3837 "psubw %%mm3, %%mm4 \n\t"
3838 "pxor %%mm7, %%mm7 \n\t"
3839 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3840 "movq %%mm4, %%mm6 \n\t"
3841 "psubw %%mm3, %%mm5 \n\t"
3842 // pa = abs(p-a) = abs(pav)
3843 // pb = abs(p-b) = abs(pbv)
3844 // pc = abs(p-c) = abs(pcv)
3845 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3846 "paddw %%mm5, %%mm6 \n\t"
3847 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3848 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3849 "psubw %%mm0, %%mm4 \n\t"
3850 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3851 "psubw %%mm0, %%mm4 \n\t"
3852 "psubw %%mm7, %%mm5 \n\t"
3853 "pxor %%mm0, %%mm0 \n\t"
3854 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3855 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3856 "psubw %%mm7, %%mm5 \n\t"
3857 "psubw %%mm0, %%mm6 \n\t"
3858 // test pa <= pb
3859 "movq %%mm4, %%mm7 \n\t"
3860 "psubw %%mm0, %%mm6 \n\t"
3861 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3862 "movq %%mm7, %%mm0 \n\t"
3863 // use mm7 mask to merge pa & pb
3864 "pand %%mm7, %%mm5 \n\t"
3865 // use mm0 mask copy to merge a & b
3866 "pand %%mm0, %%mm2 \n\t"
3867 "pandn %%mm4, %%mm7 \n\t"
3868 "pandn %%mm1, %%mm0 \n\t"
3869 "paddw %%mm5, %%mm7 \n\t"
3870 "paddw %%mm2, %%mm0 \n\t"
3871 // test ((pa <= pb)? pa:pb) <= pc
3872 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3873 "pxor %%mm1, %%mm1 \n\t"
3874 "pand %%mm7, %%mm3 \n\t"
3875 "pandn %%mm0, %%mm7 \n\t"
3876 "paddw %%mm3, %%mm7 \n\t"
3877 "pxor %%mm0, %%mm0 \n\t"
3878 "packuswb %%mm1, %%mm7 \n\t"
3879 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3880 "pand _ActiveMask, %%mm7 \n\t"
3881 "psrlq _ShiftRem, %%mm3 \n\t"
3882 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x) step 1
3883 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
3884 "movq %%mm2, %%mm6 \n\t"
3885 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3886 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3887 "psllq _ShiftBpp, %%mm6 \n\t"
3888 "movq %%mm7, %%mm5 \n\t"
3889 "psrlq _ShiftRem, %%mm1 \n\t"
3890 "por %%mm6, %%mm3 \n\t"
3891 "psllq _ShiftBpp, %%mm5 \n\t"
3892 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3893 "por %%mm5, %%mm1 \n\t"
3894 // do second set of 4 bytes
3895 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3896 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3897 // pav = p - a = (a + b - c) - a = b - c
3898 "movq %%mm2, %%mm4 \n\t"
3899 // pbv = p - b = (a + b - c) - b = a - c
3900 "movq %%mm1, %%mm5 \n\t"
3901 "psubw %%mm3, %%mm4 \n\t"
3902 "pxor %%mm7, %%mm7 \n\t"
3903 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3904 "movq %%mm4, %%mm6 \n\t"
3905 "psubw %%mm3, %%mm5 \n\t"
3906 // pa = abs(p-a) = abs(pav)
3907 // pb = abs(p-b) = abs(pbv)
3908 // pc = abs(p-c) = abs(pcv)
3909 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3910 "paddw %%mm5, %%mm6 \n\t"
3911 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3912 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3913 "psubw %%mm0, %%mm4 \n\t"
3914 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3915 "psubw %%mm0, %%mm4 \n\t"
3916 "psubw %%mm7, %%mm5 \n\t"
3917 "pxor %%mm0, %%mm0 \n\t"
3918 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3919 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3920 "psubw %%mm7, %%mm5 \n\t"
3921 "psubw %%mm0, %%mm6 \n\t"
3922 // test pa <= pb
3923 "movq %%mm4, %%mm7 \n\t"
3924 "psubw %%mm0, %%mm6 \n\t"
3925 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3926 "movq %%mm7, %%mm0 \n\t"
3927 // use mm7 mask to merge pa & pb
3928 "pand %%mm7, %%mm5 \n\t"
3929 // use mm0 mask copy to merge a & b
3930 "pand %%mm0, %%mm2 \n\t"
3931 "pandn %%mm4, %%mm7 \n\t"
3932 "pandn %%mm1, %%mm0 \n\t"
3933 "paddw %%mm5, %%mm7 \n\t"
3934 "paddw %%mm2, %%mm0 \n\t"
3935 // test ((pa <= pb)? pa:pb) <= pc
3936 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3937 "pxor %%mm1, %%mm1 \n\t"
3938 "pand %%mm7, %%mm3 \n\t"
3939 "pandn %%mm0, %%mm7 \n\t"
3940 "pxor %%mm1, %%mm1 \n\t"
3941 "paddw %%mm3, %%mm7 \n\t"
3942 "pxor %%mm0, %%mm0 \n\t"
3943 // step ecx to next set of 8 bytes and repeat loop til done
3944 "addl $8, %%ecx \n\t"
3945 "packuswb %%mm7, %%mm1 \n\t"
3946 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3947 "cmpl _MMXLength, %%ecx \n\t"
3948 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3949 // mm1 will be used as Raw(x-bpp) next loop
3950 "jb paeth_6lp \n\t"
3951
3952 : "=S" (dummy_value_S), // output regs (dummy)
3953 "=D" (dummy_value_D)
3954
3955 : "0" (prev_row), // esi // input regs
3956 "1" (row) // edi
3957
3958 : "%ecx" // clobber list
3959#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3960 , "%mm0", "%mm1", "%mm2", "%mm3"
3961 , "%mm4", "%mm5", "%mm6", "%mm7"
3962#endif
3963 );
3964 }
3965 break; // end 6 bpp
3966
3967 case 4:
3968 {
3969 _ActiveMask.use = 0x00000000ffffffffLL;
3970
3971 __asm__ __volatile__ (
3972 "movl _dif, %%ecx \n\t"
3973// preload "movl row, %%edi \n\t"
3974// preload "movl prev_row, %%esi \n\t"
3975 "pxor %%mm0, %%mm0 \n\t"
3976 // prime the pump: load the first Raw(x-bpp) data set
3977 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3978 // a=Raw(x-bpp) bytes
3979 "paeth_4lp: \n\t"
3980 // do first set of 4 bytes
3981 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3982 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3983 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3984 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3985 // pav = p - a = (a + b - c) - a = b - c
3986 "movq %%mm2, %%mm4 \n\t"
3987 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3988 // pbv = p - b = (a + b - c) - b = a - c
3989 "movq %%mm1, %%mm5 \n\t"
3990 "psubw %%mm3, %%mm4 \n\t"
3991 "pxor %%mm7, %%mm7 \n\t"
3992 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3993 "movq %%mm4, %%mm6 \n\t"
3994 "psubw %%mm3, %%mm5 \n\t"
3995 // pa = abs(p-a) = abs(pav)
3996 // pb = abs(p-b) = abs(pbv)
3997 // pc = abs(p-c) = abs(pcv)
3998 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3999 "paddw %%mm5, %%mm6 \n\t"
4000 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4001 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4002 "psubw %%mm0, %%mm4 \n\t"
4003 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4004 "psubw %%mm0, %%mm4 \n\t"
4005 "psubw %%mm7, %%mm5 \n\t"
4006 "pxor %%mm0, %%mm0 \n\t"
4007 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4008 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4009 "psubw %%mm7, %%mm5 \n\t"
4010 "psubw %%mm0, %%mm6 \n\t"
4011 // test pa <= pb
4012 "movq %%mm4, %%mm7 \n\t"
4013 "psubw %%mm0, %%mm6 \n\t"
4014 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4015 "movq %%mm7, %%mm0 \n\t"
4016 // use mm7 mask to merge pa & pb
4017 "pand %%mm7, %%mm5 \n\t"
4018 // use mm0 mask copy to merge a & b
4019 "pand %%mm0, %%mm2 \n\t"
4020 "pandn %%mm4, %%mm7 \n\t"
4021 "pandn %%mm1, %%mm0 \n\t"
4022 "paddw %%mm5, %%mm7 \n\t"
4023 "paddw %%mm2, %%mm0 \n\t"
4024 // test ((pa <= pb)? pa:pb) <= pc
4025 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4026 "pxor %%mm1, %%mm1 \n\t"
4027 "pand %%mm7, %%mm3 \n\t"
4028 "pandn %%mm0, %%mm7 \n\t"
4029 "paddw %%mm3, %%mm7 \n\t"
4030 "pxor %%mm0, %%mm0 \n\t"
4031 "packuswb %%mm1, %%mm7 \n\t"
4032 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
4033 "pand _ActiveMask, %%mm7 \n\t"
4034 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
4035 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4036 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
4037 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4038 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
4039 // do second set of 4 bytes
4040 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4041 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4042 // pav = p - a = (a + b - c) - a = b - c
4043 "movq %%mm2, %%mm4 \n\t"
4044 // pbv = p - b = (a + b - c) - b = a - c
4045 "movq %%mm1, %%mm5 \n\t"
4046 "psubw %%mm3, %%mm4 \n\t"
4047 "pxor %%mm7, %%mm7 \n\t"
4048 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4049 "movq %%mm4, %%mm6 \n\t"
4050 "psubw %%mm3, %%mm5 \n\t"
4051 // pa = abs(p-a) = abs(pav)
4052 // pb = abs(p-b) = abs(pbv)
4053 // pc = abs(p-c) = abs(pcv)
4054 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4055 "paddw %%mm5, %%mm6 \n\t"
4056 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4057 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4058 "psubw %%mm0, %%mm4 \n\t"
4059 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4060 "psubw %%mm0, %%mm4 \n\t"
4061 "psubw %%mm7, %%mm5 \n\t"
4062 "pxor %%mm0, %%mm0 \n\t"
4063 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4064 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4065 "psubw %%mm7, %%mm5 \n\t"
4066 "psubw %%mm0, %%mm6 \n\t"
4067 // test pa <= pb
4068 "movq %%mm4, %%mm7 \n\t"
4069 "psubw %%mm0, %%mm6 \n\t"
4070 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4071 "movq %%mm7, %%mm0 \n\t"
4072 // use mm7 mask to merge pa & pb
4073 "pand %%mm7, %%mm5 \n\t"
4074 // use mm0 mask copy to merge a & b
4075 "pand %%mm0, %%mm2 \n\t"
4076 "pandn %%mm4, %%mm7 \n\t"
4077 "pandn %%mm1, %%mm0 \n\t"
4078 "paddw %%mm5, %%mm7 \n\t"
4079 "paddw %%mm2, %%mm0 \n\t"
4080 // test ((pa <= pb)? pa:pb) <= pc
4081 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4082 "pxor %%mm1, %%mm1 \n\t"
4083 "pand %%mm7, %%mm3 \n\t"
4084 "pandn %%mm0, %%mm7 \n\t"
4085 "pxor %%mm1, %%mm1 \n\t"
4086 "paddw %%mm3, %%mm7 \n\t"
4087 "pxor %%mm0, %%mm0 \n\t"
4088 // step ecx to next set of 8 bytes and repeat loop til done
4089 "addl $8, %%ecx \n\t"
4090 "packuswb %%mm7, %%mm1 \n\t"
4091 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
4092 "cmpl _MMXLength, %%ecx \n\t"
4093 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4094 // mm1 will be used as Raw(x-bpp) next loop
4095 "jb paeth_4lp \n\t"
4096
4097 : "=S" (dummy_value_S), // output regs (dummy)
4098 "=D" (dummy_value_D)
4099
4100 : "0" (prev_row), // esi // input regs
4101 "1" (row) // edi
4102
4103 : "%ecx" // clobber list
4104#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4105 , "%mm0", "%mm1", "%mm2", "%mm3"
4106 , "%mm4", "%mm5", "%mm6", "%mm7"
4107#endif
4108 );
4109 }
4110 break; // end 4 bpp
4111
4112 case 8: // bpp == 8
4113 {
4114 _ActiveMask.use = 0x00000000ffffffffLL;
4115
4116 __asm__ __volatile__ (
4117 "movl _dif, %%ecx \n\t"
4118// preload "movl row, %%edi \n\t"
4119// preload "movl prev_row, %%esi \n\t"
4120 "pxor %%mm0, %%mm0 \n\t"
4121 // prime the pump: load the first Raw(x-bpp) data set
4122 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // the only time we need to read
4123 // a=Raw(x-bpp) bytes
4124 "paeth_8lp: \n\t"
4125 // do first set of 4 bytes
4126 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4127 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4128 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4129 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4130 // pav = p - a = (a + b - c) - a = b - c
4131 "movq %%mm2, %%mm4 \n\t"
4132 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
4133 // pbv = p - b = (a + b - c) - b = a - c
4134 "movq %%mm1, %%mm5 \n\t"
4135 "psubw %%mm3, %%mm4 \n\t"
4136 "pxor %%mm7, %%mm7 \n\t"
4137 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4138 "movq %%mm4, %%mm6 \n\t"
4139 "psubw %%mm3, %%mm5 \n\t"
4140 // pa = abs(p-a) = abs(pav)
4141 // pb = abs(p-b) = abs(pbv)
4142 // pc = abs(p-c) = abs(pcv)
4143 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4144 "paddw %%mm5, %%mm6 \n\t"
4145 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4146 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4147 "psubw %%mm0, %%mm4 \n\t"
4148 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4149 "psubw %%mm0, %%mm4 \n\t"
4150 "psubw %%mm7, %%mm5 \n\t"
4151 "pxor %%mm0, %%mm0 \n\t"
4152 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4153 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4154 "psubw %%mm7, %%mm5 \n\t"
4155 "psubw %%mm0, %%mm6 \n\t"
4156 // test pa <= pb
4157 "movq %%mm4, %%mm7 \n\t"
4158 "psubw %%mm0, %%mm6 \n\t"
4159 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4160 "movq %%mm7, %%mm0 \n\t"
4161 // use mm7 mask to merge pa & pb
4162 "pand %%mm7, %%mm5 \n\t"
4163 // use mm0 mask copy to merge a & b
4164 "pand %%mm0, %%mm2 \n\t"
4165 "pandn %%mm4, %%mm7 \n\t"
4166 "pandn %%mm1, %%mm0 \n\t"
4167 "paddw %%mm5, %%mm7 \n\t"
4168 "paddw %%mm2, %%mm0 \n\t"
4169 // test ((pa <= pb)? pa:pb) <= pc
4170 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4171 "pxor %%mm1, %%mm1 \n\t"
4172 "pand %%mm7, %%mm3 \n\t"
4173 "pandn %%mm0, %%mm7 \n\t"
4174 "paddw %%mm3, %%mm7 \n\t"
4175 "pxor %%mm0, %%mm0 \n\t"
4176 "packuswb %%mm1, %%mm7 \n\t"
4177 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4178 "pand _ActiveMask, %%mm7 \n\t"
4179 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4180 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4181 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4182 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4183 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
4184
4185 // do second set of 4 bytes
4186 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4187 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
4188 // pav = p - a = (a + b - c) - a = b - c
4189 "movq %%mm2, %%mm4 \n\t"
4190 // pbv = p - b = (a + b - c) - b = a - c
4191 "movq %%mm1, %%mm5 \n\t"
4192 "psubw %%mm3, %%mm4 \n\t"
4193 "pxor %%mm7, %%mm7 \n\t"
4194 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4195 "movq %%mm4, %%mm6 \n\t"
4196 "psubw %%mm3, %%mm5 \n\t"
4197 // pa = abs(p-a) = abs(pav)
4198 // pb = abs(p-b) = abs(pbv)
4199 // pc = abs(p-c) = abs(pcv)
4200 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4201 "paddw %%mm5, %%mm6 \n\t"
4202 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4203 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4204 "psubw %%mm0, %%mm4 \n\t"
4205 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4206 "psubw %%mm0, %%mm4 \n\t"
4207 "psubw %%mm7, %%mm5 \n\t"
4208 "pxor %%mm0, %%mm0 \n\t"
4209 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4210 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4211 "psubw %%mm7, %%mm5 \n\t"
4212 "psubw %%mm0, %%mm6 \n\t"
4213 // test pa <= pb
4214 "movq %%mm4, %%mm7 \n\t"
4215 "psubw %%mm0, %%mm6 \n\t"
4216 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4217 "movq %%mm7, %%mm0 \n\t"
4218 // use mm7 mask to merge pa & pb
4219 "pand %%mm7, %%mm5 \n\t"
4220 // use mm0 mask copy to merge a & b
4221 "pand %%mm0, %%mm2 \n\t"
4222 "pandn %%mm4, %%mm7 \n\t"
4223 "pandn %%mm1, %%mm0 \n\t"
4224 "paddw %%mm5, %%mm7 \n\t"
4225 "paddw %%mm2, %%mm0 \n\t"
4226 // test ((pa <= pb)? pa:pb) <= pc
4227 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4228 "pxor %%mm1, %%mm1 \n\t"
4229 "pand %%mm7, %%mm3 \n\t"
4230 "pandn %%mm0, %%mm7 \n\t"
4231 "pxor %%mm1, %%mm1 \n\t"
4232 "paddw %%mm3, %%mm7 \n\t"
4233 "pxor %%mm0, %%mm0 \n\t"
4234 // step ecx to next set of 8 bytes and repeat loop til done
4235 "addl $8, %%ecx \n\t"
4236 "packuswb %%mm7, %%mm1 \n\t"
4237 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
4238 "cmpl _MMXLength, %%ecx \n\t"
4239 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4240 // mm1 will be used as Raw(x-bpp) next loop
4241 "jb paeth_8lp \n\t"
4242
4243 : "=S" (dummy_value_S), // output regs (dummy)
4244 "=D" (dummy_value_D)
4245
4246 : "0" (prev_row), // esi // input regs
4247 "1" (row) // edi
4248
4249 : "%ecx" // clobber list
4250#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4251 , "%mm0", "%mm1", "%mm2", "%mm3"
4252 , "%mm4", "%mm5", "%mm6", "%mm7"
4253#endif
4254 );
4255 }
4256 break; // end 8 bpp
4257
4258 case 1: // bpp = 1
4259 case 2: // bpp = 2
4260 default: // bpp > 8
4261 {
4262 __asm__ __volatile__ (
4263#ifdef __PIC__
4264 "pushl %%ebx \n\t" // save Global Offset Table index
4265#endif
4266 "movl _dif, %%ebx \n\t"
4267 "cmpl _FullLength, %%ebx \n\t"
4268 "jnb paeth_dend \n\t"
4269
4270// preload "movl row, %%edi \n\t"
4271// preload "movl prev_row, %%esi \n\t"
4272 // do Paeth decode for remaining bytes
4273 "movl %%ebx, %%edx \n\t"
4274// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4275 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4276 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
4277
4278 "paeth_dlp: \n\t"
4279 "xorl %%eax, %%eax \n\t"
4280 // pav = p - a = (a + b - c) - a = b - c
4281 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4282 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4283 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4284 "movl %%eax, _patemp \n\t" // Save pav for later use
4285 "xorl %%eax, %%eax \n\t"
4286 // pbv = p - b = (a + b - c) - b = a - c
4287 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4288 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4289 "movl %%eax, %%ecx \n\t"
4290 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4291 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4292 // pc = abs(pcv)
4293 "testl $0x80000000, %%eax \n\t"
4294 "jz paeth_dpca \n\t"
4295 "negl %%eax \n\t" // reverse sign of neg values
4296
4297 "paeth_dpca: \n\t"
4298 "movl %%eax, _pctemp \n\t" // save pc for later use
4299 // pb = abs(pbv)
4300 "testl $0x80000000, %%ecx \n\t"
4301 "jz paeth_dpba \n\t"
4302 "negl %%ecx \n\t" // reverse sign of neg values
4303
4304 "paeth_dpba: \n\t"
4305 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4306 // pa = abs(pav)
4307 "movl _patemp, %%eax \n\t"
4308 "testl $0x80000000, %%eax \n\t"
4309 "jz paeth_dpaa \n\t"
4310 "negl %%eax \n\t" // reverse sign of neg values
4311
4312 "paeth_dpaa: \n\t"
4313 "movl %%eax, _patemp \n\t" // save pa for later use
4314 // test if pa <= pb
4315 "cmpl %%ecx, %%eax \n\t"
4316 "jna paeth_dabb \n\t"
4317 // pa > pb; now test if pb <= pc
4318 "cmpl _pctemp, %%ecx \n\t"
4319 "jna paeth_dbbc \n\t"
4320 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4321 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4322 "jmp paeth_dpaeth \n\t"
4323
4324 "paeth_dbbc: \n\t"
4325 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4326 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4327 "jmp paeth_dpaeth \n\t"
4328
4329 "paeth_dabb: \n\t"
4330 // pa <= pb; now test if pa <= pc
4331 "cmpl _pctemp, %%eax \n\t"
4332 "jna paeth_dabc \n\t"
4333 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4334 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4335 "jmp paeth_dpaeth \n\t"
4336
4337 "paeth_dabc: \n\t"
4338 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4339 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4340
4341 "paeth_dpaeth: \n\t"
4342 "incl %%ebx \n\t"
4343 "incl %%edx \n\t"
4344 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4345 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4346 "cmpl _FullLength, %%ebx \n\t"
4347 "jb paeth_dlp \n\t"
4348
4349 "paeth_dend: \n\t"
4350#ifdef __PIC__
4351 "popl %%ebx \n\t" // index to Global Offset Table
4352#endif
4353
4354 : "=c" (dummy_value_c), // output regs (dummy)
4355 "=S" (dummy_value_S),
4356 "=D" (dummy_value_D)
4357
4358 : "0" (bpp), // ecx // input regs
4359 "1" (prev_row), // esi
4360 "2" (row) // edi
4361
4362 : "%eax", "%edx" // clobber list
4363#ifndef __PIC__
4364 , "%ebx"
4365#endif
4366 );
4367 }
4368 return; // No need to go further with this one
4369
4370 } // end switch (bpp)
4371
4372 __asm__ __volatile__ (
4373 // MMX acceleration complete; now do clean-up
4374 // check if any remaining bytes left to decode
4375#ifdef __PIC__
4376 "pushl %%ebx \n\t" // save index to Global Offset Table
4377#endif
4378 "movl _MMXLength, %%ebx \n\t"
4379 "cmpl _FullLength, %%ebx \n\t"
4380 "jnb paeth_end \n\t"
4381//pre "movl row, %%edi \n\t"
4382//pre "movl prev_row, %%esi \n\t"
4383 // do Paeth decode for remaining bytes
4384 "movl %%ebx, %%edx \n\t"
4385//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4386 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4387 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
4388
4389 "paeth_lp2: \n\t"
4390 "xorl %%eax, %%eax \n\t"
4391 // pav = p - a = (a + b - c) - a = b - c
4392 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4393 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4394 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4395 "movl %%eax, _patemp \n\t" // Save pav for later use
4396 "xorl %%eax, %%eax \n\t"
4397 // pbv = p - b = (a + b - c) - b = a - c
4398 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4399 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4400 "movl %%eax, %%ecx \n\t"
4401 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4402 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4403 // pc = abs(pcv)
4404 "testl $0x80000000, %%eax \n\t"
4405 "jz paeth_pca2 \n\t"
4406 "negl %%eax \n\t" // reverse sign of neg values
4407
4408 "paeth_pca2: \n\t"
4409 "movl %%eax, _pctemp \n\t" // save pc for later use
4410 // pb = abs(pbv)
4411 "testl $0x80000000, %%ecx \n\t"
4412 "jz paeth_pba2 \n\t"
4413 "negl %%ecx \n\t" // reverse sign of neg values
4414
4415 "paeth_pba2: \n\t"
4416 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4417 // pa = abs(pav)
4418 "movl _patemp, %%eax \n\t"
4419 "testl $0x80000000, %%eax \n\t"
4420 "jz paeth_paa2 \n\t"
4421 "negl %%eax \n\t" // reverse sign of neg values
4422
4423 "paeth_paa2: \n\t"
4424 "movl %%eax, _patemp \n\t" // save pa for later use
4425 // test if pa <= pb
4426 "cmpl %%ecx, %%eax \n\t"
4427 "jna paeth_abb2 \n\t"
4428 // pa > pb; now test if pb <= pc
4429 "cmpl _pctemp, %%ecx \n\t"
4430 "jna paeth_bbc2 \n\t"
4431 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4432 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4433 "jmp paeth_paeth2 \n\t"
4434
4435 "paeth_bbc2: \n\t"
4436 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4437 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4438 "jmp paeth_paeth2 \n\t"
4439
4440 "paeth_abb2: \n\t"
4441 // pa <= pb; now test if pa <= pc
4442 "cmpl _pctemp, %%eax \n\t"
4443 "jna paeth_abc2 \n\t"
4444 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4445 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4446 "jmp paeth_paeth2 \n\t"
4447
4448 "paeth_abc2: \n\t"
4449 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4450 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4451
4452 "paeth_paeth2: \n\t"
4453 "incl %%ebx \n\t"
4454 "incl %%edx \n\t"
4455 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4456 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4457 "cmpl _FullLength, %%ebx \n\t"
4458 "jb paeth_lp2 \n\t"
4459
4460 "paeth_end: \n\t"
4461 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
4462#ifdef __PIC__
4463 "popl %%ebx \n\t" // restore index to Global Offset Table
4464#endif
4465
4466 : "=c" (dummy_value_c), // output regs (dummy)
4467 "=S" (dummy_value_S),
4468 "=D" (dummy_value_D)
4469
4470 : "0" (bpp), // ecx // input regs
4471 "1" (prev_row), // esi
4472 "2" (row) // edi
4473
4474 : "%eax", "%edx" // clobber list (no input regs!)
4475#ifndef __PIC__
4476 , "%ebx"
4477#endif
4478 );
4479
4480} /* end png_read_filter_row_mmx_paeth() */
4481#endif
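// The MMX blocks above compute, branch-free and eight bytes at a time, the
// same selection the PNG specification defines for the Paeth predictor.  As a
// cross-check, here is a minimal scalar sketch (the function name
// paeth_predictor is illustrative, not libpng's internal API):

```c
#include <stdlib.h>

/* Scalar Paeth predictor from the PNG spec: pick whichever of a (left),
 * b (above), c (upper-left) is closest to p = a + b - c, breaking ties
 * in the order a, b, c -- exactly the pa/pb/pc comparisons the MMX code
 * performs with pcmpgtw masks. */
static int paeth_predictor(int a, int b, int c)
{
    int p  = a + b - c;   /* initial estimate */
    int pa = abs(p - a);  /* distance to a    */
    int pb = abs(p - b);  /* distance to b    */
    int pc = abs(p - c);  /* distance to c    */

    if (pa <= pb && pa <= pc)
        return a;
    else if (pb <= pc)
        return b;
    else
        return c;
}
```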
4482
4483
4484
4485
4486#ifdef PNG_THREAD_UNSAFE_OK
4487//===========================================================================//
4488// //
4489// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B //
4490// //
4491//===========================================================================//
4492
4493// Optimized code for PNG Sub filter decoder
4494
4495static void /* PRIVATE */
4496png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4497{
4498 int bpp;
4499 int dummy_value_a;
4500 int dummy_value_D;
4501
4502 bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel
4503 _FullLength = row_info->rowbytes - bpp; // number of bytes to filter
4504
4505 __asm__ __volatile__ (
4506//pre "movl row, %%edi \n\t"
4507 "movl %%edi, %%esi \n\t" // lp = row
4508//pre "movl bpp, %%eax \n\t"
4509 "addl %%eax, %%edi \n\t" // rp = row + bpp
4510//irr "xorl %%eax, %%eax \n\t"
4511 // get # of bytes to alignment
4512 "movl %%edi, _dif \n\t" // take start of row
4513 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past
4514 // alignment boundary
4515 "xorl %%ecx, %%ecx \n\t"
4516 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
4517 "subl %%edi, _dif \n\t" // subtract from start ==> value
4518 "jz sub_go \n\t" // ecx at alignment
4519
4520 "sub_lp1: \n\t" // fix alignment
4521 "movb (%%esi,%%ecx,), %%al \n\t"
4522 "addb %%al, (%%edi,%%ecx,) \n\t"
4523 "incl %%ecx \n\t"
4524 "cmpl _dif, %%ecx \n\t"
4525 "jb sub_lp1 \n\t"
4526
4527 "sub_go: \n\t"
4528 "movl _FullLength, %%eax \n\t"
4529 "movl %%eax, %%edx \n\t"
4530 "subl %%ecx, %%edx \n\t" // subtract alignment fix
4531 "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
4532 "subl %%edx, %%eax \n\t" // drop over bytes from length
4533 "movl %%eax, _MMXLength \n\t"
4534
4535 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4536 "=D" (dummy_value_D) // 1
4537
4538 : "0" (bpp), // eax // input regs
4539 "1" (row) // edi
4540
4541 : "%esi", "%ecx", "%edx" // clobber list
4542
4543#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4544 , "%mm0", "%mm1", "%mm2", "%mm3"
4545 , "%mm4", "%mm5", "%mm6", "%mm7"
4546#endif
4547 );
4548
4549 // now do the math for the rest of the row
4550 switch (bpp)
4551 {
4552 case 3:
4553 {
4554 _ActiveMask.use = 0x0000ffffff000000LL;
4555 _ShiftBpp.use = 24; // == 3 * 8
4556 _ShiftRem.use = 40; // == 64 - 24
4557
4558 __asm__ __volatile__ (
4559// preload "movl row, %%edi \n\t"
4560 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4561 // active byte group
4562 "movl %%edi, %%esi \n\t" // lp = row
4563// preload "movl bpp, %%eax \n\t"
4564 "addl %%eax, %%edi \n\t" // rp = row + bpp
4565 "movq %%mm7, %%mm6 \n\t"
4566 "movl _dif, %%edx \n\t"
4567 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4568 // 3rd active byte group
4569 // prime the pump: load the first Raw(x-bpp) data set
4570 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4571
4572 "sub_3lp: \n\t" // shift data for adding first
4573 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4574 // shift clears inactive bytes)
4575 // add 1st active group
4576 "movq (%%edi,%%edx,), %%mm0 \n\t"
4577 "paddb %%mm1, %%mm0 \n\t"
4578
4579 // add 2nd active group
4580 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4581 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4582 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4583 "paddb %%mm1, %%mm0 \n\t"
4584
4585 // add 3rd active group
4586 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4587 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4588 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4589 "addl $8, %%edx \n\t"
4590 "paddb %%mm1, %%mm0 \n\t"
4591
4592 "cmpl _MMXLength, %%edx \n\t"
4593 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4594 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4595 "jb sub_3lp \n\t"
4596
4597 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4598 "=D" (dummy_value_D) // 1
4599
4600 : "0" (bpp), // eax // input regs
4601 "1" (row) // edi
4602
4603 : "%edx", "%esi" // clobber list
4604#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4605 , "%mm0", "%mm1", "%mm6", "%mm7"
4606#endif
4607 );
4608 }
4609 break;
4610
4611 case 1:
4612 {
4613 __asm__ __volatile__ (
4614 "movl _dif, %%edx \n\t"
4615// preload "movl row, %%edi \n\t"
4616 "cmpl _FullLength, %%edx \n\t"
4617 "jnb sub_1end \n\t"
4618 "movl %%edi, %%esi \n\t" // lp = row
4619 "xorl %%eax, %%eax \n\t"
4620// preload "movl bpp, %%eax \n\t"
4621 "addl %%eax, %%edi \n\t" // rp = row + bpp
4622
4623 "sub_1lp: \n\t"
4624 "movb (%%esi,%%edx,), %%al \n\t"
4625 "addb %%al, (%%edi,%%edx,) \n\t"
4626 "incl %%edx \n\t"
4627 "cmpl _FullLength, %%edx \n\t"
4628 "jb sub_1lp \n\t"
4629
4630 "sub_1end: \n\t"
4631
4632 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4633 "=D" (dummy_value_D) // 1
4634
4635 : "0" (bpp), // eax // input regs
4636 "1" (row) // edi
4637
4638 : "%edx", "%esi" // clobber list
4639 );
4640 }
4641 return;
4642
4643 case 6:
4644 case 4:
4645 //case 7: // GRR BOGUS
4646 //case 5: // GRR BOGUS
4647 {
4648 _ShiftBpp.use = bpp << 3;
4649 _ShiftRem.use = 64 - _ShiftBpp.use;
4650
4651 __asm__ __volatile__ (
4652// preload "movl row, %%edi \n\t"
4653 "movl _dif, %%edx \n\t"
4654 "movl %%edi, %%esi \n\t" // lp = row
4655// preload "movl bpp, %%eax \n\t"
4656 "addl %%eax, %%edi \n\t" // rp = row + bpp
4657
4658 // prime the pump: load the first Raw(x-bpp) data set
4659 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4660
4661 "sub_4lp: \n\t" // shift data for adding first
4662 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4663 // shift clears inactive bytes)
4664 "movq (%%edi,%%edx,), %%mm0 \n\t"
4665 "paddb %%mm1, %%mm0 \n\t"
4666
4667 // add 2nd active group
4668 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4669 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4670 "addl $8, %%edx \n\t"
4671 "paddb %%mm1, %%mm0 \n\t"
4672
4673 "cmpl _MMXLength, %%edx \n\t"
4674 "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4675 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4676 "jb sub_4lp \n\t"
4677
4678 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4679 "=D" (dummy_value_D) // 1
4680
4681 : "0" (bpp), // eax // input regs
4682 "1" (row) // edi
4683
4684 : "%edx", "%esi" // clobber list
4685#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4686 , "%mm0", "%mm1"
4687#endif
4688 );
4689 }
4690 break;
4691
4692 case 2:
4693 {
4694 _ActiveMask.use = 0x00000000ffff0000LL;
4695 _ShiftBpp.use = 16; // == 2 * 8
4696 _ShiftRem.use = 48; // == 64 - 16
4697
4698 __asm__ __volatile__ (
4699 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4700 // active byte group
4701 "movl _dif, %%edx \n\t"
4702 "movq %%mm7, %%mm6 \n\t"
4703// preload "movl row, %%edi \n\t"
4704 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4705 // 3rd active byte group
4706 "movl %%edi, %%esi \n\t" // lp = row
4707 "movq %%mm6, %%mm5 \n\t"
4708// preload "movl bpp, %%eax \n\t"
4709 "addl %%eax, %%edi \n\t" // rp = row + bpp
4710 "psllq _ShiftBpp, %%mm5 \n\t" // move mask in mm5 to cover
4711 // 4th active byte group
4712 // prime the pump: load the first Raw(x-bpp) data set
4713 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4714
4715 "sub_2lp: \n\t" // shift data for adding first
4716 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4717 // shift clears inactive bytes)
4718 // add 1st active group
4719 "movq (%%edi,%%edx,), %%mm0 \n\t"
4720 "paddb %%mm1, %%mm0 \n\t"
4721
4722 // add 2nd active group
4723 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4724 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4725 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4726 "paddb %%mm1, %%mm0 \n\t"
4727
4728 // add 3rd active group
4729 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4730 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4731 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4732 "paddb %%mm1, %%mm0 \n\t"
4733
4734 // add 4th active group
4735 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4736 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4737 "pand %%mm5, %%mm1 \n\t" // mask to use 4th active group
4738 "addl $8, %%edx \n\t"
4739 "paddb %%mm1, %%mm0 \n\t"
4740 "cmpl _MMXLength, %%edx \n\t"
4741 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4742 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4743 "jb sub_2lp \n\t"
4744
4745 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4746 "=D" (dummy_value_D) // 1
4747
4748 : "0" (bpp), // eax // input regs
4749 "1" (row) // edi
4750
4751 : "%edx", "%esi" // clobber list
4752#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4753 , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4754#endif
4755 );
4756 }
4757 break;
4758
4759 case 8:
4760 {
4761 __asm__ __volatile__ (
4762// preload "movl row, %%edi \n\t"
4763 "movl _dif, %%edx \n\t"
4764 "movl %%edi, %%esi \n\t" // lp = row
4765// preload "movl bpp, %%eax \n\t"
4766 "addl %%eax, %%edi \n\t" // rp = row + bpp
4767 "movl _MMXLength, %%ecx \n\t"
4768
4769 // prime the pump: load the first Raw(x-bpp) data set
4770 "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4771 "andl $0x0000003f, %%ecx \n\t" // calc bytes over mult of 64
4772
4773 "sub_8lp: \n\t"
4774 "movq (%%edi,%%edx,), %%mm0 \n\t" // load Sub(x) for 1st 8 bytes
4775 "paddb %%mm7, %%mm0 \n\t"
4776 "movq 8(%%edi,%%edx,), %%mm1 \n\t" // load Sub(x) for 2nd 8 bytes
4777 "movq %%mm0, (%%edi,%%edx,) \n\t" // write Raw(x) for 1st 8 bytes
4778
4779 // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
4780 // This will be repeated for each group of 8 bytes with the 8th
4781 // group being used as the Raw(x-bpp) for the 1st group of the
4782 // next loop.
4783
4784 "paddb %%mm0, %%mm1 \n\t"
4785 "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
4786 "movq %%mm1, 8(%%edi,%%edx,) \n\t" // write Raw(x) for 2nd 8 bytes
4787 "paddb %%mm1, %%mm2 \n\t"
4788 "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
4789 "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
4790 "paddb %%mm2, %%mm3 \n\t"
4791 "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
4792 "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
4793 "paddb %%mm3, %%mm4 \n\t"
4794 "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
4795 "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
4796 "paddb %%mm4, %%mm5 \n\t"
4797 "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
4798 "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
4799 "paddb %%mm5, %%mm6 \n\t"
4800 "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
4801 "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
4802 "addl $64, %%edx \n\t"
4803 "paddb %%mm6, %%mm7 \n\t"
4804 "cmpl %%ecx, %%edx \n\t"
4805 "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
4806 "jb sub_8lp \n\t"
4807
4808 "cmpl _MMXLength, %%edx \n\t"
4809 "jnb sub_8lt8 \n\t"
4810
4811 "sub_8lpA: \n\t"
4812 "movq (%%edi,%%edx,), %%mm0 \n\t"
4813 "addl $8, %%edx \n\t"
4814 "paddb %%mm7, %%mm0 \n\t"
4815 "cmpl _MMXLength, %%edx \n\t"
4816 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
4817 "movq %%mm0, %%mm7 \n\t" // move calculated Raw(x) data
4818 // to mm1 to be new Raw(x-bpp)
4819 // for next loop
4820 "jb sub_8lpA \n\t"
4821
4822 "sub_8lt8: \n\t"
4823
4824 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4825 "=D" (dummy_value_D) // 1
4826
4827 : "0" (bpp), // eax // input regs
4828 "1" (row) // edi
4829
4830 : "%ecx", "%edx", "%esi" // clobber list
4831#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4832 , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4833#endif
4834 );
4835 }
4836 break;
4837
4838 default: // bpp greater than 8 bytes GRR BOGUS
4839 {
4840 __asm__ __volatile__ (
4841 "movl _dif, %%edx \n\t"
4842// preload "movl row, %%edi \n\t"
4843 "movl %%edi, %%esi \n\t" // lp = row
4844// preload "movl bpp, %%eax \n\t"
4845 "addl %%eax, %%edi \n\t" // rp = row + bpp
4846
4847 "sub_Alp: \n\t"
4848 "movq (%%edi,%%edx,), %%mm0 \n\t"
4849 "movq (%%esi,%%edx,), %%mm1 \n\t"
4850 "addl $8, %%edx \n\t"
4851 "paddb %%mm1, %%mm0 \n\t"
4852 "cmpl _MMXLength, %%edx \n\t"
4853 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
4854 // -8 to offset addl edx
4855 "jb sub_Alp \n\t"
4856
4857 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4858 "=D" (dummy_value_D) // 1
4859
4860 : "0" (bpp), // eax // input regs
4861 "1" (row) // edi
4862
4863 : "%edx", "%esi" // clobber list
4864#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4865 , "%mm0", "%mm1"
4866#endif
4867 );
4868 }
4869 break;
4870
4871 } // end switch (bpp)
4872
4873 __asm__ __volatile__ (
4874 "movl _MMXLength, %%edx \n\t"
4875//pre "movl row, %%edi \n\t"
4876 "cmpl _FullLength, %%edx \n\t"
4877 "jnb sub_end \n\t"
4878
4879 "movl %%edi, %%esi \n\t" // lp = row
4880//pre "movl bpp, %%eax \n\t"
4881 "addl %%eax, %%edi \n\t" // rp = row + bpp
4882 "xorl %%eax, %%eax \n\t"
4883
4884 "sub_lp2: \n\t"
4885 "movb (%%esi,%%edx,), %%al \n\t"
4886 "addb %%al, (%%edi,%%edx,) \n\t"
4887 "incl %%edx \n\t"
4888 "cmpl _FullLength, %%edx \n\t"
4889 "jb sub_lp2 \n\t"
4890
4891 "sub_end: \n\t"
4892 "EMMS \n\t" // end MMX instructions
4893
4894 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4895 "=D" (dummy_value_D) // 1
4896
4897 : "0" (bpp), // eax // input regs
4898 "1" (row) // edi
4899
4900 : "%edx", "%esi" // clobber list
4901 );
4902
4903} // end of png_read_filter_row_mmx_sub()
4904#endif
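// All of the Sub cases above implement one scalar recurrence,
// Raw(x) = Sub(x) + Raw(x-bpp) mod 256; the per-bpp MMX variants differ only
// in how they propagate the intra-qword dependency.  A plain-C sketch of that
// recurrence (illustrative name and signature, not part of libpng):

```c
/* Scalar Sub-filter decode: each byte gets the reconstructed byte one
 * pixel (bpp bytes) to its left added in, modulo 256.  The first bpp
 * bytes have no left neighbor and are left unchanged. */
static void sub_decode_scalar(unsigned char *row, unsigned long rowbytes,
                              int bpp)
{
    unsigned long i;

    for (i = (unsigned long)bpp; i < rowbytes; i++)
        row[i] = (unsigned char)(row[i] + row[i - bpp]);
}
```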
4905
4906
4907
4908
4909//===========================================================================//
4910// //
4911// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P //
4912// //
4913//===========================================================================//
4914
4915// Optimized code for PNG Up filter decoder
4916
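// The Up decode below is the simplest filter: Raw(x) = Up(x) + Prior(x)
// mod 256, with no dependency on earlier bytes of the same row, which is why
// the MMX loop can be unrolled to 64 bytes per iteration.  A scalar sketch
// for comparison (illustrative name, not libpng code):

```c
/* Scalar Up-filter decode: add the byte directly above (same offset in
 * the previous row), modulo 256.  Each byte is independent of the rest
 * of its row, so this loop vectorizes trivially. */
static void up_decode_scalar(unsigned char *row,
                             const unsigned char *prev_row,
                             unsigned long rowbytes)
{
    unsigned long i;

    for (i = 0; i < rowbytes; i++)
        row[i] = (unsigned char)(row[i] + prev_row[i]);
}
```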
4917static void /* PRIVATE */
4918png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4919 png_bytep prev_row)
4920{
4921 png_uint_32 len;
4922 int dummy_value_d; // fix 'forbidden register 3 (dx) was spilled' error
4923 int dummy_value_S;
4924 int dummy_value_D;
4925
4926 len = row_info->rowbytes; // number of bytes to filter
4927
   __asm__ __volatile__ (
//pre "movl row, %%edi             \n\t"
      // get # of bytes to alignment
#ifdef __PIC__
      "pushl %%ebx                 \n\t"
#endif
      "movl %%edi, %%ecx           \n\t"
      "xorl %%ebx, %%ebx           \n\t"
      "addl $0x7, %%ecx            \n\t"
      "xorl %%eax, %%eax           \n\t"
      "andl $0xfffffff8, %%ecx     \n\t"
//pre "movl prev_row, %%esi        \n\t"
      "subl %%edi, %%ecx           \n\t"
      "jz up_go                    \n\t"

   "up_lp1:                        \n\t" // fix alignment
      "movb (%%edi,%%ebx,), %%al   \n\t"
      "addb (%%esi,%%ebx,), %%al   \n\t"
      "incl %%ebx                  \n\t"
      "cmpl %%ecx, %%ebx           \n\t"
      "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
      "jb up_lp1                   \n\t" //  offset incl ebx

   "up_go:                         \n\t"
//pre "movl len, %%edx             \n\t"
      "movl %%edx, %%ecx           \n\t"
      "subl %%ebx, %%edx           \n\t" // subtract alignment fix
      "andl $0x0000003f, %%edx     \n\t" // calc bytes over mult of 64
      "subl %%edx, %%ecx           \n\t" // drop over bytes from length

      // unrolled loop - use all MMX registers and interleave to reduce
      // number of branch instructions (loops) and reduce partial stalls
   "up_loop:                       \n\t"
      "movq (%%esi,%%ebx,), %%mm1  \n\t"
      "movq (%%edi,%%ebx,), %%mm0  \n\t"
      "movq 8(%%esi,%%ebx,), %%mm3 \n\t"
      "paddb %%mm1, %%mm0          \n\t"
      "movq 8(%%edi,%%ebx,), %%mm2 \n\t"
      "movq %%mm0, (%%edi,%%ebx,)  \n\t"
      "paddb %%mm3, %%mm2          \n\t"
      "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
      "movq %%mm2, 8(%%edi,%%ebx,) \n\t"
      "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
      "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
      "paddb %%mm5, %%mm4          \n\t"
      "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
      "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
      "paddb %%mm7, %%mm6          \n\t"
      "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
      "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
      "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
      "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
      "paddb %%mm1, %%mm0          \n\t"
      "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
      "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
      "paddb %%mm3, %%mm2          \n\t"
      "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
      "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
      "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
      "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
      "paddb %%mm5, %%mm4          \n\t"
      "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
      "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
      "addl $64, %%ebx             \n\t"
      "paddb %%mm7, %%mm6          \n\t"
      "cmpl %%ecx, %%ebx           \n\t"
      "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
      "jb up_loop                  \n\t"  //  -8 to offset addl ebx

      "cmpl $0, %%edx              \n\t" // test for bytes over mult of 64
      "jz up_end                   \n\t"

      "cmpl $8, %%edx              \n\t" // test for less than 8 bytes
      "jb up_lt8                   \n\t" // [added by lcreeve at netins.net]

      "addl %%edx, %%ecx           \n\t"
      "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
      "subl %%edx, %%ecx           \n\t" // drop over bytes from length
      "jz up_lt8                   \n\t"

   "up_lpA:                        \n\t" // use MMX regs to update 8 bytes sim.
      "movq (%%esi,%%ebx,), %%mm1  \n\t"
      "movq (%%edi,%%ebx,), %%mm0  \n\t"
      "addl $8, %%ebx              \n\t"
      "paddb %%mm1, %%mm0          \n\t"
      "cmpl %%ecx, %%ebx           \n\t"
      "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
      "jb up_lpA                   \n\t"  //  offset add ebx
      "cmpl $0, %%edx              \n\t" // test for bytes over mult of 8
      "jz up_end                   \n\t"

   "up_lt8:                        \n\t"
      "xorl %%eax, %%eax           \n\t"
      "addl %%edx, %%ecx           \n\t" // move over byte count into counter

   "up_lp2:                        \n\t" // use x86 regs for remaining bytes
      "movb (%%edi,%%ebx,), %%al   \n\t"
      "addb (%%esi,%%ebx,), %%al   \n\t"
      "incl %%ebx                  \n\t"
      "cmpl %%ecx, %%ebx           \n\t"
      "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
      "jb up_lp2                   \n\t" //  offset inc ebx

   "up_end:                        \n\t"
      "EMMS                        \n\t" // conversion of filtered row complete
#ifdef __PIC__
      "popl %%ebx                  \n\t"
#endif

      : "=d" (dummy_value_d),   // 0      // output regs (dummy)
        "=S" (dummy_value_S),   // 1
        "=D" (dummy_value_D)    // 2

      : "0" (len),              // edx    // input regs
        "1" (prev_row),         // esi
        "2" (row)               // edi

      : "%eax", "%ecx"                    // clobber list (no input regs!)
#ifndef __PIC__
      , "%ebx"
#endif

#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
      , "%mm0", "%mm1", "%mm2", "%mm3"
      , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
   );

} // end of png_read_filter_row_mmx_up()
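The unrolled MMX loop above is byte-wise addition modulo 256: each output byte is the raw byte plus the byte directly above it, which `paddb` computes eight bytes at a time. A minimal scalar sketch of the same Up-filter arithmetic, for illustration only (the helper name `up_filter_scalar` is hypothetical and not part of libpng's API):

```c
#include <stddef.h>

/* Scalar equivalent of the Up filter: row[i] += prev_row[i] (mod 256).
 * This is exactly what the MMX "paddb" loop computes, eight bytes per
 * iteration, with separate loops to handle alignment and the tail. */
static void up_filter_scalar(unsigned char *row, const unsigned char *prev_row,
                             size_t rowbytes)
{
   size_t i;
   for (i = 0; i < rowbytes; i++)
      row[i] = (unsigned char)(row[i] + prev_row[i]);   /* wraps mod 256 */
}
```

The unsigned-char addition wraps silently, matching the `& 0xff` masking used by the C fallback later in this file.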

#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */




/*===========================================================================*/
/*                                                                           */
/*                   P N G _ R E A D _ F I L T E R _ R O W                   */
/*                                                                           */
/*===========================================================================*/


/* Optimized png_read_filter_row routines */

void /* PRIVATE */
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
   row, png_bytep prev_row, int filter)
{
#ifdef PNG_DEBUG
   char filnm[10];
#endif
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
/* GRR:  these are superseded by png_ptr->asm_flags: */
#define UseMMX_sub    1   // GRR:  converted 20000730
#define UseMMX_up     1   // GRR:  converted 20000729
#define UseMMX_avg    1   // GRR:  converted 20000828 (+ 16-bit bugfix 20000916)
#define UseMMX_paeth  1   // GRR:  converted 20000828

   if (_mmx_supported == 2) {
       /* this should have happened in png_init_mmx_flags() already */
#if !defined(PNG_1_0_X)
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */

#ifdef PNG_DEBUG
   png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
   switch (filter)
   {
      case 0: sprintf(filnm, "none");
         break;
      case 1: sprintf(filnm, "sub-%s",
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
#endif
#endif
        "x86");
         break;
      case 2: sprintf(filnm, "up-%s",
#ifdef PNG_ASSEMBLER_CODE_SUPPORTED
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
#endif
#endif
        "x86");
         break;
      case 3: sprintf(filnm, "avg-%s",
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
#endif
#endif
        "x86");
         break;
      case 4: sprintf(filnm, "Paeth-%s",
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX" :
#endif
#endif
        "x86");
         break;
      default: sprintf(filnm, "unknw");
         break;
   }
   png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
   png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
   png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
      (int)((row_info->pixel_depth + 7) >> 3));
   png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
#endif /* PNG_DEBUG */

   switch (filter)
   {
      case PNG_FILTER_VALUE_NONE:
         break;

      case PNG_FILTER_VALUE_SUB:
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_sub(row_info, row);
         }
         else
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_bytep rp = row + bpp;
            png_bytep lp = row;

            for (i = bpp; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
               rp++;
            }
         }  /* end !UseMMX_sub */
         break;

      case PNG_FILTER_VALUE_UP:
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_up(row_info, row, prev_row);
         }
         else
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_bytep rp = row;
            png_bytep pp = prev_row;

            for (i = 0; i < istop; ++i)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }
         }  /* end !UseMMX_up */
         break;

      case PNG_FILTER_VALUE_AVG:
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_avg(row_info, row, prev_row);
         }
         else
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++) >> 1)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
               rp++;
            }
         }  /* end !UseMMX_avg */
         break;

      case PNG_FILTER_VALUE_PAETH:
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
         }
         else
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_bytep cp = prev_row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)   /* use leftover rp,pp */
            {
               int a, b, c, pa, pb, pc, p;

               a = *lp++;
               b = *pp++;
               c = *cp++;

               p = b - c;
               pc = a - c;

#ifdef PNG_USE_ABS
               pa = abs(p);
               pb = abs(pc);
               pc = abs(p + pc);
#else
               pa = p < 0 ? -p : p;
               pb = pc < 0 ? -pc : pc;
               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
#endif

               /*
               if (pa <= pb && pa <= pc)
                  p = a;
               else if (pb <= pc)
                  p = b;
               else
                  p = c;
                */

               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;

               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
               rp++;
            }
         }  /* end !UseMMX_paeth */
         break;

      default:
         png_warning(png_ptr, "Ignoring bad row-filter type");
         *row = 0;
         break;
   }
}
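The Paeth case above folds the standard predictor into differences (`p = b - c`, `pc = a - c`) so each distance needs only one subtraction. A self-contained sketch of the equivalent plain predictor, for illustration only (the helper name `paeth_predictor` is hypothetical, not a libpng symbol):

```c
#include <stdlib.h>

/* The Paeth predictor: pick whichever of left (a), above (b), or
 * upper-left (c) is closest to the initial estimate a + b - c, with
 * ties broken in the order a, b, c.  Matches the fallback C loop in
 * png_read_filter_row(): |b-c| == |p-a|, |a-c| == |p-b|,
 * |a+b-2c| == |p-c|, where p = a + b - c. */
static int paeth_predictor(int a, int b, int c)
{
   int p  = b - c;          /* == (a + b - c) - a */
   int q  = a - c;          /* == (a + b - c) - b */
   int pa = abs(p);
   int pb = abs(q);
   int pc = abs(p + q);     /* == |a + b - 2c| */
   return (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
}
```

For example, with a=10, b=20, c=15 the estimate is 15 and the predictor returns c; with equal inputs the a-first tie-breaking returns a.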

#endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */


/*===========================================================================*/
/*                                                                           */
/*                      P N G _ M M X _ S U P P O R T                        */
/*                                                                           */
/*===========================================================================*/

/* GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
 *             (2) all instructions compile with gcc 2.7.2.3 and later
 *             (3) the function is moved down here to prevent gcc from
 *                 inlining it in multiple places and then barfing be-
 *                 cause the ".NOT_SUPPORTED" label is multiply defined
 *                 [is there a way to signal that a *single* function should
 *                  not be inlined?  is there a way to modify the label for
 *                  each inlined instance, e.g., by appending _1, _2, etc.?
 *                  maybe if don't use leading "." in label name?  (nope...sigh)]
 */

int PNGAPI
png_mmx_support(void)
{
#if defined(PNG_MMX_CODE_SUPPORTED)
    __asm__ __volatile__ (
        "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
        "pushl %%ecx          \n\t"  // so does ecx...
        "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
//      ".byte  0x66          \n\t"  // convert 16-bit pushf to 32-bit pushfd
//      "pushf                \n\t"  // 16-bit pushf
        "pushfl               \n\t"  // save Eflag to stack
        "popl %%eax           \n\t"  // get Eflag from stack into eax
        "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
        "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
        "pushl %%eax          \n\t"  // save modified Eflag back to stack
//      ".byte  0x66          \n\t"  // convert 16-bit popf to 32-bit popfd
//      "popf                 \n\t"  // 16-bit popf
        "popfl                \n\t"  // restore modified value to Eflag reg
        "pushfl               \n\t"  // save Eflag to stack
        "popl %%eax           \n\t"  // get Eflag from stack
        "pushl %%ecx          \n\t"  // save original Eflag to stack
        "popfl                \n\t"  // restore original Eflag
        "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
        "jz 0f                \n\t"  // if same, CPUID instr. is not supported

        "xorl %%eax, %%eax    \n\t"  // set eax to zero
//      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
        "cpuid                \n\t"  // get the CPU identification info
        "cmpl $1, %%eax       \n\t"  // make sure eax returned a non-zero value
        "jl 0f                \n\t"  // if eax is zero, MMX is not supported

        "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
        "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
                                     //  faster than the instruction "mov eax, 1"
        "cpuid                \n\t"  // get the CPU identification info again
        "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
        "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
        "jz 0f                \n\t"  // jump if zero; non-zero = MMX IS supported

        "movl $1, %%eax       \n\t"  // set return value to 1
        "jmp  1f              \n\t"  // DONE:  have MMX support

    "0:                       \n\t"  // .NOT_SUPPORTED: target label for jumps
        "movl $0, %%eax       \n\t"  // set return value to 0
    "1:                       \n\t"  // .RETURN: target label for jumps
        "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
        "popl %%edx           \n\t"  // restore edx
        "popl %%ecx           \n\t"  // restore ecx
        "popl %%ebx           \n\t"  // restore ebx

//      "ret                  \n\t"  // DONE:  no MMX support
                                     // (fall through to standard C "ret")

        : // output list (none)

        : // any variables used on input (none)

        : "%eax"               // clobber list
//        , "%ebx", "%ecx", "%edx"  // GRR: we handle these manually
//        , "memory"   // if write to a variable gcc thought was in a reg
//        , "cc"       // "condition codes" (flag bits)
    );
#else
    _mmx_supported = 0;
#endif /* PNG_MMX_CODE_SUPPORTED */

    return _mmx_supported;
}
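The hand-rolled EFLAGS toggle and CPUID sequence above can be expressed with compiler builtins on modern GCC/clang, which run the same CPUID feature test at run time. A sketch under that assumption (the wrapper name `mmx_supported_builtin` is hypothetical; `__builtin_cpu_supports` requires GCC 4.8+ or clang on x86):

```c
/* Modern alternative to the inline-asm probe: let the compiler emit
 * the CPUID check.  Returns 1 if MMX is available, else 0. */
static int mmx_supported_builtin(void)
{
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    __builtin_cpu_init();                       /* populate CPU feature cache */
    return __builtin_cpu_supports("mmx") ? 1 : 0;
#else
    return 0;   /* non-x86 or non-GNU compiler: report no MMX, like the
                   #else branch of png_mmx_support() above */
#endif
}
```

Unlike the asm version, this needs no manual register save/restore and no global label tricks, which is exactly the multiply-defined-label problem the GRR notes above describe.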


#endif /* PNG_USE_PNGGCCRD */