5
* Enhanced MMX optimized Sum Absolute Differences routines for macroblocks
6
* (interpolated, 1-pel, 2*2 sub-sampled pel and 4*4 sub-sampled pel)
8
* sad_* Original Copyright (C) 2000 Chris Atenasio <chris@crud.net>
9
* Enhancements and rest Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
12
* This program is free software; you can redistribute it and/or
13
* modify it under the terms of the GNU General Public License
14
* as published by the Free Software Foundation; either version 2
15
* of the License, or (at your option) any later version.
17
* This program is distributed in the hope that it will be useful,
18
* but WITHOUT ANY WARRANTY; without even the implied warranty of
19
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20
* GNU General Public License for more details.
22
* You should have received a copy of the GNU General Public License
23
* along with this program; if not, write to the Free Software
24
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
28
#include "mjpeg_types.h"
29
#if defined( ARCH_X86) || defined(ARCH_X86_64)
33
int sad_00_mmx(uint8_t *blk1,uint8_t *blk2,int lx,int h, int distlim)
37
* N.b. distlim is *ignored* as testing for it is more expensive than the
38
* occasional saving by aborting the computation early...
40
* mm0 = distance accumulators (4 words)
54
movq_m2r(blk1[0], mm4); /* load first 8 bytes of p1 row */
55
movq_m2r(blk2[0], mm5); /* load first 8 bytes of p2 row */
57
movq_r2r(mm4, mm7); /* mm5 = abs(mm4-mm5) */
58
psubusb_r2r(mm5, mm7); /* mm7 := max(mm4-mm5, 0) per unsigned byte (saturating subtract) */
59
psubusb_r2r(mm4, mm5); /* mm5 := max(mm5-mm4, 0) per unsigned byte (saturating subtract) */
62
/* Add the abs(mm4-mm5) bytes to the accumulators */
63
movq_m2r(blk1[8], mm2); /* load second 8 bytes of p1 row (interleaved) */
64
movq_r2r(mm5, mm7); /* mm7 := copy of mm5; its low bytes B0..3 are unpacked to words next */
65
punpcklbw_r2r(mm6, mm7); /* low 4 bytes of mm7 interleaved with mm6's bytes into 4 words (presumably mm6 == 0 -- confirm) */
66
movq_m2r(blk2[8], mm3); /* load second 8 bytes of p2 row */
68
punpckhbw_r2r(mm6, mm5); /* high 4 bytes of mm5 interleaved with mm6's bytes into 4 words */
71
/* This is logically where the mm2, mm3 loads would go... */
73
movq_r2r(mm2, mm7); /* mm3 = abs(mm2-mm3) */
74
psubusb_r2r(mm3, mm7); /* mm7 := max(mm2-mm3, 0) per unsigned byte (saturating subtract) */
75
psubusb_r2r(mm2, mm3); /* mm3 := max(mm3-mm2, 0) per unsigned byte (saturating subtract) */
78
/* Add the abs(mm2-mm3) bytes to the accumulators */
80
punpcklbw_r2r(mm6, mm7); /* low 4 bytes of mm7 interleaved with mm6's bytes into 4 words */
81
punpckhbw_r2r(mm6, mm3); /* high 4 bytes of mm3 interleaved with mm6's bytes into 4 words */
84
blk1 += lx; /* update pointers to next row */
92
/* Sum the Accumulators */
93
movq_r2r(mm0, mm5); /* mm5 := [W0+W2,W1+W3, mm0 */
98
movq_r2r(mm4, mm7); /* NOTE(review): copy goes to mm7, not mm6 as the original note said: [W0+W2+W1+W3, mm0] */
101
movd_r2g(mm4, rv); /* store return value */
112
* sad_01_mmx.s: mmx1 optimised 7bit*8 word absolute difference sum
113
* We reduce to seven bits as otherwise we also have to mess
114
* horribly with carries and signed only comparisons make the code
115
* simply enormous (and probably barely faster than a simple loop).
116
* Since signals with a bona-fide 8bit res will be rare we simply
117
* take the precision hit...
118
* Actually we don't worry about carries from the low-order bits
119
* either so 1/4 of the time we'll be 1 too low...
121
* Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
124
* This program is free software; you can redistribute it and/or
125
* modify it under the terms of the GNU General Public License
126
* as published by the Free Software Foundation; either version 2
127
* of the License, or (at your option) any later version.
129
* This program is distributed in the hope that it will be useful,
130
* but WITHOUT ANY WARRANTY; without even the implied warranty of
131
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
132
* GNU General Public License for more details.
134
* You should have received a copy of the GNU General Public License
135
* along with this program; if not, write to the Free Software
136
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
139
int sad_01_mmx(uint8_t *p1, uint8_t *p2, int lx, int h)
144
* mm0 = distance accumulators (4 words)
148
* mm4 = temp 4 bytes in words interpolating p1, p1+1
149
* mm5 = temp 4 bytes in words from p2
150
* mm6 = temp comparison bit mask p1,p2
151
* mm7 = temp comparison bit mask p2,p1
157
/* First 8 bytes of row */
159
/* First 4 bytes of 8 */
161
movq_m2r(p1[0], mm4); /* mm4 := first 4 bytes p1 */
163
movq_r2r(mm4, mm2); /* mm2 records all 8 bytes */
164
punpcklbw_r2r(mm7, mm4); /* First 4 bytes p1 in Words... */
166
movq_m2r(p1[1], mm6); /* mm6 := first 4 bytes p1+1 */
167
movq_r2r(mm6, mm3); /* mm3 records all 8 bytes */
168
punpcklbw_r2r(mm7, mm6); /* First 4 bytes p1+1 in words */
169
paddw_r2r(mm6, mm4); /* mm4 := First 4 bytes interpolated in words */
172
movq_m2r(p2[0], mm5); /* mm5:=first 4 bytes of p2 in words */
174
punpcklbw_r2r(mm7, mm5);
177
pcmpgtw_r2r(mm5, mm7); /* mm7 := [i : W0..3,mm4>mm5] */
179
movq_r2r(mm4, mm6); /* mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] */
183
paddw_r2r(mm6, mm0); /* Add to accumulator */
185
movq_r2r(mm5, mm6); /* mm6 := [i : W0..3,mm5>mm4] */
186
pcmpgtw_r2r(mm4, mm6);
187
psubw_r2r(mm4, mm5); /* mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)] */
190
paddw_r2r(mm5, mm0); /* Add to accumulator */
192
/* Second 4 bytes of 8 */
194
movq_r2r(mm2, mm4); /* mm4 := Second 4 bytes p1 in words */
196
punpckhbw_r2r(mm7, mm4);
197
movq_r2r(mm3, mm6); /* mm6 := Second 4 bytes p1+1 in words */
198
punpckhbw_r2r(mm7, mm6);
200
paddw_r2r(mm6, mm4); /* mm4 := Second 4 interpolated bytes in words */
203
movq_r2r(mm1, mm5); /* mm5:= second 4 bytes of p2 in words */
204
punpckhbw_r2r(mm7, mm5);
207
pcmpgtw_r2r(mm5, mm7); /* mm7 := [i : W0..3,mm4>mm5] */
209
movq_r2r(mm4, mm6); /* mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] */
213
paddw_r2r(mm6, mm0); /* Add to accumulator */
215
movq_r2r(mm5, mm6); /* mm6 := [i : W0..3,mm5>mm4] */
216
pcmpgtw_r2r(mm4, mm6);
217
psubw_r2r(mm4, mm5); /* mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)] */
220
paddw_r2r(mm5, mm0); /* Add to accumulator */
223
/* Second 8 bytes of row */
225
/* First 4 bytes of 8 */
227
movq_m2r(p1[8], mm4); /* mm4 := first 4 bytes p1+8 */
229
movq_r2r(mm4, mm2); /* mm2 records all 8 bytes */
230
punpcklbw_r2r(mm7, mm4); /* First 4 bytes p1+8 in words... */
232
movq_m2r(p1[9], mm6); /* mm6 := first 4 bytes p1+9 */
233
movq_r2r(mm6, mm3); /* mm3 records all 8 bytes */
234
punpcklbw_r2r(mm7, mm6); /* First 4 bytes p1+9 in words */
235
paddw_r2r(mm6, mm4); /* mm4 := First 4 bytes interpolated in words */
238
movq_m2r(p2[8], mm5); /* mm5:=first 4 bytes of p2+8 in words */
240
punpcklbw_r2r(mm7, mm5);
243
pcmpgtw_r2r(mm5, mm7); /* mm7 := [i : W0..3,mm4>mm5] */
245
movq_r2r(mm4, mm6); /* mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] */
249
paddw_r2r(mm6, mm0); /* Add to accumulator */
251
movq_r2r(mm5, mm6); /* mm6 := [i : W0..3,mm5>mm4] */
252
pcmpgtw_r2r(mm4, mm6);
253
psubw_r2r(mm4, mm5); /* mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)] */
256
paddw_r2r(mm5, mm0); /* Add to accumulator */
258
/* Second 4 bytes of 8 */
260
movq_r2r(mm2, mm4); /* mm4 := Second 4 bytes p1+8 in words */
262
punpckhbw_r2r(mm7, mm4);
263
movq_r2r(mm3, mm6); /* mm6 := Second 4 bytes p1+9 in words */
264
punpckhbw_r2r(mm7, mm6);
266
paddw_r2r(mm6, mm4); /* mm4 := Second 4 interpolated bytes in words */
269
movq_r2r(mm1, mm5); /* mm5:= second 4 bytes of p2 in words */
270
punpckhbw_r2r(mm7, mm5);
273
pcmpgtw_r2r(mm5, mm7); /* mm7 := [i : W0..3,mm4>mm5] */
275
movq_r2r(mm4, mm6); /* mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] */
279
paddw_r2r(mm6, mm0); /* Add to accumulator */
281
movq_r2r(mm5, mm6); /* mm6 := [i : W0..3,mm5>mm4] */
282
pcmpgtw_r2r(mm4, mm6);
283
psubw_r2r(mm4, mm5); /* mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)] */
286
paddw_r2r(mm5, mm0); /* Add to accumulator */
289
p1 += lx; /* update pointers to next row */
295
/* Sum the Accumulators */
302
movd_r2g(mm0, rv); /* store return value */
312
* sad_10_mmx: mmx1 optimised 7bit*8 word absolute difference sum
313
* We reduce to seven bits as otherwise we also have to mess
314
* horribly with carries and signed only comparisons make the code
315
* simply enormous (and probably barely faster than a simple loop).
316
* Since signals with a bona-fide 8bit res will be rare we simply
317
* take the precision hit...
318
* Actually we don't worry about carries from the low-order bits
319
* either so 1/4 of the time we'll be 1 too low...
321
* Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
324
* This program is free software; you can redistribute it and/or
325
* modify it under the terms of the GNU General Public License
326
* as published by the Free Software Foundation; either version 2
327
* of the License, or (at your option) any later version.
329
* This program is distributed in the hope that it will be useful,
330
* but WITHOUT ANY WARRANTY; without even the implied warranty of
331
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
332
* GNU General Public License for more details.
334
* You should have received a copy of the GNU General Public License
335
* along with this program; if not, write to the Free Software
336
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
339
int sad_10_mmx(uint8_t *p1, uint8_t *p2, int lx, int h)
344
* mm0 = distance accumulators (4 words)
348
* mm4 = temp 4 bytes in words interpolating p1, p1+lx
349
* mm5 = temp 4 bytes in words from p2
350
* mm6 = temp comparison bit mask p1,p2
351
* mm7 = temp comparison bit mask p2,p1
357
/* First 8 bytes of row */
359
/* First 4 bytes of 8 */
361
movq_m2r(p1[0], mm4); /* mm4 := first 4 bytes p1 */
363
movq_r2r(mm4, mm2); /* mm2 records all 8 bytes */
364
punpcklbw_r2r(mm7, mm4); /* First 4 bytes p1 in Words... */
366
movq_m2r(p1[lx], mm6); /* mm6 := first 4 bytes p1+lx */
367
movq_r2r(mm6, mm3); /* mm3 records all 8 bytes */
368
punpcklbw_r2r(mm7, mm6); /* First 4 bytes p1+lx in words */
369
paddw_r2r(mm6, mm4); /* mm4 := First 4 bytes interpolated in words */
372
movq_m2r(p2[0], mm5); /* mm5:=first 4 bytes of p2 in words */
374
punpcklbw_r2r(mm7, mm5);
377
pcmpgtw_r2r(mm5, mm7); /* mm7 := [i : W0..3,mm4>mm5] */
379
movq_r2r(mm4, mm6); /* mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] */
383
paddw_r2r(mm6, mm0); /* Add to accumulator */
385
movq_r2r(mm5, mm6); /* mm6 := [i : W0..3,mm5>mm4] */
386
pcmpgtw_r2r(mm4, mm6);
387
psubw_r2r(mm4, mm5); /* mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)] */
390
paddw_r2r(mm5, mm0); /* Add to accumulator */
393
/* Second 4 bytes of 8 */
395
movq_r2r(mm2, mm4); /* mm4 := Second 4 bytes p1 in words */
397
punpckhbw_r2r(mm7, mm4);
398
movq_r2r(mm3, mm6); /* mm6 := Second 4 bytes p1+lx in words */
399
punpckhbw_r2r(mm7, mm6);
401
paddw_r2r(mm6, mm4); /* mm4 := Second 4 interpolated bytes in words */
404
movq_r2r(mm1, mm5); /* mm5:= second 4 bytes of p2 in words */
405
punpckhbw_r2r(mm7, mm5);
408
pcmpgtw_r2r(mm5, mm7); /* mm7 := [i : W0..3,mm4>mm5] */
410
movq_r2r(mm4, mm6); /* mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] */
414
paddw_r2r(mm6, mm0); /* Add to accumulator */
416
movq_r2r(mm5, mm6); /* mm6 := [i : W0..3,mm5>mm4] */
417
pcmpgtw_r2r(mm4, mm6);
418
psubw_r2r(mm4, mm5); /* mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)] */
421
paddw_r2r(mm5, mm0); /* Add to accumulator */
424
/* Second 8 bytes of row */
426
/* First 4 bytes of 8 */
428
movq_m2r(p1[8], mm4); /* mm4 := first 4 bytes p1+8 */
430
movq_r2r(mm4, mm2); /* mm2 records all 8 bytes */
431
punpcklbw_r2r(mm7, mm4); /* First 4 bytes p1+8 in words... */
433
movq_m2r(p1[lx+8], mm6); /* mm6 := first 4 bytes p1+lx+8 */
434
movq_r2r(mm6, mm3); /* mm3 records all 8 bytes */
435
punpcklbw_r2r(mm7, mm6); /* First 4 bytes p1+lx+8 in words */
436
paddw_r2r(mm6, mm4); /* mm4 := First 4 bytes interpolated in words */
439
movq_m2r(p2[8], mm5); /* mm5:=first 4 bytes of p2+8 in words */
441
punpcklbw_r2r(mm7, mm5);
444
pcmpgtw_r2r(mm5, mm7); /* mm7 := [i : W0..3,mm4>mm5] */
446
movq_r2r(mm4, mm6); /* mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] */
450
paddw_r2r(mm6, mm0); /* Add to accumulator */
452
movq_r2r(mm5, mm6); /* mm6 := [i : W0..3,mm5>mm4] */
453
pcmpgtw_r2r(mm4, mm6);
454
psubw_r2r(mm4, mm5); /* mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)] */
457
paddw_r2r(mm5, mm0); /* Add to accumulator */
459
/* Second 4 bytes of 8 */
461
movq_r2r(mm2, mm4); /* mm4 := Second 4 bytes p1+8 in words */
463
punpckhbw_r2r(mm7, mm4);
464
movq_r2r(mm3, mm6); /* mm6 := Second 4 bytes p1+lx+8 in words */
465
punpckhbw_r2r(mm7, mm6);
467
paddw_r2r(mm6, mm4); /* mm4 := Second 4 interpolated bytes in words */
470
movq_r2r(mm1, mm5); /* mm5:= second 4 bytes of p2 in words */
471
punpckhbw_r2r(mm7, mm5);
474
pcmpgtw_r2r(mm5, mm7); /* mm7 := [i : W0..3,mm4>mm5] */
476
movq_r2r(mm4, mm6); /* mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] */
480
paddw_r2r(mm6, mm0); /* Add to accumulator */
482
movq_r2r(mm5, mm6); /* mm6 := [i : W0..3,mm5>mm4] */
483
pcmpgtw_r2r(mm4, mm6);
484
psubw_r2r(mm4, mm5); /* mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)] */
487
paddw_r2r(mm5, mm0); /* Add to accumulator */
495
/* Sum the Accumulators */
502
movd_r2g(mm0, rv); /* store return value */
515
* Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
518
* This program is free software; you can redistribute it and/or
519
* modify it under the terms of the GNU General Public License
520
* as published by the Free Software Foundation; either version 2
521
* of the License, or (at your option) any later version.
523
* This program is distributed in the hope that it will be useful,
524
* but WITHOUT ANY WARRANTY; without even the implied warranty of
525
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
526
* GNU General Public License for more details.
528
* You should have received a copy of the GNU General Public License
529
* along with this program; if not, write to the Free Software
530
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
533
int sad_11_mmx(uint8_t *p1, uint8_t *p2, int lx, int h)
538
* mm0 = distance accumulators (4 words)
542
* I'd love to find someplace to stash p1+1 and p1+lx+1's bytes
543
* but I don't think that's going to happen in iA32-land...
544
* mm4 = temp 4 bytes in words interpolating p1, p1+1, p1+lx, p1+lx+1
545
* mm5 = temp 4 bytes in words from p2
546
* mm6 = temp comparison bit mask p1,p2
547
* mm7 = temp comparison bit mask p2,p1
554
/* First 8 bytes of row */
556
/* First 4 bytes of 8 */
558
movq_m2r(p1[0], mm4); /* mm4 := first 4 bytes p1 */
560
movq_r2r(mm4, mm2); /* mm2 records all 8 bytes */
561
punpcklbw_r2r(mm7, mm4); /* First 4 bytes p1 in Words... */
563
movq_m2r(p1[lx], mm6); /* mm6 := first 4 bytes p1+lx */
564
movq_r2r(mm6, mm3); /* mm3 records all 8 bytes */
565
punpcklbw_r2r(mm7, mm6); /* First 4 bytes p1+lx in words */
568
movq_m2r(p1[1], mm5); /* mm5 := first 4 bytes p1+1 */
569
punpcklbw_r2r(mm7, mm5); /* First 4 bytes p1+1 in words... */
571
movq_m2r(p1[lx+1], mm6); /* mm6 := first 4 bytes p1+lx+1 */
572
punpcklbw_r2r(mm7, mm6); /* First 4 bytes p1+lx+1 in words */
575
psrlw_i2r(2, mm4); /* mm4 := First 4 bytes interpolated in words (each word >> 2, i.e. divided by 4, truncating) */
577
movq_m2r(p2[0], mm5); /* mm5:=first 4 bytes of p2 in words */
579
punpcklbw_r2r(mm7, mm5);
582
pcmpgtw_r2r(mm5, mm7); /* mm7 := [i : W0..3,mm4>mm5] */
584
movq_r2r(mm4, mm6); /* mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] */
588
paddw_r2r(mm6, mm0); /* Add to accumulator */
590
movq_r2r(mm5, mm6); /* mm6 := [i : W0..3,mm5>mm4] */
591
pcmpgtw_r2r(mm4, mm6);
592
psubw_r2r(mm4, mm5); /* mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)] */
595
paddw_r2r(mm5, mm0); /* Add to accumulator */
597
/* Second 4 bytes of 8 */
599
movq_r2r(mm2, mm4); /* mm4 := Second 4 bytes p1 in words */
601
punpckhbw_r2r(mm7, mm4);
602
movq_r2r(mm3, mm6); /* mm6 := Second 4 bytes p1+lx in words */
603
punpckhbw_r2r(mm7, mm6);
606
movq_m2r(p1[1], mm5); /* mm5 := 8 bytes at p1+1 (second 4 are unpacked next) */
607
punpckhbw_r2r(mm7, mm5); /* Second 4 bytes p1+1 in words... */
609
movq_m2r(p1[lx+1], mm6); /* mm6 := 8 bytes at p1+lx+1 (second 4 are unpacked next) */
610
punpckhbw_r2r(mm7, mm6); /* Second 4 bytes p1+lx+1 in words */
613
psrlw_i2r(2, mm4); /* mm4 := Second 4 bytes interpolated in words (each word >> 2) */
615
movq_r2r(mm1, mm5); /* mm5:= second 4 bytes of p2 in words */
616
punpckhbw_r2r(mm7, mm5);
619
pcmpgtw_r2r(mm5, mm7); /* mm7 := [i : W0..3,mm4>mm5] */
621
movq_r2r(mm4, mm6); /* mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] */
625
paddw_r2r(mm6, mm0); /* Add to accumulator */
627
movq_r2r(mm5, mm6); /* mm6 := [i : W0..3,mm5>mm4] */
628
pcmpgtw_r2r(mm4, mm6);
629
psubw_r2r(mm4, mm5); /* mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)] */
632
paddw_r2r(mm5, mm0); /* Add to accumulator */
635
/* Second 8 bytes of row */
637
/* First 4 bytes of 8 */
639
movq_m2r(p1[8], mm4); /* mm4 := first 4 bytes p1+8 */
641
movq_r2r(mm4, mm2); /* mm2 records all 8 bytes */
642
punpcklbw_r2r(mm7, mm4); /* First 4 bytes p1+8 in words... */
644
movq_m2r(p1[lx+8], mm6); /* mm6 := first 4 bytes p1+lx+8 */
645
movq_r2r(mm6, mm3); /* mm3 records all 8 bytes */
646
punpcklbw_r2r(mm7, mm6); /* First 4 bytes p1+lx+8 in words */
649
movq_m2r(p1[9], mm5); /* mm5 := first 4 bytes p1+9 */
650
punpcklbw_r2r(mm7, mm5); /* First 4 bytes p1+9 in words... */
652
movq_m2r(p1[lx+9], mm6); /* mm6 := first 4 bytes p1+lx+9 */
653
punpcklbw_r2r(mm7, mm6); /* First 4 bytes p1+lx+9 in words */
656
psrlw_i2r(2, mm4); /* mm4 := First 4 bytes interpolated in words (each word >> 2) */
658
movq_m2r(p2[8], mm5); /* mm5:=first 4 bytes of p2+8 in words */
660
punpcklbw_r2r(mm7, mm5);
663
pcmpgtw_r2r(mm5, mm7); /* mm7 := [i : W0..3,mm4>mm5] */
665
movq_r2r(mm4, mm6); /* mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] */
669
paddw_r2r(mm6, mm0); /* Add to accumulator */
671
movq_r2r(mm5, mm6); /* mm6 := [i : W0..3,mm5>mm4] */
672
pcmpgtw_r2r(mm4, mm6);
673
psubw_r2r(mm4, mm5); /* mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)] */
676
paddw_r2r(mm5, mm0); /* Add to accumulator */
678
/* Second 4 bytes of 8 */
680
movq_r2r(mm2, mm4); /* mm4 := Second 4 bytes p1+8 in words */
682
punpckhbw_r2r(mm7, mm4);
683
movq_r2r(mm3, mm6); /* mm6 := Second 4 bytes p1+lx+8 in words */
684
punpckhbw_r2r(mm7, mm6);
687
movq_m2r(p1[9], mm5); /* mm5 := 8 bytes at p1+9 (second 4 are unpacked next) */
688
punpckhbw_r2r(mm7, mm5); /* Second 4 bytes p1+9 in words... */
690
movq_m2r(p1[lx+9], mm6); /* mm6 := 8 bytes at p1+lx+9 (second 4 are unpacked next) */
691
punpckhbw_r2r(mm7, mm6); /* Second 4 bytes p1+lx+9 in words */
694
psrlw_i2r(2, mm4); /* mm4 := Second 4 bytes interpolated in words (each word >> 2) */
696
movq_r2r(mm1, mm5); /* mm5:= second 4 bytes of p2 in words */
697
punpckhbw_r2r(mm7, mm5);
700
pcmpgtw_r2r(mm5, mm7); /* mm7 := [i : W0..3,mm4>mm5] */
702
movq_r2r(mm4, mm6); /* mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] */
706
paddw_r2r(mm6, mm0); /* Add to accumulator */
708
movq_r2r(mm5, mm6); /* mm6 := [i : W0..3,mm5>mm4] */
709
pcmpgtw_r2r(mm4, mm6);
710
psubw_r2r(mm4, mm5); /* mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)] */
713
paddw_r2r(mm5, mm0); /* Add to accumulator */
715
p1 += lx; /* update pointers to next row */
721
/* Sum the Accumulators */
728
movd_r2g(mm0, rv); /* store return value */
736
int sad_sub22_mmx(uint8_t *blk1,uint8_t *blk2,int lx,int h)
741
* mm0 = distance accumulators (4 words)
755
movq_m2r(blk1[0], mm4); /* load 8 bytes of p1 */
756
movq_m2r(blk2[0], mm5); /* load 8 bytes of p2 */
758
movq_r2r(mm4, mm7); /* mm5 = abs(*p1-*p2) */
759
psubusb_r2r(mm5, mm7); /* mm7 := max(mm4-mm5, 0) per unsigned byte (saturating subtract) */
760
psubusb_r2r(mm4, mm5); /* mm5 := max(mm5-mm4, 0) per unsigned byte (saturating subtract) */
761
blk1 += lx; /* update pointer to next row */
764
/* Add the mm5 bytes to the accumulators */
766
punpcklbw_r2r(mm6, mm7); /* low 4 bytes of mm7 interleaved with mm6's bytes into 4 words (presumably mm6 == 0 -- confirm) */
768
punpckhbw_r2r(mm6, mm5); /* high 4 bytes of mm5 interleaved with mm6's bytes into 4 words */
769
blk2 += lx; /* update pointer to next row */
772
movq_m2r(blk1[0], mm4); /* load 8 bytes of p1 (next row) */
773
movq_m2r(blk2[0], mm5); /* load 8 bytes of p2 (next row) */
775
movq_r2r(mm4, mm7); /* mm5 = abs(*p1-*p2) */
776
psubusb_r2r(mm5, mm7); /* mm7 := max(mm4-mm5, 0) per unsigned byte (saturating subtract) */
777
psubusb_r2r(mm4, mm5); /* mm5 := max(mm5-mm4, 0) per unsigned byte (saturating subtract) */
778
blk1 += lx; /* update pointer to next row */
781
/* Add the mm5 bytes to the accumulators */
783
punpcklbw_r2r(mm6, mm7); /* low 4 bytes of mm7 interleaved with mm6's bytes into 4 words */
784
blk2 += lx; /* update pointer to next row */
786
punpckhbw_r2r(mm6, mm5); /* high 4 bytes of mm5 interleaved with mm6's bytes into 4 words */
791
/* Sum the Accumulators */
802
movd_r2g(mm0, rv); /* store return value */
811
int sad_sub44_mmx(uint8_t *blk1, uint8_t *blk2, int qlx, int qh)
816
* mm0 = distance accumulator left block p1
817
* mm1 = distance accumulator right block p1
819
* mm3 = right block of p1
820
* mm4 = left block of p1
833
* Beware loop obfuscated by interleaving to try to
837
movq_m2r(blk1[0], mm4); /* mm4 = first 4 bytes of p1 in words */
838
movq_m2r(blk2[0], mm5); /* mm5 = 4 bytes of p2 in words */
840
punpcklbw_r2r(mm2, mm4);
841
punpcklbw_r2r(mm2, mm5);
845
psubusw_r2r(mm5, mm7);
846
psubusw_r2r(mm4, mm6);
848
blk1 += qlx; /* update a pointer to next row */
851
paddw_r2r(mm7, mm0); /* Add absolute differences to left block accumulators */
859
/* Sum the accumulators */