/*
 * NOTE(review): this chunk is a corrupted extract, apparently the interior
 * of an SSE2 "swap A and B buffers" compositing routine (GIMP-style: it
 * copies the caller's GimpCompositeContext and exchanges op.A with op.B,
 * 16 bytes per xmm register, using unaligned movdqu).  Corruption visible
 * in this view, to be repaired from the upstream file before building:
 *   - bare integers ("603", "602", ...) interleaved between code lines;
 *     they look like line numbers from a diff or paste, not C tokens;
 *   - the enclosing function's signature and closing brace lie outside
 *     this view; every `asm volatile (...)` is missing its closing `);`
 *     and the `for` loops are missing their braces;
 *   - two conflicting revisions of one explanatory comment survive with
 *     their comment delimiters stripped (one says 64 bytes / 16 rgba8
 *     pixels / 4 xmm registers, the other 128 bytes / 32 pixels / 8
 *     registers).  The loop below steps n_pixels by 16, i.e. 64 bytes
 *     per operand (A and B each), 128 bytes moved per iteration total;
 *   - the variants below were presumably separated by #if/#else lines
 *     that are not visible here -- TODO confirm against upstream.
 * Comments added below state only what the visible code shows.
 */
603
602
/* Work on a local copy of the caller's context; only the copy's
 * n_pixels counter is decremented by the loops below. */
GimpCompositeContext op = *_op;
606
 * Inhale one whole i686 cache line at once. 64 bytes, 16 rgba8
607
 * pixels, 4 128 bit xmm registers.
605
 * Inhale one whole i686 cache line at once. 128 bytes == 32 rgba8
606
 * pixels == 8 128 bit xmm registers.
609
608
/* Main loop: 16 rgba8 pixels (64 bytes of A and 64 bytes of B)
 * per iteration. */
for (; op.n_pixels >= 16; op.n_pixels -= 16)
612
/* Variant 1: single asm statement.  Loads pair A[i]/B[i] into
 * xmm0..xmm7 (%0=A[0], %1=B[0], %2=A[1], %3=B[1], ...), then stores
 * each register back to the OTHER operand's slot (xmm0 -> %1,
 * xmm1 -> %0, ...) -- an in-register A<->B swap.  All eight memory
 * operands are read-write ("+m").
 * NOTE(review): the register-name list after the second ':' sits
 * where inputs would normally go; presumably it is the clobber list
 * and an empty input section's ':' was lost in extraction -- confirm
 * against upstream. */
asm volatile (" movdqu %0,%%xmm0\n"
613
"\tmovdqu %1,%%xmm1\n"
614
"\tmovdqu %2,%%xmm2\n"
615
"\tmovdqu %3,%%xmm3\n"
616
"\tmovdqu %4,%%xmm4\n"
617
"\tmovdqu %5,%%xmm5\n"
618
"\tmovdqu %6,%%xmm6\n"
619
"\tmovdqu %7,%%xmm7\n"
621
/* Crosswise stores: each xmm goes back to the opposite buffer. */
"\tmovdqu %%xmm0,%1\n"
622
"\tmovdqu %%xmm1,%0\n"
623
"\tmovdqu %%xmm2,%3\n"
624
"\tmovdqu %%xmm3,%2\n"
625
"\tmovdqu %%xmm4,%5\n"
626
"\tmovdqu %%xmm5,%4\n"
627
"\tmovdqu %%xmm6,%7\n"
628
"\tmovdqu %%xmm7,%6\n"
629
: "+m" (op.A[0]), "+m" (op.B[0]),
630
"+m" (op.A[1]), "+m" (op.B[1]),
631
"+m" (op.A[2]), "+m" (op.B[2]),
632
"+m" (op.A[3]), "+m" (op.B[3])
634
: "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
637
/* Variant 2: the same swap split across FOUR asm statements
 * (two load halves, two store halves).
 * NOTE(review): splitting like this relies on the compiler not
 * touching xmm0..xmm7 between the statements; nothing in the
 * constraints carries the register values across -- the clobber
 * lists only say the registers are overwritten.  Fragile at best;
 * verify intent against upstream (likely a debug/#if variant). */
asm volatile (" movdqu %0,%%xmm0\n"
638
"\tmovdqu %1,%%xmm1\n"
639
"\tmovdqu %2,%%xmm2\n"
640
"\tmovdqu %3,%%xmm3\n"
641
: "+m" (op.A[0]), "+m" (op.B[0]),
642
"+m" (op.A[1]), "+m" (op.B[1])
644
: "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
647
/* NOTE(review): this statement's template references %4..%7 but only
 * four operands (%0..%3) are declared below -- either garbled here or
 * an upstream bug; confirm before reuse. */
asm volatile ("\tmovdqu %4,%%xmm4\n"
648
"\tmovdqu %5,%%xmm5\n"
649
"\tmovdqu %6,%%xmm6\n"
650
"\tmovdqu %7,%%xmm7\n"
651
: "+m" (op.A[2]), "+m" (op.B[2]),
652
"+m" (op.A[3]), "+m" (op.B[3])
654
: "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
657
/* Store halves of variant 2: crosswise writes as in variant 1. */
asm volatile ("\tmovdqu %%xmm0,%1\n"
658
"\tmovdqu %%xmm1,%0\n"
659
"\tmovdqu %%xmm2,%3\n"
660
"\tmovdqu %%xmm3,%2\n"
661
: "+m" (op.A[0]), "+m" (op.B[0]),
662
"+m" (op.A[1]), "+m" (op.B[1])
664
: "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
667
/* NOTE(review): same %4..%7-vs-four-operands mismatch as above. */
asm volatile ("\tmovdqu %%xmm4,%5\n"
668
"\tmovdqu %%xmm5,%4\n"
669
"\tmovdqu %%xmm6,%7\n"
670
"\tmovdqu %%xmm7,%6\n"
671
: "+m" (op.A[2]), "+m" (op.B[2]),
672
"+m" (op.A[3]), "+m" (op.B[3])
674
: "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
610
/* Variant 3: one movdqu per asm statement.  Eight loads fill
 * xmm0..xmm7 from A[0..3]/B[0..3]...
 * NOTE(review): same cross-statement register-liveness concern as
 * variant 2, and here the loads don't even list clobber protection
 * for the other registers -- confirm against upstream. */
asm volatile (" movdqu %0,%%xmm0\n" : :"m" (op.A[0]) : "%xmm0");
611
asm volatile (" movdqu %0,%%xmm1\n" : :"m" (op.B[0]) : "%xmm1");
612
asm volatile (" movdqu %0,%%xmm2\n" : :"m" (op.A[1]) : "%xmm2");
613
asm volatile (" movdqu %0,%%xmm3\n" : :"m" (op.B[1]) : "%xmm3");
614
asm volatile (" movdqu %0,%%xmm4\n" : :"m" (op.A[2]) : "%xmm4");
615
asm volatile (" movdqu %0,%%xmm5\n" : :"m" (op.B[2]) : "%xmm5");
616
asm volatile (" movdqu %0,%%xmm6\n" : :"m" (op.A[3]) : "%xmm6");
617
asm volatile (" movdqu %0,%%xmm7\n" : :"m" (op.B[3]) : "%xmm7");
619
/* ...then eight swapped stores: the register loaded from A[i] is
 * written to A[i] here and vice versa?  No -- note the targets:
 * xmm0 (loaded from A[0]) is stored to A[0] below, which would NOT
 * swap.  TODO(review): upstream intent vs. this copy differs from
 * variant 1's crosswise stores; verify which revision is canonical. */
asm volatile ("\tmovdqu %%xmm0,%0\n" : "=m" (op.A[0]));
620
asm volatile ("\tmovdqu %%xmm1,%0\n" : "=m" (op.B[0]));
621
asm volatile ("\tmovdqu %%xmm2,%0\n" : "=m" (op.A[1]));
622
asm volatile ("\tmovdqu %%xmm3,%0\n" : "=m" (op.B[1]));
623
asm volatile ("\tmovdqu %%xmm4,%0\n" : "=m" (op.A[2]));
624
asm volatile ("\tmovdqu %%xmm5,%0\n" : "=m" (op.B[2]));
625
asm volatile ("\tmovdqu %%xmm6,%0\n" : "=m" (op.A[3]));
626
asm volatile ("\tmovdqu %%xmm7,%0\n" : "=m" (op.B[3]));
682
631
/* Tail loop: handle the remaining pixels 4 at a time (one xmm-load
 * worth per operand).  Its body continues past this view and is cut
 * off mid-statement below. */
for (; op.n_pixels >= 4; op.n_pixels -= 4)
684
633
asm volatile (" movdqu %0,%%xmm2\n"