2
; MMX32 iDCT algorithm (IEEE-1180 compliant) :: idct_mmx32()
6
; v0.16B33 initial release
8
; This was one of the harder pieces of work to code.
9
; Intel's app-note focuses on the numerical issues of the algorithm, but
10
; assumes the programmer is familiar with IDCT mathematics, leaving the
11
; form of the complete function up to the programmer's imagination.
15
; I played around with the code for quite a few hours. I came up
16
; with *A* working IDCT algorithm, however I'm not sure whether my routine
17
; is "the correct one." But rest assured, my code passes all six IEEE
18
; accuracy tests with plenty of margin.
20
; My IDCT algorithm consists of 4 steps:
22
; 1) IDCT-row transformation (using the IDCT-row function) on all 8 rows
23
; This yields an intermediate 8x8 matrix.
25
; 2) intermediate matrix transpose (mandatory)
27
; 3) IDCT-row transformation (2nd time) on all 8 rows of the intermediate
28
; matrix. The output is the final-result, in transposed form.
30
; 4) post-transformation matrix transpose
31
; (not necessary if the input-data is already transposed, this could
32
; be done during the MPEG "zig-zag" scan, but since my algorithm
33
; requires at least one transpose operation, why not re-use it?)
36
; Although the (1st) and (3rd) steps use the SAME row-transform operation,
37
; the (3rd) step uses different shift&round constants (explained later.)
39
; Also note that the intermediate transpose (2) would not be necessary,
40
; if the subsequent operation were an iDCT-column transformation. Since
41
; we only have the iDCT-row transform, we transpose the intermediate
42
; matrix and use the iDCT-row transform a 2nd time.
44
; I had to change some constants/variables for my method to work :
46
; As given by Intel, the #defines for SHIFT_INV_COL and RND_INV_COL are
47
; wrong. Not surprising since I'm not using a true column-transform
48
; operation, but the row-transform operation (as mentioned earlier.)
49
; round_inv_col[], which is given as "4 short" values, should have the
50
; same dimensions as round_inv_row[]. The corrected variables are used by this code.
53
; Intel's code defines a different table for each row operation.
54
; The tables given are 0/4, 1/7, 2/6, and 5/3. My code only uses row#0.
55
; Using the other rows messes up the overall transform.
57
; IMPLEMENTATION DETAILS
58
; ----------------------
60
; I divided the algorithm's work into two subroutines,
61
; 1) idct_mmx32_rows() - transforms 8 rows, then transpose
62
; 2) idct_mmx32_cols() - transforms 8 rows, then transpose
63
; yields final result ("drop-in" direct replacement for INT32 IDCT)
65
; The 2nd function is a clone of the 1st, with changes made only to the
66
; shift&rounding instructions.
68
; In the 1st function (rows), the shift & round instructions use
69
; SHIFT_INV_ROW & round_inv_row[] (renamed to r_inv_row[])
71
; In the 2nd function (cols), the shift & round instructions use
72
; SHIFT_INV_COL & round_inv_col[] (renamed to r_inv_col[])
74
; Each function contains an integrated transpose-operator, which comes
75
; AFTER the primary transformation operation. In the future, I'll optimize
76
; the code to do more of the transpose-work "in-place". Right now, I've
77
; left the code as two subroutines and a main calling function, so other
78
; people can read the code more easily.
80
; liaor@umcc.ais.org http://members.tripod.com/~liaor
84
;;; A.Stevens Jul 2000 easy-peasy quick port to nasm
85
;;; Isn't open source a sensible idea...
88
;=============================================================================
90
; AP-922 http://developer.intel.com/vtune/cbts/strmsimd
91
; These examples contain code fragments for first stage iDCT 8x8
92
; (for rows) and first stage DCT 8x8 (for columns)
94
;============================================================================
96
; Register roles used by the transform code in this file.
%define INP eax ; pointer to the 8x8 block of shorts (short *blk), loaded from [ebp+8]
97
%define OUT ecx ; pointer to output (temporary store space qwTemp[]); NOTE(review): its initialisation is not visible in this part of the file
98
%define TABLE ebx ; pointer to idct_tab_01234567[]
99
; round_inv_row and round_inv_col are two names for the same register (edx).
; It is loaded with the address of idct_r_inv_row before the first pass and
; with the address of idct_r_inv_col before the second pass.
%define round_inv_row edx
100
%define round_inv_col edx
103
%define ROW_STRIDE 16 ; byte offset between consecutive rows (8 shorts), used by the 8x8 matrix transposer
104
%define BITS_INV_ACC 4 ; 4 or 5 for IEEE
105
; Right-shift applied (psrad) to the results of the first pass:
; 16 - 4 = 12 with the BITS_INV_ACC value above.
%define SHIFT_INV_ROW (16 - BITS_INV_ACC)
106
; Right-shift applied (psrad) to the results of the second pass:
; 1 + 4 + 14 = 19 with the BITS_INV_ACC value above.
%define SHIFT_INV_COL (1 + BITS_INV_ACC +14 ) ; changed from Intel's value
109
;; Variables and tables defined in C for convenience
111
extern idct_r_inv_row ; 2 DWORDs: rounder added (paddd) before the first-pass shift
112
extern idct_r_inv_col ; 2 DWORDs: rounder added (paddd) before the second-pass shift
113
extern idct_r_inv_corr ; 2 DWORDs; not referenced by the code visible in this part of the file
114
extern idct_tab_01234567 ; Concatenated table of coefficients
117
;; private variables and functions
122
qwTemp: resw 64 ; temporary storage space, 8x8 of shorts (64 words)
127
;; static void idct_mmx( short *blk )
131
push ebp ; save frame pointer
140
;; transform all 8 rows of 8x8 iDCT block
143
; this subroutine performs two operations
144
; 1) iDCT row transform
145
; for( i = 0; i < 8; ++ i)
146
; DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] );
148
; 2) transpose the matrix (which was stored in qwTemp[])
149
; qwTemp[] -> [8x8 matrix transpose] -> blk[]
151
mov INP, [ebp+8] ; INP = blk
152
mov edi, 0x00; ; x = 0
154
lea TABLE,[idct_tab_01234567]; ; row 0
158
lea round_inv_row, [idct_r_inv_row]
161
; for ( x = 0; x < 8; ++x ) ; transform one row per iteration
164
movq mm0, [INP] ; 0 ; x3 x2 x1 x0
166
movq mm1, [INP+8] ; 1 ; x7 x6 x5 x4
167
movq mm2, mm0 ; ; 2 ; x3 x2 x1 x0
169
movq mm3, [TABLE] ; 3 ; w06 w04 w02 w00
170
punpcklwd mm0, mm1 ; x5 x1 x4 x0
173
movq mm5, mm0 ; ; 5 ; x5 x1 x4 x0
174
punpckldq mm0, mm0 ; ; x4 x0 x4 x0
176
movq mm4, [TABLE+8] ; ; 4 ; w07 w05 w03 w01
177
punpckhwd mm2, mm1 ; ; 1 ; x7 x3 x6 x2
179
pmaddwd mm3, mm0 ; ; x4*w06+x0*w04 x4*w02+x0*w00
180
movq mm6, mm2 ; ; 6 ; x7 x3 x6 x2
182
movq mm1, [TABLE+32] ;; 1 ; w22 w20 w18 w16
183
punpckldq mm2, mm2 ; ; x6 x2 x6 x2
185
pmaddwd mm4, mm2 ; ; x6*w07+x2*w05 x6*w03+x2*w01
186
punpckhdq mm5, mm5 ; ; x5 x1 x5 x1
188
pmaddwd mm0, [TABLE+16] ;; x4*w14+x0*w12 x4*w10+x0*w08
189
punpckhdq mm6, mm6 ; ; x7 x3 x7 x3
191
movq mm7, [TABLE+40] ;; 7 ; w23 w21 w19 w17
192
pmaddwd mm1, mm5 ; ; x5*w22+x1*w20 x5*w18+x1*w16
194
paddd mm3, [round_inv_row];; +rounder
195
pmaddwd mm7, mm6 ; ; x7*w23+x3*w21 x7*w19+x3*w17
197
pmaddwd mm2, [TABLE+24] ;; x6*w15+x2*w13 x6*w11+x2*w09
198
paddd mm3, mm4 ; ; 4 ; a1=sum(even1) a0=sum(even0)
200
pmaddwd mm5, [TABLE+48] ;; x5*w30+x1*w28 x5*w26+x1*w24
201
movq mm4, mm3 ; ; 4 ; a1 a0
203
pmaddwd mm6, [TABLE+56] ;; x7*w31+x3*w29 x7*w27+x3*w25
204
paddd mm1, mm7 ; ; 7 ; b1=sum(odd1) b0=sum(odd0)
206
paddd mm0, [round_inv_row];; +rounder
207
psubd mm3, mm1 ; ; a1-b1 a0-b0
209
psrad mm3, SHIFT_INV_ROW ; ; y6=a1-b1 y7=a0-b0
210
paddd mm1, mm4 ; ; 4 ; a1+b1 a0+b0
212
paddd mm0, mm2 ; ; 2 ; a3=sum(even3) a2=sum(even2)
213
psrad mm1, SHIFT_INV_ROW ; ; y1=a1+b1 y0=a0+b0
215
paddd mm5, mm6 ; ; 6 ; b3=sum(odd3) b2=sum(odd2)
216
movq mm4, mm0 ; ; 4 ; a3 a2
218
paddd mm0, mm5 ; ; a3+b3 a2+b2
219
psubd mm4, mm5 ; ; 5 ; a3-b3 a2-b2
221
add INP, 16; ; increment INPUT pointer -> row 1
222
psrad mm4, SHIFT_INV_ROW ; ; y4=a3-b3 y5=a2-b2
224
; add TABLE, 0; ; TABLE += 64 -> row 1
225
psrad mm0, SHIFT_INV_ROW ; ; y3=a3+b3 y2=a2+b2
227
; movq mm2, [INP] ; ; row+1; 0; x3 x2 x1 x0
228
packssdw mm4, mm3 ; ; 3 ; y6 y7 y4 y5
230
packssdw mm1, mm0 ; ; 0 ; y3 y2 y1 y0
231
movq mm7, mm4 ; ; 7 ; y6 y7 y4 y5
233
; movq mm0, mm2 ; ; row+1; 2 ; x3 x2 x1 x0
234
psrld mm4, 16 ; ; 0 y6 0 y4
236
movq [OUT], mm1 ; ; 1 ; save y3 y2 y1 y0
237
pslld mm7, 16 ; ; y7 0 y5 0
239
; movq mm1, [INP+8] ; ; row+1; 1 ; x7 x6 x5 x4
240
por mm7, mm4 ; ; 4 ; y7 y6 y5 y4
242
movq mm3, [TABLE] ; ; 3 ; w06 w04 w02 w00
243
; punpcklwd mm0, mm1 ; ; row+1; x5 x1 x4 x0
245
; begin processing row 1
246
movq [OUT+8], mm7 ; ; 7 ; save y7 y6 y5 y4
249
add OUT, 16; ; increment OUTPUT pointer -> row 1
251
jl near lpa; ; end for ( x = 0; x < 8; ++x )
253
; done with the iDCT row-transformation
255
; now we have to transpose the output 8x8 matrix
256
; 8x8 (OUT) -> 8x8't' (IN)
257
; the transposition is implemented as 4 sub-operations.
258
; 1) transpose upper-left quad
259
; 2) transpose lower-right quad
260
; 3) transpose lower-left quad
261
; 4) transpose upper-right quad
264
; mm0 = 1st row [ A B C D ] row1
265
; mm1 = 2nd row [ E F G H ] 2
266
; mm2 = 3rd row [ I J K L ] 3
267
; mm3 = 4th row [ M N O P ] 4
269
; 1) transpose upper-left quad
272
movq mm0, [OUT + ROW_STRIDE * 0 ]
274
movq mm1, [OUT + ROW_STRIDE * 1 ]
275
movq mm4, mm0; ; mm4 = copy of row1[A B C D]
277
movq mm2, [OUT + ROW_STRIDE * 2 ]
278
punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
280
movq mm3, [OUT + ROW_STRIDE * 3]
281
punpckhwd mm4, mm1; ; mm4 = [ 2 6 3 7]
284
punpcklwd mm2, mm3; ; mm2 = [ 8 12 9 13]
286
punpckhwd mm6, mm3; ; mm6 = 10 14 11 15]
287
movq mm1, mm0; ; mm1 = [ 0 4 1 5]
289
mov INP, [ebp+8]; ; load input address
290
punpckldq mm0, mm2; ; final result mm0 = row1 [0 4 8 12]
292
movq mm3, mm4; ; mm3 = [ 2 6 3 7]
293
punpckhdq mm1, mm2; ; mm1 = final result mm1 = row2 [1 5 9 13]
295
movq [ INP + ROW_STRIDE * 0 ], mm0; ; store row 1
296
punpckldq mm4, mm6; ; final result mm4 = row3 [2 6 10 14]
298
; begin reading next quadrant (lower-right)
299
movq mm0, [OUT + ROW_STRIDE*4 + 8];
300
punpckhdq mm3, mm6; ; final result mm3 = row4 [3 7 11 15]
302
movq [ INP +ROW_STRIDE * 2], mm4; ; store row 3
303
movq mm4, mm0; ; mm4 = copy of row1[A B C D]
305
movq [ INP +ROW_STRIDE * 1], mm1; ; store row 2
307
movq mm1, [OUT + ROW_STRIDE*5 + 8]
309
movq [ INP +ROW_STRIDE * 3], mm3; ; store row 4
310
punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
312
; 2) transpose lower-right quadrant
314
; movq mm0, [OUT + ROW_STRIDE*4 + 8]
316
; movq mm1, [OUT + ROW_STRIDE*5 + 8]
317
; movq mm4, mm0; ; mm4 = copy of row1[A B C D]
319
movq mm2, [OUT + ROW_STRIDE*6 + 8]
320
; punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
321
punpckhwd mm4, mm1; ; mm4 = [ 2 6 3 7]
323
movq mm3, [OUT + ROW_STRIDE*7 + 8]
326
punpcklwd mm2, mm3; ; mm2 = [ 8 12 9 13]
327
movq mm1, mm0; ; mm1 = [ 0 4 1 5]
329
punpckhwd mm6, mm3; ; mm6 = 10 14 11 15]
330
movq mm3, mm4; ; mm3 = [ 2 6 3 7]
332
punpckldq mm0, mm2; ; final result mm0 = row1 [0 4 8 12]
334
punpckhdq mm1, mm2; ; mm1 = final result mm1 = row2 [1 5 9 13]
337
movq [ INP + ROW_STRIDE*4 + 8], mm0; ; store row 1
338
punpckldq mm4, mm6; ; final result mm4 = row3 [2 6 10 14]
340
movq mm0, [OUT + ROW_STRIDE * 4 ]
341
punpckhdq mm3, mm6; ; final result mm3 = row4 [3 7 11 15]
342
movq [ INP +ROW_STRIDE*6 + 8], mm4; ; store row 3
343
movq mm4, mm0; ; mm4 = copy of row1[A B C D]
344
movq [ INP +ROW_STRIDE*5 + 8], mm1; ; store row 2
346
movq mm1, [OUT + ROW_STRIDE * 5 ]
349
movq [ INP +ROW_STRIDE*7 + 8], mm3; ; store row 4
350
punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
352
; 3) transpose lower-left
353
; movq mm0, [OUT + ROW_STRIDE * 4 ]
355
; movq mm1, [OUT + ROW_STRIDE * 5 ]
356
; movq mm4, mm0; ; mm4 = copy of row1[A B C D]
358
movq mm2, [OUT + ROW_STRIDE * 6 ]
359
; punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
360
punpckhwd mm4, mm1; ; mm4 = [ 2 6 3 7]
362
movq mm3, [OUT + ROW_STRIDE * 7 ]
365
punpcklwd mm2, mm3; ; mm2 = [ 8 12 9 13]
366
movq mm1, mm0; ; mm1 = [ 0 4 1 5]
368
punpckhwd mm6, mm3; ; mm6 = 10 14 11 15]
369
movq mm3, mm4; ; mm3 = [ 2 6 3 7]
371
punpckldq mm0, mm2; ; final result mm0 = row1 [0 4 8 12]
373
punpckhdq mm1, mm2; ; mm1 = final result mm1 = row2 [1 5 9 13]
376
movq [ INP + ROW_STRIDE * 0 + 8 ], mm0; ; store row 1
377
punpckldq mm4, mm6; ; final result mm4 = row3 [2 6 10 14]
379
; begin reading next quadrant (upper-right)
380
movq mm0, [OUT + ROW_STRIDE*0 + 8];
381
punpckhdq mm3, mm6; ; final result mm3 = row4 [3 7 11 15]
383
movq [ INP +ROW_STRIDE * 2 + 8], mm4; ; store row 3
384
movq mm4, mm0; ; mm4 = copy of row1[A B C D]
386
movq [ INP +ROW_STRIDE * 1 + 8 ], mm1; ; store row 2
387
movq mm1, [OUT + ROW_STRIDE*1 + 8]
389
movq [ INP +ROW_STRIDE * 3 + 8], mm3; ; store row 4
390
punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
393
; 4) transpose upper-right quadrant
395
; movq mm0, [OUT + ROW_STRIDE*4 + 8]
397
; movq mm1, [OUT + ROW_STRIDE*5 + 8]
398
; movq mm4, mm0; ; mm4 = copy of row1[A B C D]
400
movq mm2, [OUT + ROW_STRIDE*2 + 8]
401
; punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
402
punpckhwd mm4, mm1; ; mm4 = [ 2 6 3 7]
404
movq mm3, [OUT + ROW_STRIDE*3 + 8]
407
punpcklwd mm2, mm3; ; mm2 = [ 8 12 9 13]
408
movq mm1, mm0; ; mm1 = [ 0 4 1 5]
410
punpckhwd mm6, mm3; ; mm6 = 10 14 11 15]
411
movq mm3, mm4; ; mm3 = [ 2 6 3 7]
413
punpckldq mm0, mm2; ; final result mm0 = row1 [0 4 8 12]
415
punpckhdq mm1, mm2; ; mm1 = final result mm1 = row2 [1 5 9 13]
418
movq [ INP + ROW_STRIDE*4 ], mm0; ; store row 1
419
punpckldq mm4, mm6; ; final result mm4 = row3 [2 6 10 14]
421
movq [ INP +ROW_STRIDE*5 ], mm1; ; store row 2
422
punpckhdq mm3, mm6; ; final result mm3 = row4 [3 7 11 15]
424
movq [ INP +ROW_STRIDE*6 ], mm4; ; store row 3
427
movq [ INP +ROW_STRIDE*7 ], mm3; ; store row 4
429
; Conceptually this is the column transform.
430
; Actually, the matrix is transformed
431
; row by row. This function is identical to idct_mmx32_rows(),
432
; except for the SHIFT amount and ROUND_INV amount.
434
; this subroutine performs two operations
435
; 1) iDCT row transform
436
; for( i = 0; i < 8; ++ i)
437
; DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] );
439
; 2) transpose the matrix (which was stored in qwTemp[])
440
; qwTemp[] -> [8x8 matrix transpose] -> blk[]
443
mov INP, [ebp+8]; ; ; row 0
444
mov edi, 0x00; ; x = 0
446
lea TABLE, [idct_tab_01234567]; ; row 0
448
; mov OUT, INP; ; algorithm writes data in-place -> row 0
450
lea round_inv_col, [idct_r_inv_col]
451
jmp acc_idct_colloop1
453
; for ( x = 0; x < 8; ++x ) ; transform one row per iteration
457
movq mm0, [INP] ; ; 0 ; x3 x2 x1 x0
459
movq mm1, [INP+8] ; ; 1 ; x7 x6 x5 x4
460
movq mm2, mm0 ; ; 2 ; x3 x2 x1 x0
462
movq mm3, [TABLE] ; ; 3 ; w06 w04 w02 w00
463
punpcklwd mm0, mm1 ; ; x5 x1 x4 x0
466
movq mm5, mm0 ; ; 5 ; x5 x1 x4 x0
467
punpckldq mm0, mm0 ; ; x4 x0 x4 x0
469
movq mm4, [TABLE+8] ; ; 4 ; w07 w05 w03 w01
470
punpckhwd mm2, mm1 ; ; 1 ; x7 x3 x6 x2
472
pmaddwd mm3, mm0 ; ; x4*w06+x0*w04 x4*w02+x0*w00
473
movq mm6, mm2 ; ; 6 ; x7 x3 x6 x2
475
movq mm1, [TABLE+32] ;; 1 ; w22 w20 w18 w16
476
punpckldq mm2, mm2 ; ; x6 x2 x6 x2
478
pmaddwd mm4, mm2 ; ; x6*w07+x2*w05 x6*w03+x2*w01
479
punpckhdq mm5, mm5 ; ; x5 x1 x5 x1
481
pmaddwd mm0, [TABLE+16] ;; x4*w14+x0*w12 x4*w10+x0*w08
482
punpckhdq mm6, mm6 ; ; x7 x3 x7 x3
484
movq mm7, [TABLE+40] ;; 7 ; w23 w21 w19 w17
485
pmaddwd mm1, mm5 ; ; x5*w22+x1*w20 x5*w18+x1*w16
487
paddd mm3, [round_inv_col] ;; +rounder
488
pmaddwd mm7, mm6 ; ; x7*w23+x3*w21 x7*w19+x3*w17
490
pmaddwd mm2, [TABLE+24] ;; x6*w15+x2*w13 x6*w11+x2*w09
491
paddd mm3, mm4 ; ; 4 ; a1=sum(even1) a0=sum(even0)
493
pmaddwd mm5, [TABLE+48] ;; x5*w30+x1*w28 x5*w26+x1*w24
494
movq mm4, mm3 ; ; 4 ; a1 a0
496
pmaddwd mm6, [TABLE+56] ;; x7*w31+x3*w29 x7*w27+x3*w25
497
paddd mm1, mm7 ; ; 7 ; b1=sum(odd1) b0=sum(odd0)
499
paddd mm0, [round_inv_col] ;; +rounder
500
psubd mm3, mm1 ; ; a1-b1 a0-b0
502
psrad mm3, SHIFT_INV_COL; ; y6=a1-b1 y7=a0-b0
503
paddd mm1, mm4 ; ; 4 ; a1+b1 a0+b0
505
paddd mm0, mm2 ; ; 2 ; a3=sum(even3) a2=sum(even2)
506
psrad mm1, SHIFT_INV_COL; ; y1=a1+b1 y0=a0+b0
508
paddd mm5, mm6 ; ; 6 ; b3=sum(odd3) b2=sum(odd2)
509
movq mm4, mm0 ; ; 4 ; a3 a2
511
paddd mm0, mm5 ; ; a3+b3 a2+b2
512
psubd mm4, mm5 ; ; 5 ; a3-b3 a2-b2
514
add INP, 16; ; increment INPUT pointer -> row 1
515
psrad mm4, SHIFT_INV_COL; ; y4=a3-b3 y5=a2-b2
517
add TABLE, 0; ; TABLE += 64 -> row 1
518
psrad mm0, SHIFT_INV_COL; ; y3=a3+b3 y2=a2+b2
520
; movq mm2, [INP] ; ; row+1; 0; x3 x2 x1 x0
521
packssdw mm4, mm3 ; ; 3 ; y6 y7 y4 y5
523
packssdw mm1, mm0 ; ; 0 ; y3 y2 y1 y0
524
movq mm7, mm4 ; ; 7 ; y6 y7 y4 y5
526
; movq mm0, mm2 ; ; row+1; 2 ; x3 x2 x1 x0
527
; por mm1, dct_one_corr ; ; correction y2 +0.5
528
psrld mm4, 16 ; ; 0 y6 0 y4
530
movq [OUT], mm1 ; ; 1 ; save y3 y2 y1 y0
531
pslld mm7, 16 ; ; y7 0 y5 0
533
; movq mm1, [INP+8] ; ; row+1; 1 ; x7 x6 x5 x4
534
; por mm7, dct_one_corr ; ; correction y2 +0.5
535
por mm7, mm4 ; ; 4 ; y7 y6 y5 y4
537
; movq mm3, [TABLE] ; ; 3 ; w06 w04 w02 w00
538
; punpcklwd mm0, mm1 ; ; row+1; x5 x1 x4 x0
540
; begin processing row 1
541
movq [OUT+8], mm7 ; ; 7 ; save y7 y6 y5 y4
545
cmp edi, 0x08; ; compare x <> 8
547
jl near acc_idct_colloop1; ; end for ( x = 0; x < 8; ++x )
549
; done with the iDCT column-transformation
551
; now we have to transpose the output 8x8 matrix
552
; 8x8 (OUT) -> 8x8't' (IN)
554
; the transposition is implemented as 4 sub-operations.
555
; 1) transpose upper-left quad
556
; 2) transpose lower-right quad
557
; 3) transpose lower-left quad
558
; 4) transpose upper-right quad
562
; mm0 = 1st row [ A B C D ] row1
563
; mm1 = 2nd row [ E F G H ] 2
564
; mm2 = 3rd row [ I J K L ] 3
565
; mm3 = 4th row [ M N O P ] 4
567
; 1) transpose upper-left quad
570
movq mm0, [OUT + ROW_STRIDE * 0 ]
572
movq mm1, [OUT + ROW_STRIDE * 1 ]
573
movq mm4, mm0; ; mm4 = copy of row1[A B C D]
575
movq mm2, [OUT + ROW_STRIDE * 2 ]
576
punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
578
movq mm3, [OUT + ROW_STRIDE * 3]
579
punpckhwd mm4, mm1 ; mm4 = [ 2 6 3 7]
582
punpcklwd mm2, mm3 ; mm2 = [ 8 12 9 13]
584
punpckhwd mm6, mm3 ; mm6 = 10 14 11 15]
585
movq mm1, mm0 ; mm1 = [ 0 4 1 5]
587
mov INP, [ebp+8] ; load input address
588
punpckldq mm0, mm2 ; final result mm0 = row1 [0 4 8 12]
590
movq mm3, mm4; ; mm3 = [ 2 6 3 7]
591
punpckhdq mm1, mm2; ; mm1 = final result mm1 = row2 [1 5 9 13]
593
movq [ INP + ROW_STRIDE * 0 ], mm0; ; store row 1
594
punpckldq mm4, mm6; ; final result mm4 = row3 [2 6 10 14]
596
; begin reading next quadrant (lower-right)
597
movq mm0, [OUT + ROW_STRIDE*4 + 8];
598
punpckhdq mm3, mm6; ; final result mm3 = row4 [3 7 11 15]
600
movq [ INP +ROW_STRIDE * 2], mm4; ; store row 3
601
movq mm4, mm0; ; mm4 = copy of row1[A B C D]
603
movq [ INP +ROW_STRIDE * 1], mm1; ; store row 2
605
movq mm1, [OUT + ROW_STRIDE*5 + 8]
607
movq [ INP +ROW_STRIDE * 3], mm3; ; store row 4
608
punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
610
; 2) transpose lower-right quadrant
612
; movq mm0, [OUT + ROW_STRIDE*4 + 8]
614
; movq mm1, [OUT + ROW_STRIDE*5 + 8]
615
; movq mm4, mm0; ; mm4 = copy of row1[A B C D]
617
movq mm2, [OUT + ROW_STRIDE*6 + 8]
618
; punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
619
punpckhwd mm4, mm1; ; mm4 = [ 2 6 3 7]
621
movq mm3, [OUT + ROW_STRIDE*7 + 8]
624
punpcklwd mm2, mm3; ; mm2 = [ 8 12 9 13]
625
movq mm1, mm0; ; mm1 = [ 0 4 1 5]
627
punpckhwd mm6, mm3; ; mm6 = 10 14 11 15]
628
movq mm3, mm4; ; mm3 = [ 2 6 3 7]
630
punpckldq mm0, mm2; ; final result mm0 = row1 [0 4 8 12]
632
punpckhdq mm1, mm2; ; mm1 = final result mm1 = row2 [1 5 9 13]
635
movq [ INP + ROW_STRIDE*4 + 8], mm0; ; store row 1
636
punpckldq mm4, mm6; ; final result mm4 = row3 [2 6 10 14]
638
movq mm0, [OUT + ROW_STRIDE * 4 ]
639
punpckhdq mm3, mm6; ; final result mm3 = row4 [3 7 11 15]
640
movq [ INP +ROW_STRIDE*6 + 8], mm4; ; store row 3
641
movq mm4, mm0; ; mm4 = copy of row1[A B C D]
643
movq [ INP +ROW_STRIDE*5 + 8], mm1; ; store row 2
645
movq mm1, [OUT + ROW_STRIDE * 5 ]
648
movq [ INP +ROW_STRIDE*7 + 8], mm3; ; store row 4
649
punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
651
; 3) transpose lower-left
652
; movq mm0, [OUT + ROW_STRIDE * 4 ]
654
; movq mm1, [OUT + ROW_STRIDE * 5 ]
655
; movq mm4, mm0; ; mm4 = copy of row1[A B C D]
657
movq mm2, [OUT + ROW_STRIDE * 6 ]
658
; punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
659
punpckhwd mm4, mm1; ; mm4 = [ 2 6 3 7]
661
movq mm3, [OUT + ROW_STRIDE * 7 ]
664
punpcklwd mm2, mm3; ; mm2 = [ 8 12 9 13]
665
movq mm1, mm0; ; mm1 = [ 0 4 1 5]
667
punpckhwd mm6, mm3; ; mm6 = 10 14 11 15]
668
movq mm3, mm4; ; mm3 = [ 2 6 3 7]
670
punpckldq mm0, mm2; ; final result mm0 = row1 [0 4 8 12]
672
punpckhdq mm1, mm2; ; mm1 = final result mm1 = row2 [1 5 9 13]
675
movq [ INP + ROW_STRIDE * 0 + 8 ], mm0; ; store row 1
676
punpckldq mm4, mm6; ; final result mm4 = row3 [2 6 10 14]
678
; begin reading next quadrant (upper-right)
679
movq mm0, [OUT + ROW_STRIDE*0 + 8];
680
punpckhdq mm3, mm6; ; final result mm3 = row4 [3 7 11 15]
682
movq [ INP +ROW_STRIDE * 2 + 8], mm4; ; store row 3
683
movq mm4, mm0; ; mm4 = copy of row1[A B C D]
685
movq [ INP +ROW_STRIDE * 1 + 8 ], mm1; ; store row 2
686
movq mm1, [OUT + ROW_STRIDE*1 + 8]
688
movq [ INP +ROW_STRIDE * 3 + 8], mm3; ; store row 4
689
punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
692
; 4) transpose upper-right quadrant
694
; movq mm0, [OUT + ROW_STRIDE*4 + 8]
696
; movq mm1, [OUT + ROW_STRIDE*5 + 8]
697
; movq mm4, mm0; ; mm4 = copy of row1[A B C D]
699
movq mm2, [OUT + ROW_STRIDE*2 + 8]
700
; punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
701
punpckhwd mm4, mm1; ; mm4 = [ 2 6 3 7]
703
movq mm3, [OUT + ROW_STRIDE*3 + 8]
706
punpcklwd mm2, mm3; ; mm2 = [ 8 12 9 13]
707
movq mm1, mm0; ; mm1 = [ 0 4 1 5]
709
punpckhwd mm6, mm3; ; mm6 = 10 14 11 15]
710
movq mm3, mm4; ; mm3 = [ 2 6 3 7]
712
punpckldq mm0, mm2; ; final result mm0 = row1 [0 4 8 12]
714
punpckhdq mm1, mm2; ; mm1 = final result mm1 = row2 [1 5 9 13]
717
movq [ INP + ROW_STRIDE*4 ], mm0; ; store row 1
718
punpckldq mm4, mm6; ; final result mm4 = row3 [2 6 10 14]
720
movq [ INP +ROW_STRIDE*5 ], mm1; ; store row 2
721
punpckhdq mm3, mm6; ; final result mm3 = row4 [3 7 11 15]
723
movq [ INP +ROW_STRIDE*6 ], mm4; ; store row 3
726
movq [ INP +ROW_STRIDE*7 ], mm3; ; store row 4
733
pop ebp ; restore frame pointer