x264 编码器像素运算系列:sad 函数

时间:2025-03-10 14:14:13
// AArch64 NEON sum-of-absolute-differences (SAD) macros, reformatted with one
// statement per line.
//
// `function`, `endfunc` and FENC_STRIDE are not defined in this excerpt; they
// must come from the surrounding project.
//
// The excerpt this was recovered from had empty register lists on every
// single-lane load ("ld1 {}[0], ..."), which cannot assemble. The lists below
// were restored from the arithmetic that consumes them and from the parallel
// 8- and 16-wide macros.
// NOTE(review): confirm the restored lists against the complete source.

// (code abridged in the excerpt)

// ---------------------------------------------------------------------------
// Macro set 1: byte samples (loads use .8b/.16b, accumulators are .8h lanes)
// ---------------------------------------------------------------------------

// Single-block SAD building blocks.
//   x0 / x1 : pointer and post-increment stride of the first block
//   x2 / x3 : pointer and post-increment stride of the second block
// Each macro consumes two rows of both blocks. The SAD_START_* form
// initialises the accumulators (uabdl); the SAD_* form accumulates (uabal).

// Width 4: two 4-byte rows are packed into lanes s[0] and s[1] of v0 / v1 so
// one 8-byte absolute-difference covers both rows. Result in v16.8h.
.macro SAD_START_4
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    uabdl       v16.8h, v0.8b, v1.8b
.endm

.macro SAD_4
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    uabal       v16.8h, v0.8b, v1.8b
.endm

// Width 8: one row per register; row 0 goes to v16, row 1 to v17.
.macro SAD_START_8
    ld1         {v1.8b}, [x2], x3
    ld1         {v0.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    uabdl       v16.8h, v0.8b, v1.8b
    uabdl       v17.8h, v2.8b, v3.8b
.endm

.macro SAD_8
    ld1         {v1.8b}, [x2], x3
    ld1         {v0.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    uabal       v16.8h, v0.8b, v1.8b
    uabal       v17.8h, v2.8b, v3.8b
.endm

// Width 16: low halves of both rows accumulate in v16, high halves in v17.
.macro SAD_START_16
    ld1         {v1.16b}, [x2], x3
    ld1         {v0.16b}, [x0], x1
    ld1         {v3.16b}, [x2], x3
    ld1         {v2.16b}, [x0], x1
    uabdl       v16.8h, v0.8b, v1.8b
    uabdl2      v17.8h, v0.16b, v1.16b
    uabal       v16.8h, v2.8b, v3.8b
    uabal2      v17.8h, v2.16b, v3.16b
.endm

.macro SAD_16
    ld1         {v1.16b}, [x2], x3
    ld1         {v0.16b}, [x0], x1
    ld1         {v3.16b}, [x2], x3
    ld1         {v2.16b}, [x0], x1
    uabal       v16.8h, v0.8b, v1.8b
    uabal2      v17.8h, v0.16b, v1.16b
    uabal       v16.8h, v2.8b, v3.8b
    uabal2      v17.8h, v2.16b, v3.16b
.endm

// Emits pixel_sad<name>_<w>x<h>_neon.
//   In : x0, x1, x2, x3 as described above
//   Out: w0 = sum of absolute differences over the w x h block
// The start macro covers rows 0-1, so the remaining h - 2 rows need
// h / 2 - 1 further two-row steps.
.macro SAD_FUNC w, h, name
function pixel_sad\name\()_\w\()x\h\()_neon, export=1
    SAD_START_\w

.rept \h / 2 - 1
    SAD_\w
.endr
.if \w > 4
    // widths 8 and 16 use a second accumulator
    add         v16.8h, v16.8h, v17.8h
.endif
    uaddlv      s0, v16.8h
    fmov        w0, s0
    ret
endfunc
.endm

// Multi-candidate SAD building blocks: one common block is compared against
// three or four candidate blocks in a single pass.
//   x0      : common block, stepped by x7
//   x1..x3  : candidate blocks 0..2, stepped by x5
//   x4      : candidate block 3 (only read when \x == 4), stepped by x5
//   v16..v19: per-candidate accumulators (v20..v23 hold the upper halves for
//             width 16)
//   \first  : instruction used for the first touch of each accumulator;
//             SAD_X_FUNC passes the non-accumulating form on the first step.
// Each macro consumes two rows.

// Width 4: two rows of each block are packed into lanes s[0] / s[1].
.macro SAD_X_4 x, first=uabal
    ld1         {v0.s}[0], [x0], x7
    ld1         {v1.s}[0], [x1], x5
    ld1         {v0.s}[1], [x0], x7
    ld1         {v1.s}[1], [x1], x5
    ld1         {v2.s}[0], [x2], x5
    ld1         {v2.s}[1], [x2], x5
    \first      v16.8h, v1.8b, v0.8b
    ld1         {v3.s}[0], [x3], x5
    ld1         {v3.s}[1], [x3], x5
    \first      v17.8h, v2.8b, v0.8b
.if \x == 4
    ld1         {v4.s}[0], [x4], x5
    ld1         {v4.s}[1], [x4], x5
.endif
    \first      v18.8h, v3.8b, v0.8b
.if \x == 4
    \first      v19.8h, v4.8b, v0.8b
.endif
.endm

// Width 8: v0 holds row 0 and v5 row 1 of the common block. v1..v3 are
// reloaded for row 1 once their row-0 value has been consumed; for the fourth
// candidate v4 holds row 0 and v1 is reused for row 1.
.macro SAD_X_8 x, first=uabal
    ld1         {v0.8b}, [x0], x7
    ld1         {v1.8b}, [x1], x5
    ld1         {v2.8b}, [x2], x5
    \first      v16.8h, v1.8b, v0.8b
    ld1         {v3.8b}, [x3], x5
    \first      v17.8h, v2.8b, v0.8b
    ld1         {v5.8b}, [x0], x7
    ld1         {v1.8b}, [x1], x5
    \first      v18.8h, v3.8b, v0.8b
    ld1         {v2.8b}, [x2], x5
    uabal       v16.8h, v1.8b, v5.8b
    ld1         {v3.8b}, [x3], x5
    uabal       v17.8h, v2.8b, v5.8b
.if \x == 4
    ld1         {v4.8b}, [x4], x5
    ld1         {v1.8b}, [x4], x5
.endif
    uabal       v18.8h, v3.8b, v5.8b
.if \x == 4
    \first      v19.8h, v4.8b, v0.8b
    uabal       v19.8h, v1.8b, v5.8b
.endif
.endm

// Width 16: same schedule as width 8, with the upper eight bytes of every row
// accumulated separately in v20..v23 through the "2" instruction forms.
.macro SAD_X_16 x, first=uabal
    ld1         {v0.16b}, [x0], x7
    ld1         {v1.16b}, [x1], x5
    ld1         {v2.16b}, [x2], x5
    \first      v16.8h, v1.8b, v0.8b
    \first\()2  v20.8h, v1.16b, v0.16b
    ld1         {v3.16b}, [x3], x5
    \first      v17.8h, v2.8b, v0.8b
    \first\()2  v21.8h, v2.16b, v0.16b
    ld1         {v5.16b}, [x0], x7
    ld1         {v1.16b}, [x1], x5
    \first      v18.8h, v3.8b, v0.8b
    \first\()2  v22.8h, v3.16b, v0.16b
    ld1         {v2.16b}, [x2], x5
    uabal       v16.8h, v1.8b, v5.8b
    uabal2      v20.8h, v1.16b, v5.16b
    ld1         {v3.16b}, [x3], x5
    uabal       v17.8h, v2.8b, v5.8b
    uabal2      v21.8h, v2.16b, v5.16b
.if \x == 4
    ld1         {v4.16b}, [x4], x5
    ld1         {v1.16b}, [x4], x5
.endif
    uabal       v18.8h, v3.8b, v5.8b
    uabal2      v22.8h, v3.16b, v5.16b
.if \x == 4
    \first      v19.8h, v4.8b, v0.8b
    \first\()2  v23.8h, v4.16b, v0.16b
    uabal       v19.8h, v1.8b, v5.8b
    uabal2      v23.8h, v1.16b, v5.16b
.endif
.endm

// Emits pixel_sad_x<x>_<w>x<h>_neon.
//   In (x == 4): x0 common block, x1..x4 candidates, x5 candidate stride,
//                x6 output pointer
//   In (x == 3): x0 common block, x1..x3 candidates, x4 candidate stride,
//                x5 output pointer (moved into x5 / x6 on entry so the body
//                is shared with the four-candidate form)
//   Out: three or four consecutive 32-bit sums stored at the output pointer
//   Clobbers x6 (post-incremented by the first store) and x7.
.macro SAD_X_FUNC x, w, h
function pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
.if \x == 3
    mov         x6, x5
    mov         x5, x4
.endif
    mov         x7, #FENC_STRIDE

    SAD_X_\w    \x, uabdl

.rept \h / 2 - 1
    SAD_X_\w    \x
.endr

.if \w > 8
    // fold the upper-half accumulators into the lower-half ones
    add         v16.8h, v16.8h, v20.8h
    add         v17.8h, v17.8h, v21.8h
    add         v18.8h, v18.8h, v22.8h
.if \x == 4
    add         v19.8h, v19.8h, v23.8h
.endif
.endif
    // add up the sads
    uaddlv      s0, v16.8h
    uaddlv      s1, v17.8h
    uaddlv      s2, v18.8h

    stp         s0, s1, [x6], #8
.if \x == 3
    str         s2, [x6]
.else
    uaddlv      s3, v19.8h
    stp         s2, s3, [x6]
.endif
    ret
endfunc
.endm

// (code abridged in the excerpt)

// ---------------------------------------------------------------------------
// Macro set 2: 16-bit samples (loads use .8h, strides are doubled on entry)
// ---------------------------------------------------------------------------
// NOTE(review): this set reuses the macro names defined above. Presumably the
// complete source selects exactly one set with a conditional that the excerpt
// dropped -- confirm before assembling both sets in one translation unit.

// Single-block building blocks. Register roles match set 1. The SAD_START_*
// macros additionally double x1 and x3 (presumably converting strides from
// samples to bytes -- confirm with the caller). Accumulators are 32-bit lanes.

// Width 4: two 8-byte rows are packed into lanes d[0] and d[1] of v0 / v1.
// Row 0 accumulates in v16, row 1 in v18.
.macro SAD_START_4
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v1.d}[0], [x2], x3
    ld1         {v0.d}[0], [x0], x1
    ld1         {v1.d}[1], [x2], x3
    ld1         {v0.d}[1], [x0], x1
    uabdl       v16.4s, v0.4h, v1.4h
    uabdl2      v18.4s, v0.8h, v1.8h
.endm

.macro SAD_4
    ld1         {v1.d}[0], [x2], x3
    ld1         {v0.d}[0], [x0], x1
    ld1         {v1.d}[1], [x2], x3
    ld1         {v0.d}[1], [x0], x1
    uabal       v16.4s, v0.4h, v1.4h
    uabal2      v18.4s, v0.8h, v1.8h
.endm

// Width 8: row 0 accumulates in v16 / v17, row 1 in v18 / v19.
.macro SAD_START_8
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld1         {v1.8h}, [x2], x3
    ld1         {v0.8h}, [x0], x1
    ld1         {v3.8h}, [x2], x3
    ld1         {v2.8h}, [x0], x1
    uabdl       v16.4s, v0.4h, v1.4h
    uabdl2      v17.4s, v0.8h, v1.8h
    uabdl       v18.4s, v2.4h, v3.4h
    uabdl2      v19.4s, v2.8h, v3.8h
.endm

.macro SAD_8
    ld1         {v1.8h}, [x2], x3
    ld1         {v0.8h}, [x0], x1
    ld1         {v3.8h}, [x2], x3
    ld1         {v2.8h}, [x0], x1
    uabal       v16.4s, v0.4h, v1.4h
    uabal2      v17.4s, v0.8h, v1.8h
    uabal       v18.4s, v2.4h, v3.4h
    uabal2      v19.4s, v2.8h, v3.8h
.endm

// Width 16: ld2 de-interleaves each row into two registers. Both blocks are
// de-interleaved identically, so matching samples still meet in the same
// lane. Row 0 accumulates in v16 / v17 / v20 / v21, row 1 in
// v18 / v19 / v22 / v23.
.macro SAD_START_16
    lsl         x1, x1, #1
    lsl         x3, x3, #1
    ld2         {v0.8h, v1.8h}, [x2], x3
    ld2         {v2.8h, v3.8h}, [x0], x1
    ld2         {v4.8h, v5.8h}, [x2], x3
    ld2         {v6.8h, v7.8h}, [x0], x1
    uabdl       v16.4s, v0.4h, v2.4h
    uabdl2      v17.4s, v0.8h, v2.8h
    uabdl       v20.4s, v1.4h, v3.4h
    uabdl2      v21.4s, v1.8h, v3.8h
    uabdl       v18.4s, v4.4h, v6.4h
    uabdl2      v19.4s, v4.8h, v6.8h
    uabdl       v22.4s, v5.4h, v7.4h
    uabdl2      v23.4s, v5.8h, v7.8h
.endm

.macro SAD_16
    ld2         {v0.8h, v1.8h}, [x2], x3
    ld2         {v2.8h, v3.8h}, [x0], x1
    ld2         {v4.8h, v5.8h}, [x2], x3
    ld2         {v6.8h, v7.8h}, [x0], x1
    uabal       v16.4s, v0.4h, v2.4h
    uabal2      v17.4s, v0.8h, v2.8h
    uabal       v20.4s, v1.4h, v3.4h
    uabal2      v21.4s, v1.8h, v3.8h
    uabal       v18.4s, v4.4h, v6.4h
    uabal2      v19.4s, v4.8h, v6.8h
    uabal       v22.4s, v5.4h, v7.4h
    uabal2      v23.4s, v5.8h, v7.8h
.endm

// Emits pixel_sad<name>_<w>x<h>_neon for 16-bit samples.
//   In : x0, x1, x2, x3 as for set 1 (x1 and x3 are doubled by the start macro)
//   Out: w0 = sum of absolute differences over the w x h block
.macro SAD_FUNC w, h, name
function pixel_sad\name\()_\w\()x\h\()_neon, export=1
    SAD_START_\w

.rept \h / 2 - 1
    SAD_\w
.endr
.if \w > 8
    add         v20.4s, v20.4s, v21.4s
    add         v16.4s, v16.4s, v20.4s
    add         v22.4s, v22.4s, v23.4s
    add         v18.4s, v18.4s, v22.4s
.endif
.if \w > 4
    add         v16.4s, v16.4s, v17.4s
    add         v18.4s, v18.4s, v19.4s
.endif
    add         v16.4s, v16.4s, v18.4s
    // v16 holds four 32-bit partial sums but is reduced here as eight 16-bit
    // lanes. That equals the true total only while every 32-bit lane is below
    // 65536, i.e. its upper half-word is zero.
    // NOTE(review): confirm this bound for every supported sample depth.
    uaddlv      s0, v16.8h
    fmov        w0, s0
    ret
endfunc
.endm

// Multi-candidate building blocks for 16-bit samples. Register roles match
// set 1. Absolute differences are accumulated without widening (uabd / uaba
// on .8h), so every accumulator lane is 16 bits wide.
// NOTE(review): 16-bit accumulation relies on the sample range and block
// height keeping each lane below 65536 -- confirm.

// Width 4: two rows of each block are packed into lanes d[0] / d[1].
.macro SAD_X_4 x, first=uaba
    ld1         {v0.d}[0], [x0], x7
    ld1         {v1.d}[0], [x1], x5
    ld1         {v0.d}[1], [x0], x7
    ld1         {v1.d}[1], [x1], x5
    ld1         {v2.d}[0], [x2], x5
    ld1         {v2.d}[1], [x2], x5
    \first      v16.8h, v1.8h, v0.8h
    ld1         {v3.d}[0], [x3], x5
    ld1         {v3.d}[1], [x3], x5
    \first      v17.8h, v2.8h, v0.8h
.if \x == 4
    ld1         {v4.d}[0], [x4], x5
    ld1         {v4.d}[1], [x4], x5
.endif
    \first      v18.8h, v3.8h, v0.8h
.if \x == 4
    \first      v19.8h, v4.8h, v0.8h
.endif
.endm

// Width 8: v0 holds row 0 and v5 row 1 of the common block.
.macro SAD_X_8 x, first=uaba
    ld1         {v0.8h}, [x0], x7
    ld1         {v1.8h}, [x1], x5
    \first      v16.8h, v1.8h, v0.8h
    ld1         {v2.8h}, [x2], x5
    ld1         {v3.8h}, [x3], x5
    \first      v17.8h, v2.8h, v0.8h
    ld1         {v5.8h}, [x0], x7
    ld1         {v1.8h}, [x1], x5
    \first      v18.8h, v3.8h, v0.8h
    ld1         {v2.8h}, [x2], x5
    uaba        v16.8h, v1.8h, v5.8h
    ld1         {v3.8h}, [x3], x5
    uaba        v17.8h, v2.8h, v5.8h
.if \x == 4
    ld1         {v4.8h}, [x4], x5
    ld1         {v1.8h}, [x4], x5
.endif
    uaba        v18.8h, v3.8h, v5.8h
.if \x == 4
    \first      v19.8h, v4.8h, v0.8h
    uaba        v19.8h, v1.8h, v5.8h
.endif
.endm

// Width 16: each row spans two registers. v0 / v1 hold row 0 and v6 / v7
// row 1 of the common block; v20..v23 accumulate the second register of each
// row.
.macro SAD_X_16 x, first=uaba
    ld1         {v0.8h, v1.8h}, [x0], x7
    ld1         {v2.8h, v3.8h}, [x1], x5
    ld1         {v4.8h, v5.8h}, [x2], x5
    \first      v16.8h, v2.8h, v0.8h
    \first      v20.8h, v3.8h, v1.8h
    ld1         {v24.8h, v25.8h}, [x3], x5
    \first      v17.8h, v4.8h, v0.8h
    \first      v21.8h, v5.8h, v1.8h
    ld1         {v6.8h, v7.8h}, [x0], x7
    ld1         {v2.8h, v3.8h}, [x1], x5
    \first      v18.8h, v24.8h, v0.8h
    \first      v22.8h, v25.8h, v1.8h
    ld1         {v4.8h, v5.8h}, [x2], x5
    uaba        v16.8h, v2.8h, v6.8h
    uaba        v20.8h, v3.8h, v7.8h
    ld1         {v24.8h, v25.8h}, [x3], x5
    uaba        v17.8h, v4.8h, v6.8h
    uaba        v21.8h, v5.8h, v7.8h
.if \x == 4
    ld1         {v26.8h, v27.8h}, [x4], x5
    ld1         {v28.8h, v29.8h}, [x4], x5
.endif
    uaba        v18.8h, v24.8h, v6.8h
    uaba        v22.8h, v25.8h, v7.8h
.if \x == 4
    \first      v19.8h, v26.8h, v0.8h
    \first      v23.8h, v27.8h, v1.8h
    uaba        v19.8h, v28.8h, v6.8h
    uaba        v23.8h, v29.8h, v7.8h
.endif
.endm

// Emits pixel_sad_x<x>_<w>x<h>_neon for 16-bit samples.
// Arguments and outputs match the set-1 SAD_X_FUNC; in addition both strides
// (x5 and x7) are doubled before use.
.macro SAD_X_FUNC x, w, h
function pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
.if \x == 3
    mov         x6, x5
    mov         x5, x4
.endif
    mov         x7, #FENC_STRIDE
    lsl         x5, x5, #1
    lsl         x7, x7, #1

    SAD_X_\w    \x, uabd

.rept \h / 2 - 1
    SAD_X_\w    \x
.endr

.if \w > 8
    // fold the second-register accumulators into the first-register ones
    add         v16.8h, v16.8h, v20.8h
    add         v17.8h, v17.8h, v21.8h
    add         v18.8h, v18.8h, v22.8h
.if \x == 4
    add         v19.8h, v19.8h, v23.8h
.endif
.endif
    // add up the sads
    uaddlv      s0, v16.8h
    uaddlv      s1, v17.8h
    uaddlv      s2, v18.8h

    stp         s0, s1, [x6], #8
.if \x == 3
    str         s2, [x6]
.else
    uaddlv      s3, v19.8h
    stp         s2, s3, [x6]
.endif
    ret
endfunc
.endm

// (code abridged in the excerpt)

// ---------------------------------------------------------------------------
// Instantiations
// ---------------------------------------------------------------------------

SAD_FUNC  4,  4
SAD_FUNC  4,  8
SAD_FUNC  4,  16
SAD_FUNC  8,  4
SAD_FUNC  8,  8
SAD_FUNC  8,  16
SAD_FUNC  16, 8
SAD_FUNC  16, 16

SAD_X_FUNC  3, 4,  4
SAD_X_FUNC  3, 4,  8
SAD_X_FUNC  3, 8,  4
SAD_X_FUNC  3, 8,  8
SAD_X_FUNC  3, 8,  16
SAD_X_FUNC  3, 16, 8
SAD_X_FUNC  3, 16, 16

SAD_X_FUNC  4, 4,  4
SAD_X_FUNC  4, 4,  8
SAD_X_FUNC  4, 8,  4
SAD_X_FUNC  4, 8,  8
SAD_X_FUNC  4, 8,  16
SAD_X_FUNC  4, 16, 8
SAD_X_FUNC  4, 16, 16