// Note: question marks and sections without annotations will be updated for the x265 1.9 release.
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve@borho.org>
* Mandar Gurav <mandar@multicorewareinc.com>
* Mahesh Pittala <mahesh@multicorewareinc.com>
* Min Chen <min.chen@multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#include "common.h"
#include "primitives.h"
#include "x265.h"
#include <cstdlib> // abs()
using namespace X265_NS;
namespace {
// place functions in anonymous namespace (file static)
/** 函数功能 : 计算SAD(8位)
/*\参数 lx:块的宽度
/*\参数 ly:块的高度
/*\参数 pix1:计算块的首地址
/*\参数 stride_pix1:计算块的步长
/*\参数 pix2:参考块的首地址
/*\参数 stride_pix2:参考块的步长
* \返回 :返回SAD值 */
template<int lx, int ly>
int sad(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
int sum = 0;
for (int y = 0; y < ly; y++)
{
for (int x = 0; x < lx; x++)
sum += abs(pix1[x] - pix2[x]);
pix1 += stride_pix1;
pix2 += stride_pix2;
}
return sum;
}
/** Sum of absolute differences (SAD) between two blocks of 16-bit values.
 * \tparam lx           block width, in samples
 * \tparam ly           block height, in rows
 * \param pix1          top-left sample of the first block
 * \param stride_pix1   distance between consecutive rows of pix1, in samples
 * \param pix2          top-left sample of the second (reference) block
 * \param stride_pix2   distance between consecutive rows of pix2, in samples
 * \return              sum of |pix1 - pix2| over the lx x ly block */
template<int lx, int ly>
int sad(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
{
    int total = 0;
    const int16_t* rowA = pix1;
    const int16_t* rowB = pix2;

    for (int row = 0; row < ly; ++row, rowA += stride_pix1, rowB += stride_pix2)
    {
        for (int col = 0; col < lx; ++col)
        {
            // int16_t operands are promoted to int, so the subtraction cannot wrap
            const int diff = rowA[col] - rowB[col];
            total += abs(diff);
        }
    }

    return total;
}
/** 函数功能 : 同时计算3个MV对应的3个SAD值
/* 调用范围 : ME中
/*\参数 lx:块的宽度
/*\参数 ly:块的高度
/*\参数 pix1:计算块的首地址
/*\参数 pix2:参考块的首地址
/*\参数 pix3:参考块的首地址
/*\参数 pix4:参考块的首地址
/*\参数 frefstride:参考块的步长
/*\参数 res:存储3个MV对应的3个SAD值
* \返回 :null */
template<int lx, int ly>
void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
res[0] = 0;
res[1] = 0;
res[2] = 0;
for (int y = 0; y < ly; y++)
{
for (int x = 0; x < lx; x++)
{
res[0] += abs(pix1[x] - pix2[x]);
res[1] += abs(pix1[x] - pix3[x]);
res[2] += abs(pix1[x] - pix4[x]);
}
pix1 += FENC_STRIDE; //搜索块统一步长都为64
pix2 += frefstride;
pix3 += frefstride;
pix4 += frefstride;
}
}
/** 函数功能 : 同时计算4个MV对应的4个SAD值
/* 调用范围 : ME中
/*\参数 lx:块的宽度
/*\参数 ly:块的高度
/*\参数 pix1:计算块的首地址
/*\参数 pix2:参考块的首地址
/*\参数 pix3:参考块的首地址
/*\参数 pix4:参考块的首地址
/*\参数 pix5:参考块的首地址
/*\参数 frefstride:参考块的步长
/*\参数 res:存储4个MV对应的4个SAD值
* \返回 :null */
template<int lx, int ly>
void sad_x4(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
res[0] = 0;
res[1] = 0;
res[2] = 0;
res[3] = 0;
for (int y = 0; y < ly; y++)
{
for (int x = 0; x < lx; x++)
{
res[0] += abs(pix1[x] - pix2[x]);
res[1] += abs(pix1[x] - pix3[x]);
res[2] += abs(pix1[x] - pix4[x]);
res[3] += abs(pix1[x] - pix5[x]);
}
pix1 += FENC_STRIDE;//搜索块统一步长都为64
pix2 += frefstride;
pix3 += frefstride;
pix4 += frefstride;
pix5 += frefstride;
}
}
/** Sum of squared differences between two blocks.
 * \tparam lx           block width, in samples
 * \tparam ly           block height, in rows
 * \tparam T1           sample type of the first block
 * \tparam T2           sample type of the second block
 * \param pix1          top-left sample of the first block
 * \param stride_pix1   row stride of pix1, in samples
 * \param pix2          top-left sample of the second block
 * \param stride_pix2   row stride of pix2, in samples
 * \return              sum of (pix1 - pix2)^2 over the block, as sse_ret_t (declared elsewhere) */
template<int lx, int ly, class T1, class T2>
sse_ret_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
{
    sse_ret_t total = 0;

    for (int row = 0; row < ly; ++row)
    {
        const T1* rowA = pix1 + row * stride_pix1;
        const T2* rowB = pix2 + row * stride_pix2;

        for (int col = 0; col < lx; ++col)
        {
            // the difference is held in an int and squared as an int before accumulating
            const int diff = rowA[col] - rowB[col];
            total += (diff * diff);
        }
    }

    return total;
}
// Number of bits in one sum_t lane. sum_t is declared elsewhere in the project;
// the original annotation notes the value is typically 8 * 2 = 16.
#define BITS_PER_SUM (8 * sizeof(sum_t)) // bits occupied by one sum_t
// 4-point Hadamard butterfly. Reads s0..s3 and produces
//   d0 = s0 + s1 + s2 + s3
//   d1 = s0 - s1 + s2 - s3
//   d2 = s0 + s1 - s2 - s3
//   d3 = s0 - s1 - s2 + s3
// All four inputs are folded into the temporaries t0..t3 before any output is
// assigned. The arithmetic is done in sum2_t, so callers can pass packed lane pairs.
#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) { \
sum2_t t0 = s0 + s1; \
sum2_t t1 = s0 - s1; \
sum2_t t2 = s2 + s3; \
sum2_t t3 = s2 - s3; \
d0 = t0 + t2; \
d2 = t0 - t2; \
d1 = t1 + t3; \
d3 = t1 - t3; \
}
// in: a pseudo-simd number of the form x+(y<<16)
// return: abs(x)+(abs(y)<<16)
/** Lane-wise absolute value of a packed pair of values.
 * \param a  two signed values packed into the low and high BITS_PER_SUM-bit halves
 *           of one sum2_t (for example, Hadamard transform outputs)
 * \return   the same packing with each half replaced by its absolute value */
inline sum2_t abs2(sum2_t a)
{
// Move each half's sign bit to that half's lowest bit, then multiply by (sum_t)-1 to
// expand it into a per-half mask (all ones for a negative half, assuming sum_t is an
// unsigned type BITS_PER_SUM bits wide -- it is declared elsewhere, TODO confirm).
sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);
return (a + s) ^ s;// conditional negate per half: (v + mask) ^ mask yields |v|
}
/** Computes half of the 4x4 SATD (sum of absolute values of the Hadamard-transformed
 * difference) between two pixel blocks, processing two coefficients per sum2_t.
 * \param pix1         first (original) block
 * \param stride_pix1  row stride of pix1, in samples
 * \param pix2         second (predicted) block
 * \param stride_pix2  row stride of pix2, in samples
 * \return             sum of |H * (pix1 - pix2) * H| over the 4x4 block, divided by 2 */
static int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
/*
2x2 Hadamard kernel:
h =
 1  1
 1 -1
4x4 Hadamard matrix:
H =
 h  h
 h -h
Residual block (pix1 - pix2):
R =
 a(0,0) a(0,1) a(0,2) a(0,3)
 a(1,0) a(1,1) a(1,2) a(1,3)
 a(2,0) a(2,1) a(2,2) a(2,3)
 a(3,0) a(3,1) a(3,2) a(3,3)
H*R*H, after the row transform (R*H):
   a(0,0)+a(0,1)+a(0,2)+a(0,3) a(0,0)-a(0,1)+a(0,2)-a(0,3) a(0,0)+a(0,1)-a(0,2)-a(0,3) a(0,0)-a(0,1)-a(0,2)+a(0,3)
H* a(1,0)+a(1,1)+a(1,2)+a(1,3) a(1,0)-a(1,1)+a(1,2)-a(1,3) a(1,0)+a(1,1)-a(1,2)-a(1,3) a(1,0)-a(1,1)-a(1,2)+a(1,3)
   a(2,0)+a(2,1)+a(2,2)+a(2,3) a(2,0)-a(2,1)+a(2,2)-a(2,3) a(2,0)+a(2,1)-a(2,2)-a(2,3) a(2,0)-a(2,1)-a(2,2)+a(2,3)
   a(3,0)+a(3,1)+a(3,2)+a(3,3) a(3,0)-a(3,1)+a(3,2)-a(3,3) a(3,0)+a(3,1)-a(3,2)-a(3,3) a(3,0)-a(3,1)-a(3,2)+a(3,3)
Naming the row-transform result b(i,j), the column transform is
   b(0,0) b(0,1) b(0,2) b(0,3)
H* b(1,0) b(1,1) b(1,2) b(1,3)
   b(2,0) b(2,1) b(2,2) b(2,3)
   b(3,0) b(3,1) b(3,2) b(3,3)
=
 b(0,0)+b(1,0)+b(2,0)+b(3,0) b(0,1)+b(1,1)+b(2,1)+b(3,1) b(0,2)+b(1,2)+b(2,2)+b(3,2) b(0,3)+b(1,3)+b(2,3)+b(3,3)
 b(0,0)-b(1,0)+b(2,0)-b(3,0) b(0,1)-b(1,1)+b(2,1)-b(3,1) b(0,2)-b(1,2)+b(2,2)-b(3,2) b(0,3)-b(1,3)+b(2,3)-b(3,3)
 b(0,0)+b(1,0)-b(2,0)-b(3,0) b(0,1)+b(1,1)-b(2,1)-b(3,1) b(0,2)+b(1,2)-b(2,2)-b(3,2) b(0,3)+b(1,3)-b(2,3)-b(3,3)
 b(0,0)-b(1,0)-b(2,0)+b(3,0) b(0,1)-b(1,1)-b(2,1)+b(3,1) b(0,2)-b(1,2)-b(2,2)+b(3,2) b(0,3)-b(1,3)-b(2,3)+b(3,3)
Worked example with the 4x4 Hadamard matrix
hadma =
 1  1  1  1
 1 -1  1 -1
 1  1 -1 -1
 1 -1 -1  1
and the residual block
A =
 -1 -5 -2 -3
 -4 -4 -1 -2
 -2 -3 -5 -4
  2 -3 -2 -3
hadma*A*hadma' =
 -42 12   2  8
  -8 -2   4  2
  -2  0 -14 -4
   8 10   4  6
The absolute values sum to 128, so the SATD returned here is 128/2 = 64.
**/
sum2_t tmp[4][2];// row-transform results; each entry packs two coefficients (low and high BITS_PER_SUM-bit halves), so four coefficients per row fit in two entries
sum2_t a0, a1, a2, a3, b0, b1;// scratch for residuals and packed partial sums
sum2_t sum = 0;// running SATD total
// row transform
for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)// one block row per iteration
{
a0 = pix1[0] - pix2[0];// residual of column 0
a1 = pix1[1] - pix2[1];// residual of column 1
b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);// pack a0+a1 into the low half and a0-a1 into the high half
a2 = pix1[2] - pix2[2];// residual of column 2
a3 = pix1[3] - pix2[3];// residual of column 3
b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);// pack a2+a3 into the low half and a2-a3 into the high half
tmp[i][0] = b0 + b1;// low half: a0+a1+a2+a3, high half: a0-a1+a2-a3
tmp[i][1] = b0 - b1;// low half: a0+a1-a2-a3, high half: a0-a1-a2+a3
}
// column transform
for (int i = 0; i < 2; i++)// each iteration handles one packed entry, i.e. two columns
{
HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);// 4-point butterfly down the column pair
a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);// per-half sums of absolute values for these two columns
sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);// add the low half and the high half to the total
}
return (int)(sum >> 1);// every coefficient has the parity of the residual sum, so the total is always even and halving loses nothing
}
/** Computes half of the 4x4 SATD of a residual block, i.e. half the sum of the
 * absolute values of its 4x4 Hadamard transform.
 * \param pix1         4x4 residual block
 * \param stride_pix1  row stride of pix1, in samples
 * \return             sum of absolute Hadamard coefficients, divided by 2 */
static int satd_4x4(const int16_t* pix1, intptr_t stride_pix1)
{
    int32_t rowCoef[4][4];
    const int16_t* line = pix1;

    // horizontal pass: 4-point butterfly along each row
    for (int r = 0; r < 4; ++r)
    {
        const int32_t sumLeft = line[0] + line[1];
        const int32_t sumRight = line[2] + line[3];
        const int32_t difLeft = line[0] - line[1];
        const int32_t difRight = line[2] - line[3];

        rowCoef[r][0] = sumLeft + sumRight;
        rowCoef[r][1] = sumLeft - sumRight;
        rowCoef[r][2] = difLeft - difRight;
        rowCoef[r][3] = difLeft + difRight;

        line += stride_pix1;
    }

    // vertical pass: 4-point butterfly down each column, accumulating magnitudes
    int32_t total = 0;
    for (int c = 0; c < 4; ++c)
    {
        const int32_t sumTop = rowCoef[0][c] + rowCoef[1][c];
        const int32_t sumBottom = rowCoef[2][c] + rowCoef[3][c];
        const int32_t difTop = rowCoef[0][c] - rowCoef[1][c];
        const int32_t difBottom = rowCoef[2][c] - rowCoef[3][c];

        total += abs(sumTop + sumBottom);
        total += abs(sumTop - sumBottom);
        total += abs(difTop - difBottom);
        total += abs(difTop + difBottom);
    }

    return (int)(total / 2);
}
// x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
/** Computes half of the SATD of an 8x4 block (8 wide, 4 high). The result equals
 * the combined 4x4 Hadamard magnitudes of the left and right 4x4 halves.
 * \param pix1         first (original) block
 * \param stride_pix1  row stride of pix1, in samples
 * \param pix2         second (predicted) block
 * \param stride_pix2  row stride of pix2, in samples
 * \return             (sum of absolute 4x4 Hadamard coefficients of both halves) >> 1 */
static int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
sum2_t tmp[4][4];
sum2_t a0, a1, a2, a3;
sum2_t sum = 0;
for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)// row transform
{
// Pack the residual of column k (left 4x4) into the low half and of column k+4
// (right 4x4) into the high half, so one butterfly transforms both halves.
a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
}
for (int i = 0; i < 4; i++)// column transform
{
HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
// accumulate magnitudes while still packed; the halves are only combined at the end
sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
}
// add the low-half and high-half totals, then halve
return (((sum_t)sum) + (sum >> BITS_PER_SUM)) >> 1;
}
/** 函数功能 : 将当前块拆分成4x4块并返回SATD值/2
/*\参数 w:块的宽度
/*\参数 h:块的高度
/*\参数 pix1:原始块地址
/*\参数 i_pix1:原始块步长
/*\参数 pix2:预测块地址
/*\参数 i_pix2:预测块步长
* \返回 :返回SATD值/2 */
template<int w, int h>
// calculate satd in blocks of 4x4
int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
int satd = 0;
for (int row = 0; row < h; row += 4)
for (int col = 0; col < w; col += 4)
satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
pix2 + row * stride_pix2 + col, stride_pix2); //拆分成4x4块计算
return satd;//返回SATD值/2
}
/** 函数功能 : 将当前块拆分成8x4块并返回SATD值/2
/*\参数 w:块的宽度
/*\参数 h:块的高度
/*\参数 pix1:原始块地址
/*\参数 i_pix1:原始块步长
/*\参数 pix2:预测块地址
/*\参数 i_pix2:预测块步长
* \返回 :返回SATD值/2 */
template<int w, int h>
// calculate satd in blocks of 8x4
int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
int satd = 0;
for (int row = 0; row < h; row += 4)
for (int col = 0; col < w; col += 8)
satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
pix2 + row * stride_pix2 + col, stride_pix2);//拆分成计算8x4块的SATD值/2 (跟拆分成4x4计算相同)
return satd;
}
/** Computes the un-normalised 8x8 Hadamard cost between two pixel blocks: the sum of
 * absolute values of the 8x8 Hadamard transform of (pix1 - pix2).
 * \param pix1    first (original) block
 * \param i_pix1  row stride of pix1, in samples
 * \param pix2    second (predicted) block
 * \param i_pix2  row stride of pix2, in samples
 * \return        sum of absolute 8x8 Hadamard coefficients (no rounding or scaling applied) */
inline int _sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
// This uses the same packed two-values-per-sum2_t technique as the 4x4 SATD.
// sum2_t is declared elsewhere (the original annotation gives: typedef uint32_t sum2_t;).
/*
Worked example from the original annotation.
Residual block:
R=
-1 -5 -2 -3 -7 -4 -3 -8
-4 -4 -1 -2 -6 -8 -9 -5
-2 -3 -5 -4 -5 -8 -7 -4
2 -3 -2 -3 -6 -2 0 1
1 -1 -5 -7 -5 -6 -4 -3
0 -5 -8 -3 -5 -1 -1 -5
-3 -9 -3 -5 -5 -5 -8 -7
-10 -2 2 -3 -5 -7 -6 1
8x8 Hadamard matrix built from the 4x4 matrix H:
H8=
H H
H -H
H8*R*H8 =
-256 10 -12 14 50 32 22 4
-36 26 32 6 14 0 2 12
-4 10 28 -6 10 -12 10 36
44 -2 -20 -26 -6 8 26 -16
10 4 -6 4 28 2 4 10
-2 -12 -10 -12 -8 -22 -8 2
-38 0 -34 -12 24 2 -60 -34
18 20 14 -8 -24 14 -4 74
Sum of absolute values (the value returned here) = 1316
**/
sum2_t tmp[8][4];
sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
sum2_t sum = 0;
for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2)// row transform
{
// First butterfly stage on each adjacent pair of residuals: the pair sum goes
// into the low half and the pair difference into the high half of b0..b3.
a0 = pix1[0] - pix2[0];
a1 = pix1[1] - pix2[1];
b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
a2 = pix1[2] - pix2[2];
a3 = pix1[3] - pix2[3];
b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
a4 = pix1[4] - pix2[4];
a5 = pix1[5] - pix2[5];
b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
a6 = pix1[6] - pix2[6];
a7 = pix1[7] - pix2[7];
b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
// remaining two stages of the 8-point row transform, done on both halves at once
HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
}
for (int i = 0; i < 4; i++)// column transform, two packed columns per iteration
{
// 4-point butterflies over rows 0-3 and rows 4-7 ...
HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
// ... then the last stage combines the two groups; magnitudes are summed per half
b0 = abs2(a0 + a4) + abs2(a0 - a4);
b0 += abs2(a1 + a5) + abs2(a1 - a5);
b0 += abs2(a2 + a6) + abs2(a2 - a6);
b0 += abs2(a3 + a7) + abs2(a3 - a7);
sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);// add the low-half and high-half sums
}
return (int)sum;
}
/** 8x8 Hadamard cost between two pixel blocks, scaled down by 4 with rounding.
 * \param pix1    first (original) block
 * \param i_pix1  row stride of pix1, in samples
 * \param pix2    second (predicted) block
 * \param i_pix2  row stride of pix2, in samples
 * \return        (_sa8d_8x8(...) + 2) >> 2 */
inline int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    const int unscaled = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2);

    // add half of the divisor before shifting so the division by 4 rounds to nearest
    return (int)((unscaled + 2) >> 2);
}
/** Computes the un-normalised 8x8 Hadamard cost of a residual block: the sum of
 * absolute values of its 8x8 Hadamard transform.
 * \param pix1    8x8 residual block
 * \param i_pix1  row stride of pix1, in samples
 * \return        sum of absolute 8x8 Hadamard coefficients (no rounding or scaling applied) */
inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
{
    int32_t rowCoef[8][8];
    const int16_t* line = pix1;

    // horizontal pass: 8-point butterfly along each row
    for (int r = 0; r < 8; ++r, line += i_pix1)
    {
        const int32_t sum01 = line[0] + line[1];
        const int32_t sum23 = line[2] + line[3];
        const int32_t sum45 = line[4] + line[5];
        const int32_t sum67 = line[6] + line[7];
        const int32_t dif01 = line[0] - line[1];
        const int32_t dif23 = line[2] - line[3];
        const int32_t dif45 = line[4] - line[5];
        const int32_t dif67 = line[6] - line[7];

        int32_t* out = rowCoef[r];
        out[0] = (sum01 + sum23) + (sum45 + sum67);
        out[1] = (sum01 + sum23) - (sum45 + sum67);
        out[2] = (sum01 - sum23) + (sum45 - sum67);
        out[3] = (sum01 - sum23) - (sum45 - sum67);
        out[4] = (dif01 + dif23) + (dif45 + dif67);
        out[5] = (dif01 + dif23) - (dif45 + dif67);
        out[6] = (dif01 - dif23) + (dif45 - dif67);
        out[7] = (dif01 - dif23) - (dif45 - dif67);
    }

    // vertical pass: 8-point butterfly down each column, accumulating magnitudes
    int32_t total = 0;
    for (int c = 0; c < 8; ++c)
    {
        const int32_t s01 = rowCoef[0][c] + rowCoef[1][c];
        const int32_t d01 = rowCoef[0][c] - rowCoef[1][c];
        const int32_t s23 = rowCoef[2][c] + rowCoef[3][c];
        const int32_t d23 = rowCoef[2][c] - rowCoef[3][c];
        const int32_t s45 = rowCoef[4][c] + rowCoef[5][c];
        const int32_t d45 = rowCoef[4][c] - rowCoef[5][c];
        const int32_t s67 = rowCoef[6][c] + rowCoef[7][c];
        const int32_t d67 = rowCoef[6][c] - rowCoef[7][c];

        // 4-point results for rows 0-3 (upper) and rows 4-7 (lower)
        const int32_t upper[4] = { s01 + s23, d01 + d23, s01 - s23, d01 - d23 };
        const int32_t lower[4] = { s45 + s67, d45 + d67, s45 - s67, d45 - d67 };

        // final stage: combine the two groups and sum the magnitudes
        int32_t columnSum = 0;
        for (int k = 0; k < 4; ++k)
            columnSum += abs(upper[k] + lower[k]) + abs(upper[k] - lower[k]);

        total += columnSum;
    }

    return (int)total;
}
/** 8x8 Hadamard cost of a residual block, scaled down by 4 with rounding.
 * \param pix1    8x8 residual block
 * \param i_pix1  row stride of pix1, in samples
 * \return        (_sa8d_8x8(pix1, i_pix1) + 2) >> 2 */
static int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
{
    const int unscaled = _sa8d_8x8(pix1, i_pix1);

    // add half of the divisor before shifting so the division by 4 rounds to nearest
    return (int)((unscaled + 2) >> 2);
}
/** 函数功能 :将当前块拆分成8x8分别计算SATD的累加值和再除以4
/*\参数 pix1:原始块地址
/*\参数 i_pix1:原始块步长
/*\参数 pix2:预测块地址
/*\参数 i_pix2:预测块步长
* \返回 :返回SATD值除以4 */
static int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
+ _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
+ _sa8d_8x8(pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2)
+ _sa8d_8x8(pix1 + 8 + 8 * i_pix1, i_pix1, pix2 + 8 + 8 * i_pix2, i_pix2);//将当前16x16块拆分四个8x8分别计算SATD值
// This matches x264 sa8d_16x16, but is slightly different from HM's behavior because
// this version only rounds once at the end
return (sum + 2) >> 2;//返回SATD值除以4
}
template<int w, int h>
// Calculate sa8d in blocks of 8x8
int sa8d8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
int cost = 0;
for (int y = 0; y < h; y += 8)
for (int x = 0; x < w; x += 8)
cost += sa8d_8x8(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
return cost;
}
/** 函数功能 :将当前块拆分成16x16分别计算SATD的累加值和再除以4
/*\参数 w:块的宽度
/*\参数 h:块的高度
/*\参数 pix1:原始块地址
/*\参数 i_pix1:原始块步长
/*\参数 pix2:预测块地址
/*\参数 i_pix2:预测块步长
* \返回 :返回当前块拆分成16x16分别计算SATD的累加值和再除以4 */
template<int w, int h>
// Calculate sa8d in blocks of 16x16
int sa8d16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
int cost = 0;//初始化为0
//将当前块划分成几个16x16分别计算
for (int y = 0; y < h; y += 16)
for (int x = 0; x < w; x += 16)
cost += sa8d_16x16(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);//将当前块拆分成16x16分别计算SATD的累加值和再除以4
return cost;//返回当前块拆分成16x16分别计算SATD的累加值和再除以4
}
/** Sum of squares of a size x size block of 16-bit values.
 * \tparam size    block width and height
 * \param a        top-left element of the block
 * \param dstride  row stride of a, in elements
 * \return         sum of a[i]^2 over the block, accumulated in an int */
template<int size>
int pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
{
    int total = 0;

    for (int row = 0; row < size; ++row)
    {
        const int16_t* line = a + row * dstride;

        for (int col = 0; col < size; ++col)
        {
            const int value = line[col];
            total += value * value;
        }
    }

    return total;
}
/** Fills a size x size block of 16-bit values with a constant.
 * \tparam size    block width and height
 * \param dst      top-left element of the block to fill
 * \param dstride  row stride of dst, in elements
 * \param val      value written to every element of the block */
template<int size>
void blockfill_s_c(int16_t* dst, intptr_t dstride, int16_t val)
{
    for (int row = 0; row < size; ++row)
    {
        int16_t* line = dst + row * dstride;

        for (int col = 0; col < size; ++col)
            line[col] = val;
    }
}
/** Copies a strided size x size block of 16-bit values into a contiguous buffer,
 * shifting every value left by shift bits.
 * The original annotation describes the input as residual data that is being
 * scaled up ahead of the transform.
 * \tparam size       block width and height
 * \param dst         contiguous destination holding size * size values; checked for 16-byte alignment
 * \param src         strided source block
 * \param srcStride   row stride of src, in elements
 * \param shift       left-shift amount; checked to be non-negative */
template<int size>
void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
    X265_CHECK((((intptr_t)src | (srcStride * sizeof(*src))) & 15) == 0 || size == 4, "src alignment error\n");
    X265_CHECK(shift >= 0, "invalid shift\n");

    for (int row = 0; row < size; ++row)
    {
        const int16_t* srcRow = src + row * srcStride;
        int16_t* dstRow = dst + row * size;   // destination rows are packed back to back

        for (int col = 0; col < size; ++col)
            dstRow[col] = srcRow[col] << shift;
    }
}
/** Copies a strided size x size block of 16-bit values into a contiguous buffer,
 * shifting every value right by shift bits with rounding.
 * \tparam size       block width and height
 * \param dst         contiguous destination holding size * size values; checked for 16-byte alignment
 * \param src         strided source block
 * \param srcStride   row stride of src, in elements
 * \param shift       right-shift amount; checked to be strictly positive */
template<int size>
void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
    X265_CHECK((((intptr_t)src | (srcStride * sizeof(*src))) & 15) == 0 || size == 4, "src alignment error\n");
    X265_CHECK(shift > 0, "invalid shift\n");

    // half of the divisor, added before the shift so the result rounds to nearest
    const int16_t rounding = 1 << (shift - 1);

    for (int row = 0; row < size; ++row)
    {
        const int16_t* srcRow = src + row * srcStride;
        int16_t* dstRow = dst + row * size;   // destination rows are packed back to back

        for (int col = 0; col < size; ++col)
            dstRow[col] = (srcRow[col] + rounding) >> shift;
    }
}
/** Copies a contiguous size x size block of 16-bit values into a strided buffer,
 * shifting every value left by shift bits.
 * \tparam size       block width and height
 * \param dst         strided destination block
 * \param src         contiguous source holding size * size values; checked for 16-byte alignment
 * \param dstStride   row stride of dst, in elements
 * \param shift       left-shift amount; checked to be non-negative */
template<int size>
void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
    X265_CHECK((((intptr_t)dst | (dstStride * sizeof(*dst))) & 15) == 0 || size == 4, "dst alignment error\n");
    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
    X265_CHECK(shift >= 0, "invalid shift\n");

    for (int row = 0; row < size; ++row)
    {
        const int16_t* srcRow = src + row * size;   // source rows are packed back to back
        int16_t* dstRow = dst + row * dstStride;

        for (int col = 0; col < size; ++col)
            dstRow[col] = srcRow[col] << shift;
    }
}
/** Copies a contiguous size x size block of 16-bit values into a strided buffer,
 * shifting every value right by shift bits with rounding.
 * \tparam size       block width and height
 * \param dst         strided destination block
 * \param src         contiguous source holding size * size values; checked for 16-byte alignment
 * \param dstStride   row stride of dst, in elements
 * \param shift       right-shift amount; checked to be strictly positive */
template<int size>
void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
    X265_CHECK((((intptr_t)dst | (dstStride * sizeof(*dst))) & 15) == 0 || size == 4, "dst alignment error\n");
    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
    X265_CHECK(shift > 0, "invalid shift\n");

    // half of the divisor, added before the shift so the result rounds to nearest
    const int16_t rounding = 1 << (shift - 1);

    for (int row = 0; row < size; ++row)
    {
        const int16_t* srcRow = src + row * size;   // source rows are packed back to back
        int16_t* dstRow = dst + row * dstStride;

        for (int col = 0; col < size; ++col)
            dstRow[col] = (srcRow[col] + rounding) >> shift;
    }
}
/** 函数功能 : 获取残差数据
* \参数 fenc : 原始块地址
* \参数 pred : 预测块地址
* \参数 residual : 残差块地址
* \参数 stride : 原始块步长
* \返回 : null */
template<int blockSize>
void getResidual(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
{
for (int y = 0; y < blockSize; y++)
{
for (int x = 0; x < blockSize; x++)
residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);//获取残差
fenc += stride;
residual += stride;
pred += stride;
}
}
/** Writes the transpose of a square block into a contiguous buffer.
 * \tparam blockSize  block width and height
 * \param dst         contiguous destination (row stride is blockSize)
 * \param src         source block
 * \param stride      row stride of src, in samples */
template<int blockSize>
void transpose(pixel* dst, const pixel* src, intptr_t stride)
{
    for (int outRow = 0; outRow < blockSize; ++outRow)
    {
        pixel* out = dst + outRow * blockSize;
        const pixel* column = src + outRow;   // source column that becomes this output row

        for (int outCol = 0; outCol < blockSize; ++outCol)
            out[outCol] = column[outCol * stride];
    }
}
/** Applies a weight, rounding, shift and offset to 16-bit intermediate samples and
 * stores the clipped result as pixels.
 * \param src        16-bit input samples
 * \param dst        output pixels
 * \param srcStride  row stride of src, in elements
 * \param dstStride  row stride of dst, in elements
 * \param width      samples per row (may be odd)
 * \param height     number of rows
 * \param w0         weight multiplied with (sample + IF_INTERNAL_OFFS)
 * \param round      rounding term added before the shift
 * \param shift      right-shift applied after weighting
 * \param offset     value added after the shift, before clipping */
static void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
{
#if CHECKED_BUILD || _DEBUG
    const int correction = (IF_INTERNAL_PREC - X265_DEPTH);
    X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
    X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");
    X265_CHECK((shift >= correction), "shift must be include factor correction, please update ASM ABI\n");
#endif

    for (int row = 0; row < height; ++row)
    {
        // note: width can be odd
        for (int col = 0; col < width; ++col)
        {
            const int weighted = (w0 * (src[col] + IF_INTERNAL_OFFS) + round) >> shift;
            dst[col] = x265_clip(weighted + offset);
        }

        src += srcStride;
        dst += dstStride;
    }
}
/** Applies a weight, rounding, shift and offset to a plane of pixels.
 * The original annotation describes this as building the weighted reference
 * frame for P-frame weighted prediction.
 * \param src     input pixels (unweighted reference)
 * \param dst     output pixels (weighted reference)
 * \param stride  row stride shared by src and dst, in samples
 * \param width   samples per row; checked to be a multiple of 16
 * \param height  number of rows
 * \param w0      weight
 * \param round   rounding term added before the shift
 * \param shift   right-shift applied after weighting
 * \param offset  value added to every sample after the shift, before clipping */
static void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
{
    const int correction = (IF_INTERNAL_PREC - X265_DEPTH);

    X265_CHECK(!(width & 15), "weightp alignment error\n");
    X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
    X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");
    X265_CHECK((shift >= correction), "shift must be include factor correction, please update ASM ABI\n");
    X265_CHECK(!(round & ((1 << correction) - 1)), "round must be include factor correction, please update ASM ABI\n");

    for (int row = 0; row < height; ++row)
    {
        for (int col = 0; col < width; ++col)
        {
            // simulating pixel to short conversion
            const int16_t widened = src[col] << correction;
            const int weighted = (w0 * (widened) + round) >> shift;
            dst[col] = x265_clip(weighted + offset);
        }

        src += stride;
        dst += stride;
    }
}
/** Rounded average of two pixel blocks.
 * \tparam lx        block width, in samples
 * \tparam ly        block height, in rows
 * \param dst        receives (src0 + src1 + 1) >> 1 for every sample
 * \param dstride    row stride of dst, in samples
 * \param src0       first input block
 * \param sstride0   row stride of src0, in samples
 * \param src1       second input block
 * \param sstride1   row stride of src1, in samples
 * The trailing unnamed int parameter is ignored. */
template<int lx, int ly>
void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
{
    for (int row = 0; row < ly; ++row, src0 += sstride0, src1 += sstride1, dst += dstride)
    {
        for (int col = 0; col < lx; ++col)
        {
            const int pairSum = src0[col] + src1[col];
            dst[col] = (pairSum + 1) >> 1;
        }
    }
}
/** Halves the length of two 128-sample arrays by averaging adjacent pairs.
 * The first array is read from src[0..127] and written to dst[0..63]; the second is
 * read from src[128..255] and written to dst[64..127]. The original annotation
 * describes the input as the neighbouring reference samples of a 64x64 intra PU.
 * \param dst  receives 2 x 64 averaged samples
 * \param src  2 x 128 input samples */
static void scale1D_128to64(pixel *dst, const pixel *src)
{
    const pixel* topIn = src;
    const pixel* leftIn = src + 128;
    pixel* topOut = dst;
    pixel* leftOut = dst + 64/*128*/;

    for (int i = 0; i < 64; ++i)
    {
        // Top pixel pair
        const int topSum = topIn[2 * i] + topIn[2 * i + 1];
        // Left pixel pair
        const int leftSum = leftIn[2 * i] + leftIn[2 * i + 1];

        topOut[i] = (pixel)((topSum + 1) >> 1);
        leftOut[i] = (pixel)((leftSum + 1) >> 1);
    }
}
/** Downsamples a 64x64 block to 32x32 by averaging each 2x2 group of samples.
 * \param dst     contiguous 32x32 destination (row stride 32)
 * \param src     64x64 source block
 * \param stride  row stride of src, in samples */
static void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
{
    /* For each 2x2 group
     *   a b
     *   c d
     * the output sample is (a + b + c + d + 2) >> 2, i.e. the rounded mean. */
    for (uint32_t y = 0; y < 64; y += 2)
    {
        const pixel* upper = src + (y + 0) * stride;
        const pixel* lower = src + (y + 1) * stride;
        pixel* out = dst + y / 2 * 32;

        for (uint32_t x = 0; x < 64; x += 2)
        {
            const int groupSum = upper[x + 0] + upper[x + 1] + lower[x + 0] + lower[x + 1];
            out[x / 2] = (pixel)((groupSum + 2) >> 2);
        }
    }
}
/** 函数功能 :将原始帧视频亮度作1/2下采样
/* 调用范围 :只在Lowres::init函数中被调用
* \参数 src0 :原始视频帧亮度首地址:origPic->m_picOrg[0]
* \参数 dst0 :lowresPlane[0]
* \参数 dsth :lowresPlane[1]
* \参数 dstv :lowresPlane[2]
* \参数 dstc :lowresPlane[3]
* \参数 src_stride:原始帧亮度步长
* \参数 dst_stride:下采样视频亮度步长
* \参数 width :下采样视频的宽度
* \参数 heigh :下采样视频的高度
* \返回值 : null
*/
static
void frame_init_lowres_core(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc,
intptr_t src_stride, intptr_t dst_stride, int width, int height)
{
/* downscale and generate 4 hpel planes for lookahead */
/* 这样做的目的是更好的通过1/2下采样视频的编码估计原始视频编码状态
将亮度分四种方法进行1/2下采样
原始点:
82 89 86 86 93
85 89 99 101 113
96 97 97 100 104
106 108 107 111 109
127 156 133 139 137
0: 在行列选择偶数像素点为基准并选择相邻的右边、下边、右下机本身4个点作平均
* * + +
* * + +
- - # #
- - # #
87 94
102 104
87 = ((((82 + 85 + 1) >> 1) + ((89 + 89 + 1) >> 1) + 1) >> 1)
94 = ((((86 + 99 + 1) >> 1) + ((86 + 101 + 1) >> 1) + 1) >> 1)
102= ((((96 + 106 + 1) >> 1) + ((97 + 108 + 1) >> 1) + 1) >> 1)
104 = ((((97 + 107 + 1) >> 1) + ((100 + 111 + 1) >> 1) + 1) >> 1)
h: 在行选择偶数像素点,在列选择奇数像素点为基准并选择相邻的右边、下边、右下机本身4个点作平均
= * * = + +
= * * = + +
= - - = # #
= - - = # #
91 99
103 107
91 = ((((89 + 89 + 1) >> 1) + ((86 + 99 + 1) >> 1) + 1) >> 1)
99 = ((((86 + 101 + 1) >> 1) + ((93 + 113 + 1) >> 1) + 1) >> 1)
v: 在行选择奇数像素点,在列选择偶数像素点为基准并选择相邻的右边、下边、右下机本身4个点作平均
= = = =
* * + +
* * + +
= = = =
- - # #
- - # #
92 100
125 123
92 = ((((85 + 96 + 1) >> 1) + ((89 + 97 + 1) >> 1) + 1) >> 1)
在行列选择奇数像素点为基准并选择相邻的右边、下边、右下机本身4个点作平均
= = = = = =
= * * = + +
= * * = + +
= = = = = =
= - - = # #
= - - = # #
96 105
126 124
96 = ((((89 + 97 + 1) >> 1) + ((99 + 97 + 1) >> 1) + 1) >> 1)
**/
for (int y = 0; y < height; y++)
{
const pixel* src1 = src0 + src_stride;
const pixel* src2 = src1 + src_stride;
for (int x = 0; x < width; x++)
{
// slower than naive bilinear, but matches asm
#define FILTER(a, b, c, d) ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1)
dst0[x] = FILTER(src0[2 * x], src1[2 * x], src0[2 * x + 1], src1[2 * x + 1]);
dsth[x] = FILTER(src0[2 * x + 1], src1[2 * x + 1], src0[2 * x + 2], src1[2 * x + 2]);
dstv[x] = FILTER(src1[2 * x], src2[2 * x], src1[2 * x + 1], src2[2 * x + 1]);
dstc[x] = FILTER(src1[2 * x + 1], src2[2 * x + 1], src1[2 * x + 2], src2[2 * x + 2]);
#undef FILTER
}
src0 += src_stride * 2;
dst0 += dst_stride;
dsth += dst_stride;
dstv += dst_stride;
dstc += dst_stride;
}
}
/* structural similarity metric */
static void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
{
for (int z = 0; z < 2; z++)
{
uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
for (int y = 0; y < 4; y++)
{
for (int x = 0; x < 4; x++)
{
int a = pix1[x + y * stride1];
int b = pix2[x + y * stride2];
s1 += a;
s2 += b;
ss += a * a;
ss += b * b;
s12 += a * b;
}
}
sums[z][0] = s1;
sums[z][1] = s2;
sums[z][2] = ss;
sums[z][3] = s12;
pix1 += 4;
pix2 += 4;
}
}
/** Evaluates the SSIM formula for one window from its accumulated statistics.
 * \param s1   sum of the first image's samples over the window
 * \param s2   sum of the second image's samples over the window
 * \param ss   sum of squares of both images' samples over the window
 * \param s12  sum of products of co-located samples over the window
 * \return     (2*s1*s2 + c1) * (2*covar + c2) / ((s1^2 + s2^2 + c1) * (vars + c2)),
 *             where vars = ss*64 - s1^2 - s2^2 and covar = s12*64 - s1*s2
 * The intermediate arithmetic is done in float when HIGH_BIT_DEPTH is set and in
 * int otherwise (selected through the local `type` macro). */
static float ssim_end_1(int s1, int s2, int ss, int s12)
{
/* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases.
* s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
* Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */
#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
#if HIGH_BIT_DEPTH
X265_CHECK((X265_DEPTH == 10) || (X265_DEPTH == 12), "ssim invalid depth\n");
#define type float
// stabilising constants: c1 = (0.01 * PIXEL_MAX)^2 * 64, c2 = (0.03 * PIXEL_MAX)^2 * 64 * 63
static const float ssim_c1 = (float)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64);
static const float ssim_c2 = (float)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63);
#else
X265_CHECK(X265_DEPTH == 8, "ssim invalid depth\n");
#define type int
// same constants, rounded to the nearest integer for the int code path
static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5);
static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5);
#endif
type fs1 = (type)s1;
type fs2 = (type)s2;
type fss = (type)ss;
type fs12 = (type)s12;
// variance and covariance terms, each scaled by the factor 64 used throughout
type vars = (type)(fss * 64 - fs1 * fs1 - fs2 * fs2);
type covar = (type)(fs12 * 64 - fs1 * fs2);
return (float)(2 * fs1 * fs2 + ssim_c1) * (float)(2 * covar + ssim_c2)
/ ((float)(fs1 * fs1 + fs2 * fs2 + ssim_c1) * (float)(vars + ssim_c2));
#undef type
#undef PIXEL_MAX
}
/** Sums the SSIM of up to four overlapping windows along a row.
 * Window i merges the statistics of columns i and i + 1 from both sum0 and sum1
 * before evaluating ssim_end_1().
 * \param sum0   per-block statistics of one row of 4x4 blocks
 * \param sum1   per-block statistics of a second row of 4x4 blocks
 * \param width  number of windows to evaluate; entries 0 .. width of each array are read
 * \return       sum of the per-window SSIM values */
static float ssim_end_4(int sum0[5][4], int sum1[5][4], int width)
{
    float total = 0.0;

    for (int i = 0; i < width; i++)
    {
        int merged[4];
        for (int k = 0; k < 4; ++k)
            merged[k] = sum0[i][k] + sum0[i + 1][k] + sum1[i][k] + sum1[i + 1][k];

        total += ssim_end_1(merged[0], merged[1], merged[2], merged[3]);
    }

    return total;
}
/** 函数功能 :返回一个64位整数,低32位存储当前nxn所有元素的和,高32位存储当前nxn所有元素的平方和
/* 调用范围 :只在Lowres::init函数中被调用
* \参数 pix :待计算的块
* \参数 i_stride:步长
* \返回值 :返回一个64位整数,低32位存储当前nxn所有元素的和,高32位存储当前nxn所有元素的平方和
*/
template<int size>
uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
{
uint32_t sum = 0, sqr = 0;
for (int y = 0; y < size; y++)
{
for (int x = 0; x < size; x++)
{
sum += pix[x];
sqr += pix[x] * pix[x];
}
pix += i_stride;
}
return sum + ((uint64_t)sqr << 32);
}
#if defined(_MSC_VER)
#pragma warning(disable: 4127) // conditional expression is constant
#endif
template<int size>
int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
{
static pixel zeroBuf[8] /* = { 0 } */;
if (size)
{
int dim = 1 << (size + 2);
uint32_t totEnergy = 0;
for (int i = 0; i < dim; i += 8)
{
for (int j = 0; j < dim; j+= 8)
{
/* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
(sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
(sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
totEnergy += abs(sourceEnergy - reconEnergy);
}
}
return totEnergy;
}
else
{
/* 4x4 is too small for sa8d */
int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
return abs(sourceEnergy - reconEnergy);
}
}
/** Psycho-visual cost between two blocks of 16-bit values: the absolute
 * difference of their AC energies.
 * \tparam size    0 selects the 4x4 path; otherwise the block is (1 << (size + 2))
 *                 samples square and is processed in 8x8 tiles
 * \param source   source block
 * \param sstride  row stride of source, in elements
 * \param recon    reconstructed block
 * \param rstride  row stride of recon, in elements
 * \return         sum over tiles of |sourceEnergy - reconEnergy| */
template<int size>
int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
{
    // all-zero reference for the SAD; with stride 0 the same row is reused for every line
    static int16_t zeroBuf[8] /* = { 0 } */;

    if (size == 0)
    {
        /* 4x4 is too small for sa8d */
        const int srcTransform = satd_4x4(source, sstride);
        const int srcDc = sad<4, 4>(source, sstride, zeroBuf, 0) >> 2;
        const int recTransform = satd_4x4(recon, rstride);
        const int recDc = sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2;

        return abs((srcTransform - srcDc) - (recTransform - recDc));
    }
    else
    {
        const int dim = 1 << (size + 2);
        uint32_t totEnergy = 0;

        for (int row = 0; row < dim; row += 8)
        {
            for (int col = 0; col < dim; col += 8)
            {
                const int16_t* srcTile = source + row * sstride + col;
                const int16_t* recTile = recon + row * rstride + col;

                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
                const int sourceEnergy = sa8d_8x8(srcTile, sstride) -
                                         (sad<8, 8>(srcTile, sstride, zeroBuf, 0) >> 2);
                const int reconEnergy = sa8d_8x8(recTile, rstride) -
                                        (sad<8, 8>(recTile, rstride, zeroBuf, 0) >> 2);

                totEnergy += abs(sourceEnergy - reconEnergy);
            }
        }

        return totEnergy;
    }
}
/** Copies a bx x by block of pixels. Argument order is (dst, dst stride, src, src stride).
 * \tparam bx      block width, in samples
 * \tparam by      block height, in rows
 * \param a        destination block
 * \param stridea  row stride of a, in samples
 * \param b        source block
 * \param strideb  row stride of b, in samples */
template<int bx, int by>
void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
{
    for (int row = 0; row < by; ++row, a += stridea, b += strideb)
    {
        for (int col = 0; col < bx; ++col)
            a[col] = b[col];
    }
}
/** Copies a bx x by block of 16-bit values.
 * \tparam bx      block width, in elements
 * \tparam by      block height, in rows
 * \param a        destination block
 * \param stridea  row stride of a, in elements
 * \param b        source block
 * \param strideb  row stride of b, in elements */
template<int bx, int by>
void blockcopy_ss_c(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
{
    for (int row = 0; row < by; ++row, a += stridea, b += strideb)
    {
        for (int col = 0; col < bx; ++col)
            a[col] = b[col];
    }
}
/** Copies a bx x by block of 16-bit values into a pixel block.
 * Every source value is checked to lie within [0, (1 << X265_DEPTH) - 1] before
 * being converted to pixel.
 * \tparam bx      block width, in samples
 * \tparam by      block height, in rows
 * \param a        destination pixel block
 * \param stridea  row stride of a, in samples
 * \param b        source block of 16-bit values
 * \param strideb  row stride of b, in elements */
template<int bx, int by>
void blockcopy_sp_c(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
{
    for (int row = 0; row < by; ++row, a += stridea, b += strideb)
    {
        for (int col = 0; col < bx; ++col)
        {
            X265_CHECK((b[col] >= 0) && (b[col] <= ((1 << X265_DEPTH) - 1)), "blockcopy pixel size fail\n");
            a[col] = (pixel)b[col];
        }
    }
}
/** Copies a bx x by block of pixels into a block of 16-bit values.
 * \tparam bx      block width, in samples
 * \tparam by      block height, in rows
 * \param a        destination block of 16-bit values
 * \param stridea  row stride of a, in elements
 * \param b        source pixel block
 * \param strideb  row stride of b, in samples */
template<int bx, int by>
void blockcopy_ps_c(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
{
    for (int row = 0; row < by; ++row, a += stridea, b += strideb)
    {
        for (int col = 0; col < bx; ++col)
            a[col] = (int16_t)b[col];
    }
}
/** Subtracts one pixel block from another, producing 16-bit differences.
 * \tparam bx       block width, in samples
 * \tparam by       block height, in rows
 * \param a         receives b0 - b1 for every sample
 * \param dstride   row stride of a, in elements
 * \param b0        minuend pixel block
 * \param b1        subtrahend pixel block
 * \param sstride0  row stride of b0, in samples
 * \param sstride1  row stride of b1, in samples */
template<int bx, int by>
void pixel_sub_ps_c(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
{
    for (int row = 0; row < by; ++row, b0 += sstride0, b1 += sstride1, a += dstride)
    {
        for (int col = 0; col < bx; ++col)
        {
            const int diff = b0[col] - b1[col];
            a[col] = (int16_t)(diff);
        }
    }
}
/** Adds a block of 16-bit values to a pixel block and clips the result to pixel range.
 * \tparam bx       block width, in samples
 * \tparam by       block height, in rows
 * \param a         receives x265_clip(b0 + b1) for every sample
 * \param dstride   row stride of a, in samples
 * \param b0        pixel block
 * \param b1        block of 16-bit values to add
 * \param sstride0  row stride of b0, in samples
 * \param sstride1  row stride of b1, in elements */
template<int bx, int by>
void pixel_add_ps_c(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1)
{
    for (int row = 0; row < by; ++row, b0 += sstride0, b1 += sstride1, a += dstride)
    {
        for (int col = 0; col < bx; ++col)
        {
            const int sum = b0[col] + b1[col];
            a[col] = x265_clip(sum);
        }
    }
}
/** Averages two blocks of 16-bit intermediate samples into a pixel block.
 * Each output is x265_clip((src0 + src1 + offset) >> shiftNum), with
 * shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH and
 * offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS.
 * \tparam bx          block width, in samples; processed two samples at a time
 * \tparam by          block height, in rows
 * \param src0         first block of 16-bit samples
 * \param src1         second block of 16-bit samples
 * \param dst          destination pixel block
 * \param src0Stride   row stride of src0, in elements
 * \param src1Stride   row stride of src1, in elements
 * \param dstStride    row stride of dst, in samples */
template<int bx, int by>
void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
{
    const int shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
    const int offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;

    for (int row = 0; row < by; ++row, src0 += src0Stride, src1 += src1Stride, dst += dstStride)
    {
        for (int col = 0; col < bx; col += 2)
        {
            // samples are handled in pairs, as in the original loop
            for (int k = 0; k < 2; ++k)
                dst[col + k] = x265_clip((src0[col + k] + src1[col + k] + offset) >> shiftNum);
        }
    }
}
/** Copies a plane of 8-bit samples into a pixel plane, shifting each sample left.
 * \param src        8-bit source plane
 * \param srcStride  row stride of src, in samples
 * \param dst        destination pixel plane
 * \param dstStride  row stride of dst, in samples
 * \param width      samples per row
 * \param height     number of rows
 * \param shift      left-shift applied to every sample after conversion to pixel */
static void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
{
    for (int row = 0; row < height; ++row, dst += dstStride, src += srcStride)
    {
        for (int col = 0; col < width; ++col)
            dst[col] = ((pixel)src[col]) << shift;
    }
}
static void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
{
for (int r = 0; r < height; r++)
{
for (int c = 0; c < width; c++)
dst[c] = (pixel)((src[c] >> shift) & mask);
dst += dstStride;
src += srcStride;
}
}
static void planecopy_sp_shl_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
{
for (int r = 0; r < height; r++)
{
for (int c = 0; c < width; c++)
dst[c] = (pixel)((src[c] << shift) & mask);
dst += dstStride;
src += srcStride;
}
}
/** Compute the propagate cost for every entry of one row (one entry per 8x8 block,
 * per the original annotation):
 *
 *   dst[i] = (propagateIn[i] + intraCosts[i] * invQscales[i] * (*fpsFactor) / 256)
 *            * (intraCosts[i] - (interCosts[i] & 0x3FFF)) / intraCosts[i]
 *
 * rounded by adding 0.5 and truncating to int.
 *
 * Estimates the total amount of influence on future quality that could be had if we
 * were to improve the reference samples used to inter predict any given CU.
 *
 * \param dst          output: propagate cost for each entry of the row
 * \param propagateIn  accumulated propagate cost of each entry
 * \param intraCosts   intra cost of each entry; also the divisor, a zero entry is not guarded against
 * \param interCosts   best cost of each entry in the low 14 bits; the upper bits are masked off
 *                     (per the original annotation they flag the prediction type:
 *                     0 intra, 1 forward, 2 backward, 3 bi)
 * \param invQscales   AQ offsets of the row (per the original annotation, stored scaled by 256)
 * \param fpsFactor    pointer to the frame-rate factor of the current frame
 * \param len          number of entries in the row
 * \return             nothing; results are written to dst
 */
static void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
                                    const int32_t* invQscales, const double* fpsFactor, int len)
{
    /* the division by 256 undoes the fixed-point scaling of invQscales */
    const double fps = *fpsFactor / 256;
    const int costMask = (1 << 14) - 1;

    for (int i = 0; i < len; i++)
    {
        /* Widen to double BEFORE multiplying: the int32 * int32 product can exceed
         * INT_MAX, and signed overflow is undefined behaviour. */
        double intraCost = (double)intraCosts[i] * invQscales[i];
        /* accumulated propagate cost plus the fps-weighted, AQ-weighted intra cost */
        double propagateAmount = (double)propagateIn[i] + intraCost * fps;
        /* intra cost minus the best cost (low 14 bits of interCosts) */
        double propagateNum = (double)intraCosts[i] - (interCosts[i] & costMask);
        double propagateDenom = (double)intraCosts[i];
        dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
    }
}
} // end anonymous namespace
namespace X265_NS {
// x265 private namespace
/* Extend the edges of a picture so that it may safely be used for motion
 * compensation. This function assumes the picture is stored in a buffer with
 * sufficient padding for the X and Y margins.
 *
 * (Per the original annotation this is only called from weightAnalyse and
 * Lowres::init - not verifiable from this file.)
 *
 * \param pic      pointer to the first sample of the picture
 * \param stride   picture stride, in pixels
 * \param width    picture width
 * \param height   picture height
 * \param marginX  padding width on the left and right sides
 * \param marginY  padding height above and below
 * \return         nothing
 */
void extendPicBorder(pixel* pic, intptr_t stride, int width, int height, int marginX, int marginY)
{
    /* left and right margins are produced by the row-border primitive */
    primitives.extendRowBorder(pic, stride, width, height, marginX);

    /* every vertical copy moves one full stride worth of pixels */
    const size_t rowBytes = stride * sizeof(pixel);

    /* replicate the first row (starting at its left margin) into the rows above */
    pixel* const firstRow = pic - marginX;
    for (int n = 1; n <= marginY; n++)
        memcpy(firstRow - n * stride, firstRow, rowBytes);

    /* replicate the last row (starting at its left margin) into the rows below */
    pixel* const lastRow = firstRow + (height - 1) * stride;
    for (int n = 1; n <= marginY; n++)
        memcpy(lastRow + n * stride, lastRow, rowBytes);
}
/* Initialize entries for pixel functions defined in this file.
 *
 * Fills the function-pointer tables of `p` (luma PU/CU entries, the 4:2:0 and
 * 4:2:2 chroma PU/CU entries and a few frame-level primitives) with the C
 * implementations from this file.
 *
 * NOTE: the last line of each helper macro below must NOT end in a backslash;
 * a trailing backslash splices the following source line into the macro body. */
void setupPixelPrimitives_c(EncoderPrimitives &p)
{
    /* Registers the luma PU primitives for one partition size.
     * For example LUMA_PU(8, 8) expands to:
     *   p.pu[LUMA_8x8].copy_pp     = blockcopy_pp_c<8, 8>;
     *   p.pu[LUMA_8x8].addAvg      = addAvg<8, 8>;
     *   p.pu[LUMA_8x8].sad         = sad<8, 8>;
     *   p.pu[LUMA_8x8].sad_x3      = sad_x3<8, 8>;
     *   p.pu[LUMA_8x8].sad_x4      = sad_x4<8, 8>;
     *   p.pu[LUMA_8x8].pixelavg_pp = pixelavg_pp<8, 8>;
     */
#define LUMA_PU(W, H) \
    p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
    p.pu[LUMA_ ## W ## x ## H].addAvg = addAvg<W, H>; \
    p.pu[LUMA_ ## W ## x ## H].sad = sad<W, H>; \
    p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3<W, H>; \
    p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4<W, H>; \
    p.pu[LUMA_ ## W ## x ## H].pixelavg_pp = pixelavg_pp<W, H>;

    /* Registers the luma CU primitives for one square block size. */
#define LUMA_CU(W, H) \
    p.cu[BLOCK_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
    p.cu[BLOCK_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>; \
    p.cu[BLOCK_ ## W ## x ## H].copy_sp = blockcopy_sp_c<W, H>; \
    p.cu[BLOCK_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
    p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
    p.cu[BLOCK_ ## W ## x ## H].blockfill_s = blockfill_s_c<W>; \
    p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl<W>; \
    p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shr = cpy2Dto1D_shr<W>; \
    p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl = cpy1Dto2D_shl<W>; \
    p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shr = cpy1Dto2D_shr<W>; \
    p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp = psyCost_pp<BLOCK_ ## W ## x ## H>; \
    p.cu[BLOCK_ ## W ## x ## H].psy_cost_ss = psyCost_ss<BLOCK_ ## W ## x ## H>; \
    p.cu[BLOCK_ ## W ## x ## H].transpose = transpose<W>; \
    p.cu[BLOCK_ ## W ## x ## H].ssd_s = pixel_ssd_s_c<W>; \
    p.cu[BLOCK_ ## W ## x ## H].var = pixel_var<W>; \
    p.cu[BLOCK_ ## W ## x ## H].calcresidual = getResidual<W>; \
    p.cu[BLOCK_ ## W ## x ## H].sse_pp = sse<W, H, pixel, pixel>; \
    p.cu[BLOCK_ ## W ## x ## H].sse_ss = sse<W, H, int16_t, int16_t>;

    LUMA_PU(4, 4);
    LUMA_PU(8, 8);
    LUMA_PU(16, 16);
    LUMA_PU(32, 32);
    LUMA_PU(64, 64);
    LUMA_PU(4, 8);
    LUMA_PU(8, 4);
    LUMA_PU(16, 8);
    LUMA_PU(8, 16);
    LUMA_PU(16, 12);
    LUMA_PU(12, 16);
    LUMA_PU(16, 4);
    LUMA_PU(4, 16);
    LUMA_PU(32, 16);
    LUMA_PU(16, 32);
    LUMA_PU(32, 24);
    LUMA_PU(24, 32);
    LUMA_PU(32, 8);
    LUMA_PU(8, 32);
    LUMA_PU(64, 32);
    LUMA_PU(32, 64);
    LUMA_PU(64, 48);
    LUMA_PU(48, 64);
    LUMA_PU(64, 16);
    LUMA_PU(16, 64);

    p.pu[LUMA_4x4].satd = satd_4x4;
    p.pu[LUMA_8x8].satd = satd8<8, 8>;
    p.pu[LUMA_8x4].satd = satd_8x4;
    p.pu[LUMA_4x8].satd = satd4<4, 8>;
    p.pu[LUMA_16x16].satd = satd8<16, 16>;
    p.pu[LUMA_16x8].satd = satd8<16, 8>;
    p.pu[LUMA_8x16].satd = satd8<8, 16>;
    p.pu[LUMA_16x12].satd = satd8<16, 12>;
    p.pu[LUMA_12x16].satd = satd4<12, 16>;
    p.pu[LUMA_16x4].satd = satd8<16, 4>;
    p.pu[LUMA_4x16].satd = satd4<4, 16>;
    p.pu[LUMA_32x32].satd = satd8<32, 32>;
    p.pu[LUMA_32x16].satd = satd8<32, 16>;
    p.pu[LUMA_16x32].satd = satd8<16, 32>;
    p.pu[LUMA_32x24].satd = satd8<32, 24>;
    p.pu[LUMA_24x32].satd = satd8<24, 32>;
    p.pu[LUMA_32x8].satd = satd8<32, 8>;
    p.pu[LUMA_8x32].satd = satd8<8, 32>;
    p.pu[LUMA_64x64].satd = satd8<64, 64>;
    p.pu[LUMA_64x32].satd = satd8<64, 32>;
    p.pu[LUMA_32x64].satd = satd8<32, 64>;
    p.pu[LUMA_64x48].satd = satd8<64, 48>;
    p.pu[LUMA_48x64].satd = satd8<48, 64>;
    p.pu[LUMA_64x16].satd = satd8<64, 16>;
    p.pu[LUMA_16x64].satd = satd8<16, 64>;

    LUMA_CU(4, 4);
    LUMA_CU(8, 8);
    LUMA_CU(16, 16);
    LUMA_CU(32, 32);
    LUMA_CU(64, 64);

    p.cu[BLOCK_4x4].sa8d = satd_4x4;
    p.cu[BLOCK_8x8].sa8d = sa8d_8x8;
    p.cu[BLOCK_16x16].sa8d = sa8d_16x16;
    p.cu[BLOCK_32x32].sa8d = sa8d16<32, 32>;
    p.cu[BLOCK_64x64].sa8d = sa8d16<64, 64>;

    /* Registers the 4:2:0 chroma PU primitives for one partition size.
     * For example CHROMA_PU_420(4, 4) expands to:
     *   p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg  = addAvg<4, 4>;
     *   p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].copy_pp = blockcopy_pp_c<4, 4>;
     */
#define CHROMA_PU_420(W, H) \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg = addAvg<W, H>; \
    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>;

    CHROMA_PU_420(2, 2);
    CHROMA_PU_420(2, 4);
    CHROMA_PU_420(4, 4);
    CHROMA_PU_420(8, 8);
    CHROMA_PU_420(16, 16);
    CHROMA_PU_420(32, 32);
    CHROMA_PU_420(4, 2);
    CHROMA_PU_420(8, 4);
    CHROMA_PU_420(4, 8);
    CHROMA_PU_420(8, 6);
    CHROMA_PU_420(6, 8);
    CHROMA_PU_420(8, 2);
    CHROMA_PU_420(2, 8);
    CHROMA_PU_420(16, 8);
    CHROMA_PU_420(8, 16);
    CHROMA_PU_420(16, 12);
    CHROMA_PU_420(12, 16);
    CHROMA_PU_420(16, 4);
    CHROMA_PU_420(4, 16);
    CHROMA_PU_420(32, 16);
    CHROMA_PU_420(16, 32);
    CHROMA_PU_420(32, 24);
    CHROMA_PU_420(24, 32);
    CHROMA_PU_420(32, 8);
    CHROMA_PU_420(8, 32);

    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x2].satd = NULL;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = satd_4x4;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = satd8<8, 8>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = satd8<16, 16>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = satd8<32, 32>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].satd = NULL;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].satd = NULL;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = satd_8x4;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = satd4<4, 8>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = satd8<16, 8>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = satd8<8, 16>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = satd8<32, 16>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = satd8<16, 32>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].satd = NULL;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].satd = NULL;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].satd = NULL;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].satd = NULL;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = satd4<16, 12>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = satd4<12, 16>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = satd4<16, 4>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = satd4<4, 16>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = satd8<32, 24>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = satd8<24, 32>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = satd8<32, 8>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = satd8<8, 32>;

    /* Registers the 4:2:0 chroma CU primitives for one block size.
     * The macro body already ends in ';', so the invocations below carry none. */
#define CHROMA_CU_420(W, H) \
    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sse_pp = sse<W, H, pixel, pixel>; \
    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_sp = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;

    CHROMA_CU_420(2, 2)
    CHROMA_CU_420(4, 4)
    CHROMA_CU_420(8, 8)
    CHROMA_CU_420(16, 16)
    CHROMA_CU_420(32, 32)

    p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd;
    p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = sa8d8<8, 8>;
    p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = sa8d16<16, 16>;
    p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d16<32, 32>;

    /* Registers the 4:2:2 chroma PU primitives for one partition size. */
#define CHROMA_PU_422(W, H) \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg = addAvg<W, H>; \
    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>;

    CHROMA_PU_422(2, 4);
    CHROMA_PU_422(4, 8);
    CHROMA_PU_422(8, 16);
    CHROMA_PU_422(16, 32);
    CHROMA_PU_422(32, 64);
    CHROMA_PU_422(4, 4);
    CHROMA_PU_422(2, 8);
    CHROMA_PU_422(8, 8);
    CHROMA_PU_422(4, 16);
    CHROMA_PU_422(8, 12);
    CHROMA_PU_422(6, 16);
    CHROMA_PU_422(8, 4);
    CHROMA_PU_422(2, 16);
    CHROMA_PU_422(16, 16);
    CHROMA_PU_422(8, 32);
    CHROMA_PU_422(16, 24);
    CHROMA_PU_422(12, 32);
    CHROMA_PU_422(16, 8);
    CHROMA_PU_422(4, 32);
    CHROMA_PU_422(32, 32);
    CHROMA_PU_422(16, 64);
    CHROMA_PU_422(32, 48);
    CHROMA_PU_422(24, 64);
    CHROMA_PU_422(32, 16);
    CHROMA_PU_422(8, 64);

    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].satd = NULL;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = satd4<4, 8>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = satd8<8, 16>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = satd8<16, 32>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = satd8<32, 64>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = satd_4x4;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].satd = NULL;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = satd8<8, 8>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = satd4<4, 16>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = satd8<16, 16>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = satd8<8, 32>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = satd8<32, 32>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = satd8<16, 64>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = satd4<8, 12>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].satd = NULL;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = satd4<8, 4>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].satd = NULL;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = satd8<16, 24>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = satd4<12, 32>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = satd8<16, 8>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = satd4<4, 32>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = satd8<32, 48>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = satd8<24, 64>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = satd8<32, 16>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = satd8<8, 64>;

    /* Registers the 4:2:2 chroma CU primitives for one block size.
     * The macro body already ends in ';', so the invocations below carry none. */
#define CHROMA_CU_422(W, H) \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sse_pp = sse<W, H, pixel, pixel>; \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_sp = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;

    CHROMA_CU_422(2, 4)
    CHROMA_CU_422(4, 8)
    CHROMA_CU_422(8, 16)
    CHROMA_CU_422(16, 32)
    CHROMA_CU_422(32, 64)

    p.chroma[X265_CSP_I422].cu[BLOCK_8x8].sa8d = p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd;
    p.chroma[X265_CSP_I422].cu[BLOCK_16x16].sa8d = sa8d8<8, 16>;
    p.chroma[X265_CSP_I422].cu[BLOCK_32x32].sa8d = sa8d16<16, 32>;
    p.chroma[X265_CSP_I422].cu[BLOCK_64x64].sa8d = sa8d16<32, 64>;

    /* frame-level and miscellaneous primitives */
    p.weight_pp = weight_pp_c;
    p.weight_sp = weight_sp_c;
    p.scale1D_128to64 = scale1D_128to64;
    p.scale2D_64to32 = scale2D_64to32;
    p.frameInitLowres = frame_init_lowres_core;
    p.ssim_4x4x2_core = ssim_4x4x2_core;
    p.ssim_end_4 = ssim_end_4;
    p.planecopy_cp = planecopy_cp_c;
    p.planecopy_sp = planecopy_sp_c;
    p.planecopy_sp_shl = planecopy_sp_shl_c;
    p.propagateCost = estimateCUPropagateCost;
}
}