HM编码器代码阅读(30)——帧间预测之AMVP模式(五)运动估计

时间:2022-02-03 11:34:27

运动估计

    

    通过 点击打开链接 介绍的方法得到MVP之后,可以根据该MVP确定运动估计的搜索起点,然后进行运动估计

    xMotionEstimation就是进行运动估计的入口函数
    1、先进行一些初始化,设置搜索范围
    2、如果是B类型的slice,或者没有使用快速搜索模式,那么调用xPatternSearch进行整像素精度的全搜索
    3、如果是P类型的slice或者使用了快速搜索模式,那么调用xPatternSearchFast进行整像素精度的快速搜索
    4、调用xPatternSearchFracDIF进行亚像素精度的搜索
    5、得到最优的MV,计算它的率失真代价等

Void TEncSearch::xMotionEstimation( TComDataCU* pcCU, TComYuv* pcYuvOrg, Int iPartIdx, RefPicList eRefPicList, TComMv* pcMvPred, Int iRefIdxPred, TComMv& rcMv, UInt& ruiBits, UInt& ruiCost, Bool bBi  )
{
UInt uiPartAddr;
Int iRoiWidth;
Int iRoiHeight;

TComMv cMvHalf, cMvQter;
TComMv cMvSrchRngLT;
TComMv cMvSrchRngRB;

TComYuv* pcYuv = pcYuvOrg;

// 搜索的范围
m_iSearchRange = m_aaiAdaptSR[eRefPicList][iRefIdxPred];

Int iSrchRng = ( bBi ? m_bipredSearchRange : m_iSearchRange );

// TComPattern是用于访问相邻块/像素的一个工具类
TComPattern* pcPatternKey = pcCU->getPattern ();

Double fWeight = 1.0;

// 得到PU的索引和尺寸
pcCU->getPartIndexAndSize( iPartIdx, uiPartAddr, iRoiWidth, iRoiHeight );

// 如果是B slice
if ( bBi )
{
TComYuv* pcYuvOther = &m_acYuvPred[1-(Int)eRefPicList];
pcYuv = &m_cYuvPredTemp;

pcYuvOrg->copyPartToPartYuv( pcYuv, uiPartAddr, iRoiWidth, iRoiHeight );

pcYuv->removeHighFreq( pcYuvOther, uiPartAddr, iRoiWidth, iRoiHeight );

fWeight = 0.5;
}

// Search key pattern initialization
// 访问相邻块之前进行一些地址方面的初始化
pcPatternKey->initPattern( pcYuv->getLumaAddr( uiPartAddr ),
pcYuv->getCbAddr ( uiPartAddr ),
pcYuv->getCrAddr ( uiPartAddr ),
iRoiWidth,
iRoiHeight,
pcYuv->getStride(),
0, 0 );
// 参考帧的像素
Pel* piRefY = pcCU->getSlice()->getRefPic( eRefPicList, iRefIdxPred )->getPicYuvRec()->getLumaAddr( pcCU->getAddr(), pcCU->getZorderIdxInCU() + uiPartAddr );
Int iRefStride = pcCU->getSlice()->getRefPic( eRefPicList, iRefIdxPred )->getPicYuvRec()->getStride();

// MVP
TComMv cMvPred = *pcMvPred;

// 设置运动估计的搜索范围
if ( bBi )
xSetSearchRange ( pcCU, rcMv , iSrchRng, cMvSrchRngLT, cMvSrchRngRB );
else
xSetSearchRange ( pcCU, cMvPred, iSrchRng, cMvSrchRngLT, cMvSrchRngRB );

// 取得率失真代价
m_pcRdCost->getMotionCost ( 1, 0 );

// 设置预测得到的MV
m_pcRdCost->setPredictor ( *pcMvPred );
m_pcRdCost->setCostScale ( 2 );

// 权重预测方面的设置
setWpScalingDistParam( pcCU, iRefIdxPred, eRefPicList );

// Do integer search
// 先进行整像素搜索,确定一个局部的最佳值
if ( !m_iFastSearch || bBi )
{
// 如果是B slice 或者 不是快速搜索模式,进行常规搜索
xPatternSearch ( pcPatternKey, piRefY, iRefStride, &cMvSrchRngLT, &cMvSrchRngRB, rcMv, ruiCost );
}
else
{
// 进行快速搜索
rcMv = *pcMvPred;
xPatternSearchFast ( pcCU, pcPatternKey, piRefY, iRefStride, &cMvSrchRngLT, &cMvSrchRngRB, rcMv, ruiCost );
}

m_pcRdCost->getMotionCost( 1, 0 );
m_pcRdCost->setCostScale ( 1 );

// 进行亚像素搜索(1/2像素、1/4像素等),以提高搜索的精度
xPatternSearchFracDIF( pcCU, pcPatternKey, piRefY, iRefStride, &rcMv, cMvHalf, cMvQter, ruiCost,bBi );

m_pcRdCost->setCostScale( 0 );
rcMv <<= 2; // 整像素
rcMv += (cMvHalf <<= 1); // 1/2像素
rcMv += cMvQter;// 1/4像素

UInt uiMvBits = m_pcRdCost->getBits( rcMv.getHor(), rcMv.getVer() );

ruiBits += uiMvBits;
ruiCost = (UInt)( floor( fWeight * ( (Double)ruiCost - (Double)m_pcRdCost->getCost( uiMvBits ) ) ) + (Double)m_pcRdCost->getCost( ruiBits ) );
}


整像素运动估计


整像素精度的全搜索算法


Void TEncSearch::xPatternSearch( TComPattern* pcPatternKey, Pel* piRefY, Int iRefStride, TComMv* pcMvSrchRngLT, TComMv* pcMvSrchRngRB, TComMv& rcMv, UInt& ruiSAD )
{
Int iSrchRngHorLeft = pcMvSrchRngLT->getHor();
Int iSrchRngHorRight = pcMvSrchRngRB->getHor();
Int iSrchRngVerTop = pcMvSrchRngLT->getVer();
Int iSrchRngVerBottom = pcMvSrchRngRB->getVer();

UInt uiSad;
UInt uiSadBest = MAX_UINT;
Int iBestX = 0;
Int iBestY = 0;

Pel* piRefSrch;

//-- jclee for using the SAD function pointer
// 设置计算SAD的函数指针和参数
m_pcRdCost->setDistParam( pcPatternKey, piRefY, iRefStride, m_cDistParam );

// fast encoder decision: use subsampled SAD for integer ME
// 快速编码决策:使用亚像素的SAD计算
if ( m_pcEncCfg->getUseFastEnc() )
{
if ( m_cDistParam.iRows > 8 )
{
m_cDistParam.iSubShift = 1;
}
}

piRefY += (iSrchRngVerTop * iRefStride);

// 在预定的范围内进行搜索
for ( Int y = iSrchRngVerTop; y <= iSrchRngVerBottom; y++ )
{
for ( Int x = iSrchRngHorLeft; x <= iSrchRngHorRight; x++ )
{
// find min. distortion position

// piRefY是根据MVP得到的搜索起点,要在起点周围进行搜索

// 得到该位置的像素块
piRefSrch = piRefY + x;
m_cDistParam.pCur = piRefSrch;

setDistParamComp(0);

m_cDistParam.bitDepth = g_bitDepthY;

// 计算当前块作为参考块时的SAD
uiSad = m_cDistParam.DistFunc( &m_cDistParam );

// motion cost
uiSad += m_pcRdCost->getCost( x, y );

// 更新最优代价,以及坐标
if ( uiSad < uiSadBest )
{
uiSadBest = uiSad;
iBestX = x;
iBestY = y;
}
}
piRefY += iRefStride;
}

// 最优的MV
rcMv.set( iBestX, iBestY );

ruiSAD = uiSadBest - m_pcRdCost->getCost( iBestX, iBestY );
return;
}


使用了TZ算法的快速整像素运动估计

    TZSearch搜索算法的大概流程(具体代码实现可能会有点偏差):
    1、确定搜索起点。利用AMVP得到的MVP确定搜索起点。AMVP选出的候选MV有多个,选择其中代价最小的一个最为预测MV,并作为搜索的起始点
    2、以步长1开始,按照菱形或者正方形搜索模板,在搜索范围内进行搜索,然后按照2的指数次幂的方式递增步长,然后选出率失真代价最优的点作为搜索结果
    3、以2得到的搜索结果作为搜素起始点,以步长1开始,在该点的周围做两点搜索,目的是补充搜索最优点周围尚未搜索的点(从这一步开始可能就需要进行运动补偿了)
    4、如果3得到的最优结果对应额步长大于某个阈值,那么以这个点位中心,在一定的范围内做全搜索,然后再次选择最优点
    5、以4得到的最优点为新的起始点,重复2~4,细化搜索,当相邻两次的搜索得到的最优点一致的时候停止搜索


    xPatternSearchFast(快速搜索)直接调用了xTZSearch,这就是HM中使用的搜索算法

Void TEncSearch::xTZSearch( TComDataCU* pcCU, TComPattern* pcPatternKey, Pel* piRefY, Int iRefStride, TComMv* pcMvSrchRngLT, TComMv* pcMvSrchRngRB, TComMv& rcMv, UInt& ruiSAD )
{
// 搜索的范围
Int iSrchRngHorLeft = pcMvSrchRngLT->getHor();
Int iSrchRngHorRight = pcMvSrchRngRB->getHor();
Int iSrchRngVerTop = pcMvSrchRngLT->getVer();
Int iSrchRngVerBottom = pcMvSrchRngRB->getVer();

// 该宏定义了一系列的变量,是TZ算法的配置信息
TZ_SEARCH_CONFIGURATION

UInt uiSearchRange = m_iSearchRange;
pcCU->clipMv( rcMv );
rcMv >>= 2;
// init TZSearchStruct
// TZ结构,用于保存搜索参数和结果
IntTZSearchStruct cStruct;
cStruct.iYStride = iRefStride;
cStruct.piRefY = piRefY;
cStruct.uiBestSad = MAX_UINT;

// set rcMv (Median predictor) as start point and as best point
// 计算以起始MV(即MVP)的SAD,xTZSearchHelp是一个计算SAD并更新最优代价的函数
xTZSearchHelp( pcPatternKey, cStruct, rcMv.getHor(), rcMv.getVer(), 0, 0 );

// test whether one of PRED_A, PRED_B, PRED_C MV is better start point than Median predictor
// 系统中预定义了三个MV,计算这三个MV对应的SAD
if ( bTestOtherPredictedMV )
{
for ( UInt index = 0; index < 3; index++ )
{
TComMv cMv = m_acMvPredictors[index];
pcCU->clipMv( cMv );
cMv >>= 2;
xTZSearchHelp( pcPatternKey, cStruct, cMv.getHor(), cMv.getVer(), 0, 0 );
}
}

// test whether zero Mv is better start point than Median predictor
// 尝试零MV
if ( bTestZeroVector )
{
xTZSearchHelp( pcPatternKey, cStruct, 0, 0, 0, 0 );
}

// start search
Int iDist = 0;
Int iStartX = cStruct.iBestX;
Int iStartY = cStruct.iBestY;

// first search
// 第一次搜索
// 以步长1开始,按照菱形或者正方形搜索模板,在搜索范围内进行搜索,然后按照2的指数次幂的方式递增步长
for ( iDist = 1; iDist <= (Int)uiSearchRange; iDist*=2 )
{
// 棱形模板搜索
if ( bFirstSearchDiamond == 1 )
{
xTZ8PointDiamondSearch ( pcPatternKey, cStruct, pcMvSrchRngLT, pcMvSrchRngRB, iStartX, iStartY, iDist );
}
// 正方形模板搜索
else
{
xTZ8PointSquareSearch ( pcPatternKey, cStruct, pcMvSrchRngLT, pcMvSrchRngRB, iStartX, iStartY, iDist );
}

if ( bFirstSearchStop && ( cStruct.uiBestRound >= uiFirstSearchRounds ) ) // stop criterion
{
break;
}
}

// test whether zero Mv is a better start point than Median predictor
// 尝试零MV是否更好
if ( bTestZeroVectorStart && ((cStruct.iBestX != 0) || (cStruct.iBestY != 0)) )
{
xTZSearchHelp( pcPatternKey, cStruct, 0, 0, 0, 0 );
if ( (cStruct.iBestX == 0) && (cStruct.iBestY == 0) )
{
// test its neighborhood
for ( iDist = 1; iDist <= (Int)uiSearchRange; iDist*=2 )
{
xTZ8PointDiamondSearch( pcPatternKey, cStruct, pcMvSrchRngLT, pcMvSrchRngRB, 0, 0, iDist );
if ( bTestZeroVectorStop && (cStruct.uiBestRound > 0) ) // stop criterion
{
break;
}
}
}
}

// calculate only 2 missing points instead 8 points if cStruct.uiBestDistance == 1
// 第二次搜索
// 做两点搜索
if ( cStruct.uiBestDistance == 1 )
{
cStruct.uiBestDistance = 0;
xTZ2PointSearch( pcPatternKey, cStruct, pcMvSrchRngLT, pcMvSrchRngRB );
}

// raster search if distance is too big
// 第三次搜索
// 如果第二次搜索得到的最优结果对应额步长大于某个阈值,那么以这个点位中心,在一定的范围内做全搜索,然后再次选择最优点
if ( bEnableRasterSearch && ( ((Int)(cStruct.uiBestDistance) > iRaster) || bAlwaysRasterSearch ) )
{
cStruct.uiBestDistance = iRaster;
for ( iStartY = iSrchRngVerTop; iStartY <= iSrchRngVerBottom; iStartY += iRaster )
{
for ( iStartX = iSrchRngHorLeft; iStartX <= iSrchRngHorRight; iStartX += iRaster )
{
xTZSearchHelp( pcPatternKey, cStruct, iStartX, iStartY, 0, iRaster );
}
}
}

// raster refinement
// 第四次搜索
// 以第三次搜索得到的最优点为新的起始点,重复2~4,细化搜索,当相邻两次的搜索得到的最优点一致的时候停止搜索
if ( bRasterRefinementEnable && cStruct.uiBestDistance > 0 )
{
while ( cStruct.uiBestDistance > 0 )
{
iStartX = cStruct.iBestX;
iStartY = cStruct.iBestY;
if ( cStruct.uiBestDistance > 1 )
{
iDist = cStruct.uiBestDistance >>= 1;
if ( bRasterRefinementDiamond == 1 )
{
xTZ8PointDiamondSearch ( pcPatternKey, cStruct, pcMvSrchRngLT, pcMvSrchRngRB, iStartX, iStartY, iDist );
}
else
{
xTZ8PointSquareSearch ( pcPatternKey, cStruct, pcMvSrchRngLT, pcMvSrchRngRB, iStartX, iStartY, iDist );
}
}

// calculate only 2 missing points instead 8 points if cStruct.uiBestDistance == 1
if ( cStruct.uiBestDistance == 1 )
{
cStruct.uiBestDistance = 0;
if ( cStruct.ucPointNr != 0 )
{
xTZ2PointSearch( pcPatternKey, cStruct, pcMvSrchRngLT, pcMvSrchRngRB );
}
}
}
}

// start refinement
// 第五次搜索
// 以第四次搜索得到的最优点为新的起始点,重复2~4,细化搜索,当相邻两次的搜索得到的最优点一致的时候停止搜索
if ( bStarRefinementEnable && cStruct.uiBestDistance > 0 )
{
while ( cStruct.uiBestDistance > 0 )
{
iStartX = cStruct.iBestX;
iStartY = cStruct.iBestY;
cStruct.uiBestDistance = 0;
cStruct.ucPointNr = 0;
for ( iDist = 1; iDist < (Int)uiSearchRange + 1; iDist*=2 )
{
if ( bStarRefinementDiamond == 1 )
{
xTZ8PointDiamondSearch ( pcPatternKey, cStruct, pcMvSrchRngLT, pcMvSrchRngRB, iStartX, iStartY, iDist );
}
else
{
xTZ8PointSquareSearch ( pcPatternKey, cStruct, pcMvSrchRngLT, pcMvSrchRngRB, iStartX, iStartY, iDist );
}
if ( bStarRefinementStop && (cStruct.uiBestRound >= uiStarRefinementRounds) ) // stop criterion
{
break;
}
}

// calculate only 2 missing points instead 8 points if cStrukt.uiBestDistance == 1
if ( cStruct.uiBestDistance == 1 )
{
cStruct.uiBestDistance = 0;
if ( cStruct.ucPointNr != 0 )
{
xTZ2PointSearch( pcPatternKey, cStruct, pcMvSrchRngLT, pcMvSrchRngRB );
}
}
}
}

// write out best match
rcMv.set( cStruct.iBestX, cStruct.iBestY );
ruiSAD = cStruct.uiBestSad - m_pcRdCost->getCost( cStruct.iBestX, cStruct.iBestY );
}
// TZ搜索的帮助函数__inline Void TEncSearch::xTZSearchHelp( TComPattern* pcPatternKey, IntTZSearchStruct& rcStruct, const Int iSearchX, const Int iSearchY, const UChar ucPointNr, const UInt uiDistance ){UInt  uiSad;Pel*  piRefSrch;//  参考图像Y分量的起始地址piRefSrch = rcStruct.piRefY + iSearchY * rcStruct.iYStride + iSearchX;//-- jclee for using the SAD function pointer// 该函数主要职能是设置计算SAD的函数指针m_pcRdCost->setDistParam( pcPatternKey, piRefSrch, rcStruct.iYStride,  m_cDistParam );// fast encoder decision: use subsampled SAD when rows > 8 for integer ME// 判断是否使用快速编码if ( m_pcEncCfg->getUseFastEnc() ){if ( m_cDistParam.iRows > 8 ){m_cDistParam.iSubShift = 1;}}// Y分量setDistParamComp(0);  // Y component// distortionm_cDistParam.bitDepth = g_bitDepthY;// 计算saduiSad = m_cDistParam.DistFunc( &m_cDistParam );// motion cost// 考虑上mv本身带来的开销uiSad += m_pcRdCost->getCost( iSearchX, iSearchY );// 更新最佳值 if( uiSad < rcStruct.uiBestSad ){rcStruct.uiBestSad      = uiSad;rcStruct.iBestX         = iSearchX;rcStruct.iBestY         = iSearchY;rcStruct.uiBestDistance = uiDistance;rcStruct.uiBestRound    = 0;rcStruct.ucPointNr      = ucPointNr;}}

亚像素精度运动估计

入口函数

入口函数是xPatternSearchFracDIF,流程如下:

1、调用xExtDIFUpSamplingH,产生1/2精度的插值像素块
2、调用xPatternRefinement,进行1/2精度的亚像素运动估计
3、调用xExtDIFUpSamplingQ,产生1/4精度的插值像素块
4、调用xPatternRefinement,进行1/4精度的亚像素运动估计

Void TEncSearch::xPatternSearchFracDIF(TComDataCU* pcCU,
TComPattern* pcPatternKey,
Pel* piRefY,
Int iRefStride,
TComMv* pcMvInt,
TComMv& rcMvHalf,
TComMv& rcMvQter,
UInt& ruiCost
,Bool biPred
)
{
// Reference pattern initialization (integer scale)
TComPattern cPatternRoi;
Int iOffset = pcMvInt->getHor() + pcMvInt->getVer() * iRefStride;
cPatternRoi.initPattern( piRefY + iOffset,
NULL,
NULL,
pcPatternKey->getROIYWidth(),
pcPatternKey->getROIYHeight(),
iRefStride,
0, 0 );

// Half-pel refinement
xExtDIFUpSamplingH ( &cPatternRoi, biPred );

rcMvHalf = *pcMvInt; rcMvHalf <<= 1; // for mv-cost
TComMv baseRefMv(0, 0);

// 1/2精度的亚像素运动估计
ruiCost = xPatternRefinement( pcPatternKey, baseRefMv, 2, rcMvHalf );

m_pcRdCost->setCostScale( 0 );

xExtDIFUpSamplingQ ( &cPatternRoi, rcMvHalf, biPred );
baseRefMv = rcMvHalf;
baseRefMv <<= 1;

rcMvQter = *pcMvInt; rcMvQter <<= 1; // for mv-cost
rcMvQter += rcMvHalf; rcMvQter <<= 1;

// 1/4精度的亚像素运动估计
ruiCost = xPatternRefinement( pcPatternKey, baseRefMv, 1, rcMvQter );
}


亚像素精度的运动估计函数

1、初始化,设置率失真代价计算的函数指针以及参数
2、s_acMvRefineH和s_acMvRefineQ是HEVC预定义的两个数组,保存了偏移坐标偏移量,专门用于亚像素精度的运动估计
3、根据iFrac参数选取s_acMvRefineH或者s_acMvRefineQ作为偏移数组
4、遍历偏移数组,对于每一个偏移值,把它加到已经存在的最优MV上,然后进行率失真代价计算,更新最优值
5、最后得到最优的MV

UInt TEncSearch::xPatternRefinement( TComPattern* pcPatternKey,
TComMv baseRefMv,
Int iFrac, TComMv& rcMvFrac )
{
UInt uiDist;
UInt uiDistBest = MAX_UINT;
UInt uiDirecBest = 0;

Pel* piRefPos;
Int iRefStride = m_filteredBlock[0][0].getStride();
m_pcRdCost->setDistParam( pcPatternKey, m_filteredBlock[0][0].getLumaAddr(), iRefStride, 1, m_cDistParam, m_pcEncCfg->getUseHADME() );

const TComMv* pcMvRefine = (iFrac == 2 ? s_acMvRefineH : s_acMvRefineQ);

for (UInt i = 0; i < 9; i++)
{
TComMv cMvTest = pcMvRefine[i];
cMvTest += baseRefMv;

Int horVal = cMvTest.getHor() * iFrac;
Int verVal = cMvTest.getVer() * iFrac;
piRefPos = m_filteredBlock[ verVal & 3 ][ horVal & 3 ].getLumaAddr();
if ( horVal == 2 && ( verVal & 1 ) == 0 )
{
piRefPos += 1;
}
if ( ( horVal & 1 ) == 0 && verVal == 2 )
{
piRefPos += iRefStride;
}
cMvTest = pcMvRefine[i];
cMvTest += rcMvFrac;

setDistParamComp(0); // Y component

m_cDistParam.pCur = piRefPos;
m_cDistParam.bitDepth = g_bitDepthY;
uiDist = m_cDistParam.DistFunc( &m_cDistParam );
uiDist += m_pcRdCost->getCost( cMvTest.getHor(), cMvTest.getVer() );

if ( uiDist < uiDistBest )
{
uiDistBest = uiDist;
uiDirecBest = i;
}
}

// mv更新
rcMvFrac = pcMvRefine[uiDirecBest];

return uiDistBest;
}



产生1/2精度的插值像素块

Void TEncSearch::xExtDIFUpSamplingH( TComPattern* pattern, Bool biPred )
{
Int width = pattern->getROIYWidth();
Int height = pattern->getROIYHeight();
Int srcStride = pattern->getPatternLStride();

Int intStride = m_filteredBlockTmp[0].getStride();
Int dstStride = m_filteredBlock[0][0].getStride();
Short *intPtr;
Short *dstPtr;
Int filterSize = NTAPS_LUMA;
Int halfFilterSize = (filterSize>>1);
Pel *srcPtr = pattern->getROIY() - halfFilterSize*srcStride - 1;

m_if.filterHorLuma(srcPtr, srcStride, m_filteredBlockTmp[0].getLumaAddr(), intStride, width+1, height+filterSize, 0, false);
m_if.filterHorLuma(srcPtr, srcStride, m_filteredBlockTmp[2].getLumaAddr(), intStride, width+1, height+filterSize, 2, false);

intPtr = m_filteredBlockTmp[0].getLumaAddr() + halfFilterSize * intStride + 1;
dstPtr = m_filteredBlock[0][0].getLumaAddr();
m_if.filterVerLuma(intPtr, intStride, dstPtr, dstStride, width+0, height+0, 0, false, true);

intPtr = m_filteredBlockTmp[0].getLumaAddr() + (halfFilterSize-1) * intStride + 1;
dstPtr = m_filteredBlock[2][0].getLumaAddr();
m_if.filterVerLuma(intPtr, intStride, dstPtr, dstStride, width+0, height+1, 2, false, true);

intPtr = m_filteredBlockTmp[2].getLumaAddr() + halfFilterSize * intStride;
dstPtr = m_filteredBlock[0][2].getLumaAddr();
m_if.filterVerLuma(intPtr, intStride, dstPtr, dstStride, width+1, height+0, 0, false, true);

intPtr = m_filteredBlockTmp[2].getLumaAddr() + (halfFilterSize-1) * intStride;
dstPtr = m_filteredBlock[2][2].getLumaAddr();
m_if.filterVerLuma(intPtr, intStride, dstPtr, dstStride, width+1, height+1, 2, false, true);
}

产生1/4精度的插值像素块

Void TEncSearch::xExtDIFUpSamplingQ( TComPattern* pattern, TComMv halfPelRef, Bool biPred )
{
Int width = pattern->getROIYWidth();
Int height = pattern->getROIYHeight();
Int srcStride = pattern->getPatternLStride();

Pel *srcPtr;
Int intStride = m_filteredBlockTmp[0].getStride();
Int dstStride = m_filteredBlock[0][0].getStride();
Short *intPtr;
Short *dstPtr;
Int filterSize = NTAPS_LUMA;

Int halfFilterSize = (filterSize>>1);

Int extHeight = (halfPelRef.getVer() == 0) ? height + filterSize : height + filterSize-1;

// Horizontal filter 1/4
srcPtr = pattern->getROIY() - halfFilterSize * srcStride - 1;
intPtr = m_filteredBlockTmp[1].getLumaAddr();
if (halfPelRef.getVer() > 0)
{
srcPtr += srcStride;
}
if (halfPelRef.getHor() >= 0)
{
srcPtr += 1;
}
m_if.filterHorLuma(srcPtr, srcStride, intPtr, intStride, width, extHeight, 1, false);

// Horizontal filter 3/4
srcPtr = pattern->getROIY() - halfFilterSize*srcStride - 1;
intPtr = m_filteredBlockTmp[3].getLumaAddr();
if (halfPelRef.getVer() > 0)
{
srcPtr += srcStride;
}
if (halfPelRef.getHor() > 0)
{
srcPtr += 1;
}
m_if.filterHorLuma(srcPtr, srcStride, intPtr, intStride, width, extHeight, 3, false);

// Generate @ 1,1
intPtr = m_filteredBlockTmp[1].getLumaAddr() + (halfFilterSize-1) * intStride;
dstPtr = m_filteredBlock[1][1].getLumaAddr();
if (halfPelRef.getVer() == 0)
{
intPtr += intStride;
}
m_if.filterVerLuma(intPtr, intStride, dstPtr, dstStride, width, height, 1, false, true);

// Generate @ 3,1
intPtr = m_filteredBlockTmp[1].getLumaAddr() + (halfFilterSize-1) * intStride;
dstPtr = m_filteredBlock[3][1].getLumaAddr();
m_if.filterVerLuma(intPtr, intStride, dstPtr, dstStride, width, height, 3, false, true);

if (halfPelRef.getVer() != 0)
{
// Generate @ 2,1
intPtr = m_filteredBlockTmp[1].getLumaAddr() + (halfFilterSize-1) * intStride;
dstPtr = m_filteredBlock[2][1].getLumaAddr();
if (halfPelRef.getVer() == 0)
{
intPtr += intStride;
}
m_if.filterVerLuma(intPtr, intStride, dstPtr, dstStride, width, height, 2, false, true);

// Generate @ 2,3
intPtr = m_filteredBlockTmp[3].getLumaAddr() + (halfFilterSize-1) * intStride;
dstPtr = m_filteredBlock[2][3].getLumaAddr();
if (halfPelRef.getVer() == 0)
{
intPtr += intStride;
}
m_if.filterVerLuma(intPtr, intStride, dstPtr, dstStride, width, height, 2, false, true);
}
else
{
// Generate @ 0,1
intPtr = m_filteredBlockTmp[1].getLumaAddr() + halfFilterSize * intStride;
dstPtr = m_filteredBlock[0][1].getLumaAddr();
m_if.filterVerLuma(intPtr, intStride, dstPtr, dstStride, width, height, 0, false, true);

// Generate @ 0,3
intPtr = m_filteredBlockTmp[3].getLumaAddr() + halfFilterSize * intStride;
dstPtr = m_filteredBlock[0][3].getLumaAddr();
m_if.filterVerLuma(intPtr, intStride, dstPtr, dstStride, width, height, 0, false, true);
}

if (halfPelRef.getHor() != 0)
{
// Generate @ 1,2
intPtr = m_filteredBlockTmp[2].getLumaAddr() + (halfFilterSize-1) * intStride;
dstPtr = m_filteredBlock[1][2].getLumaAddr();
if (halfPelRef.getHor() > 0)
{
intPtr += 1;
}
if (halfPelRef.getVer() >= 0)
{
intPtr += intStride;
}
m_if.filterVerLuma(intPtr, intStride, dstPtr, dstStride, width, height, 1, false, true);

// Generate @ 3,2
intPtr = m_filteredBlockTmp[2].getLumaAddr() + (halfFilterSize-1) * intStride;
dstPtr = m_filteredBlock[3][2].getLumaAddr();
if (halfPelRef.getHor() > 0)
{
intPtr += 1;
}
if (halfPelRef.getVer() > 0)
{
intPtr += intStride;
}
m_if.filterVerLuma(intPtr, intStride, dstPtr, dstStride, width, height, 3, false, true);
}
else
{
// Generate @ 1,0
intPtr = m_filteredBlockTmp[0].getLumaAddr() + (halfFilterSize-1) * intStride + 1;
dstPtr = m_filteredBlock[1][0].getLumaAddr();
if (halfPelRef.getVer() >= 0)
{
intPtr += intStride;
}
m_if.filterVerLuma(intPtr, intStride, dstPtr, dstStride, width, height, 1, false, true);

// Generate @ 3,0
intPtr = m_filteredBlockTmp[0].getLumaAddr() + (halfFilterSize-1) * intStride + 1;
dstPtr = m_filteredBlock[3][0].getLumaAddr();
if (halfPelRef.getVer() > 0)
{
intPtr += intStride;
}
m_if.filterVerLuma(intPtr, intStride, dstPtr, dstStride, width, height, 3, false, true);
}

// Generate @ 1,3
intPtr = m_filteredBlockTmp[3].getLumaAddr() + (halfFilterSize-1) * intStride;
dstPtr = m_filteredBlock[1][3].getLumaAddr();
if (halfPelRef.getVer() == 0)
{
intPtr += intStride;
}
m_if.filterVerLuma(intPtr, intStride, dstPtr, dstStride, width, height, 1, false, true);

// Generate @ 3,3
intPtr = m_filteredBlockTmp[3].getLumaAddr() + (halfFilterSize-1) * intStride;
dstPtr = m_filteredBlock[3][3].getLumaAddr();
m_if.filterVerLuma(intPtr, intStride, dstPtr, dstStride, width, height, 3, false, true);
}