//
//  
//  Phocus
//
//  Created by Thomas Rogon on 10/05/17.
//  Copyright (c) 2017,2018 Hasselblad A/S. All rights reserved.
//

//#ifdef cl_khr_fp64
//#pragma OPENCL EXTENSION cl_khr_fp64 : enable
//#elif defined(cl_amd_fp64)
//#pragma OPENCL EXTENSION cl_amd_fp64 : enable
//#else
//#error "float precision floating point not supported by OpenCL implementation."
//#endif

// Sampler shared by all readImage* helpers in this file: unnormalized (texel)
// coordinates, out-of-range coordinates clamped to the nearest edge texel, and
// nearest-neighbour filtering (no interpolation).
const sampler_t bnrSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

// Reads the texel at integer coordinates `pos` from `imgin` through bnrSampler
// and returns its four unsigned channels converted to signed int.
int4 readImagei4(__read_only image2d_t imgin, int2 pos)
{
	// The sampler overload is used on purpose: the sampler-less
	// read_imageui(image, coord) form requires OpenCL 1.2, this one is fine on 1.1.
	return convert_int4(read_imageui(imgin, bnrSampler, pos));
}


// Reads the texel at integer coordinates `pos` from `imgin` through bnrSampler
// and returns its four channels narrowed to ushort (plain, non-saturating
// conversion).
ushort4 readImageus4(__read_only image2d_t imgin, int2 pos)
{
	// The sampler overload is used on purpose: the sampler-less
	// read_imageui(image, coord) form requires OpenCL 1.2, this one is fine on 1.1.
	return convert_ushort4(read_imageui(imgin, bnrSampler, pos));
}


// Reads the texel at integer coordinates `pos` from `imgin` through bnrSampler
// and returns only its first two channels (x, y) narrowed to ushort (plain,
// non-saturating conversion). Channels z and w are discarded.
ushort2 readImageus2(__read_only image2d_t imgin, int2 pos)
{
	// The sampler overload is used on purpose: the sampler-less
	// read_imageui(image, coord) form requires OpenCL 1.2, this one is fine on 1.1.
	const uint4 texel = read_imageui(imgin, bnrSampler, pos);
	return convert_ushort2(texel.xy);
}


// Neighbourhood statistics of the two green samples held by one texel.
// "hor" / "ver" / "diag" name where the neighbours sit relative to the sample.
typedef struct
{
	float2 val;      // the two green samples themselves
	float2 horAve;   // mean of the left and right neighbour of each sample
	float2 verAve;   // mean of the top and bottom neighbour of each sample
	float2 diagAve;  // mean of the four diagonal neighbours of each sample
	float2 luma;     // horAve + val + verAve
	float2 horDiff;  // horAve - diagAve
	float2 verDiff;  // verAve - diagAve
} BNR32GreenSite;

// Gathers the green-sample statistics for the texel at `p`.
//
// On a red row the green samples are components .y and .w of the texel; on
// the other rows they are .x and .z. (Presumably the image is a Bayer mosaic
// packed four samples per texel, RGRG on red rows - confirm against the host
// code.) Only six texels are needed: the texel itself, the ones directly above
// and below it, and the same three in ONE adjacent column - the right column
// on red rows (for the .w sample), the left column otherwise (for the .x
// sample).
BNR32GreenSite bnr32GreenSiteAt(__read_only image2d_t imgin, int2 p, bool isRedRow)
{
	const int side = isRedRow ? 1 : -1;
	const ushort4 mid = readImageus4(imgin, p);
	const ushort4 top = readImageus4(imgin, (int2)(p.x, p.y - 1));
	const ushort4 bot = readImageus4(imgin, (int2)(p.x, p.y + 1));
	const ushort4 sideMid = readImageus4(imgin, (int2)(p.x + side, p.y));
	const ushort4 sideTop = readImageus4(imgin, (int2)(p.x + side, p.y - 1));
	const ushort4 sideBot = readImageus4(imgin, (int2)(p.x + side, p.y + 1));

	BNR32GreenSite s;
	float2 horSum, verSum, diagSum;
	if (isRedRow)
	{
		// Diagonal pair in column .z is shared by both green samples (.y and .w).
		const float sharedDiag = (float)top.z + bot.z;
		s.val = (float2)(mid.y, mid.w);
		horSum = (float2)(mid.x + mid.z, mid.z + sideMid.x);
		verSum = (float2)(bot.y + top.y, bot.w + top.w);
		diagSum = (float2)(top.x + bot.x + sharedDiag, sideTop.x + sideBot.x + sharedDiag);
	}
	else
	{
		// Diagonal pair in column .y is shared by both green samples (.x and .z).
		const float sharedDiag = (float)top.y + bot.y;
		s.val = (float2)(mid.x, mid.z);
		horSum = (float2)(sideMid.w + mid.y, mid.y + mid.w);
		verSum = (float2)(bot.x + top.x, bot.z + top.z);
		diagSum = (float2)(sideTop.w + sideBot.w + sharedDiag, top.w + bot.w + sharedDiag);
	}
	s.horAve = horSum / 2;
	s.verAve = verSum / 2;
	s.diagAve = diagSum / 4;
	s.luma = s.horAve + s.val + s.verAve;
	s.horDiff = s.horAve - s.diagAve;
	s.verDiff = s.verAve - s.diagAve;
	return s;
}

// Noise-reduces the two green samples of the texel at `pos`.
//
// Every site in a search window around `pos` (rows -32..+32 in steps of 2,
// texel columns -8..+8) is compared against the centre site. A search sample
// whose luma, neighbour means and neighbour differences all lie within
// luma-dependent thresholds of a centre sample contributes to that centre
// sample's average. Each search texel holds two green samples and both are
// tested against both centre samples.
//
// imgin       source image, read through readImageus4
// pos         texel coordinates of the centre
// ofs         offset subtracted from all four components of the result
// lumaSqtThr  scale applied to sqrt(luma) to obtain the base threshold
// isRedRow    true when `pos` lies on a red row (green in .y/.w), false when
//             green is in .x/.z
//
// Returns the centre texel with its two green components replaced by the
// average of the matching samples (when more than one sample matched), minus
// `ofs`. The other two components are passed through (minus `ofs`).
int4 doBNR32Green(__read_only image2d_t imgin, int2 pos, const int ofs,
	const float lumaSqtThr, bool isRedRow)
{
	// Thresholds relative to the base threshold sqrt(luma) * lumaSqtThr.
	const float horRelThr = 1.f;
	const float verRelThr = 1.f;
	const float diagRelThr = .5f;
	const float diffRelThr = 3.f;
	// Search window: vertical reach in rows, horizontal reach in texels.
	const int reachY = 32;
	const int reachX = 32 / 4;
	const int stepY = 2;	// stepping by 2 keeps the row parity of `pos`
	const int stepX = 1;

	const ushort4 rgrgCenter = readImageus4(imgin, pos);
	const BNR32GreenSite c = bnr32GreenSiteAt(imgin, pos, isRedRow);

	const float2 lumaThr = sqrt(c.luma) * lumaSqtThr;
	const float2 horThr = lumaThr * horRelThr;
	const float2 verThr = lumaThr * verRelThr;
	const float2 diagThr = lumaThr * diagRelThr;
	const float2 diffThr = lumaThr * diffRelThr;

	float sum1 = 0;
	float sum2 = 0;
	int count1 = 0;
	int count2 = 0;
	for (int j = -reachY; j <= reachY; j += stepY)
	{
		for (int i = -reachX; i <= reachX; i += stepX)
		{
			const BNR32GreenSite s = bnr32GreenSiteAt(imgin, (int2)(pos.x + i, pos.y + j), isRedRow);

			// Vector comparisons yield -1 (true) / 0 (false) per component.
			// direct: search sample k against centre sample k.
			const int2 direct =
				(fabs(s.luma - c.luma) <= lumaThr) &
				(fabs(s.horAve - c.horAve) <= horThr) &
				(fabs(s.verAve - c.verAve) <= verThr) &
				(fabs(s.horDiff - c.horDiff) <= diffThr) &
				(fabs(s.verDiff - c.verDiff) <= diffThr) &
				(fabs(s.diagAve - c.diagAve) <= diagThr);
			// crossed: search sample 1 against centre sample 0 (.x) and
			// search sample 0 against centre sample 1 (.y).
			const int2 crossed =
				(fabs(s.luma.yx - c.luma) <= lumaThr) &
				(fabs(s.horAve.yx - c.horAve) <= horThr) &
				(fabs(s.verAve.yx - c.verAve) <= verThr) &
				(fabs(s.horDiff.yx - c.horDiff) <= diffThr) &
				(fabs(s.verDiff.yx - c.verDiff) <= diffThr) &
				(fabs(s.diagAve.yx - c.diagAve) <= diagThr);

			if (direct.x)
			{
				sum1 += s.val.x;
				++count1;
			}
			if (direct.y)
			{
				sum2 += s.val.y;
				++count2;
			}
			if (crossed.x)
			{
				sum1 += s.val.y;
				++count1;
			}
			if (crossed.y)
			{
				sum2 += s.val.x;
				++count2;
			}
		}
	}

	// Start from the unfiltered texel and overwrite only the green components.
	// The centre site is part of the window, so a count of 1 normally means
	// that nothing but the centre itself matched; the sample is then kept.
	int4 opix = convert_int4(rgrgCenter);
	if (isRedRow)
	{
		if (count1 > 1)
			opix.y = (int)(sum1 / count1);
		if (count2 > 1)
			opix.w = (int)(sum2 / count2);
	}
	else
	{
		if (count1 > 1)
			opix.x = (int)(sum1 / count1);
		if (count2 > 1)
			opix.z = (int)(sum2 / count2);
	}
	return opix - ofs;
}


// Neighbourhood statistics of the two non-green samples held by one texel.
// "cross" = the four horizontal + vertical neighbours, "diag" = the four
// diagonal neighbours of a sample.
typedef struct
{
	float2 val;            // the two samples themselves
	float2 crossAve;       // mean of the four horizontal/vertical neighbours
	float2 diagAve;        // mean of the four diagonal neighbours
	float2 luma;           // val + crossAve + diagAve
	float2 crossDiagDiff;  // cross SUM - diag SUM (not the means)
	float2 crossValDiff;   // crossAve - val
} BNR32RedBlueSite;

// Gathers the statistics of the two non-green samples of the texel at `p`.
//
// On a red row the samples are components .x and .z of the texel (the red
// ones); on the other rows they are .y and .w (presumably blue - confirm
// against the host code). Only six texels are needed: the texel itself, the
// ones directly above and below it, and the same three in ONE adjacent
// column - the left column on red rows (for the .x sample), the right column
// otherwise (for the .w sample).
BNR32RedBlueSite bnr32RedBlueSiteAt(__read_only image2d_t imgin, int2 p, bool isRedRow)
{
	const int side = isRedRow ? -1 : 1;
	const ushort4 mid = readImageus4(imgin, p);
	const ushort4 top = readImageus4(imgin, (int2)(p.x, p.y - 1));
	const ushort4 bot = readImageus4(imgin, (int2)(p.x, p.y + 1));
	const ushort4 sideMid = readImageus4(imgin, (int2)(p.x + side, p.y));
	const ushort4 sideTop = readImageus4(imgin, (int2)(p.x + side, p.y - 1));
	const ushort4 sideBot = readImageus4(imgin, (int2)(p.x + side, p.y + 1));

	BNR32RedBlueSite s;
	float2 horSum, verSum, diagSum;
	if (isRedRow)
	{
		// Diagonal pair in column .y is shared by both samples (.x and .z).
		const float sharedDiag = (float)(top.y + bot.y);
		s.val = (float2)(mid.x, mid.z);
		horSum = (float2)(sideMid.w + mid.y, mid.y + mid.w);
		verSum = (float2)(top.x + bot.x, top.z + bot.z);
		diagSum = (float2)(sideTop.w + sideBot.w + sharedDiag, top.w + bot.w + sharedDiag);
	}
	else
	{
		// Diagonal pair in column .z is shared by both samples (.y and .w).
		const float sharedDiag = (float)(top.z + bot.z);
		s.val = (float2)(mid.y, mid.w);
		horSum = (float2)(mid.x + mid.z, mid.z + sideMid.x);
		verSum = (float2)(bot.y + top.y, bot.w + top.w);
		diagSum = (float2)(top.x + bot.x + sharedDiag, sideTop.x + sideBot.x + sharedDiag);
	}
	const float2 crossSum = horSum + verSum;
	s.crossAve = crossSum / 4;
	s.diagAve = diagSum / 4;
	s.luma = s.val + s.crossAve + s.diagAve;
	s.crossDiagDiff = crossSum - diagSum;
	s.crossValDiff = s.crossAve - s.val;
	return s;
}

// Noise-reduces the two non-green samples of the texel at `pos`.
//
// Every site in a search window around `pos` (rows -32..+32 in steps of 2,
// texel columns -8..+8) is compared against the centre site. A search sample
// whose luma, neighbour means and neighbour differences all lie within
// luma-dependent thresholds of a centre sample contributes to that centre
// sample's average. Each search texel holds two candidate samples and both
// are tested against both centre samples.
//
// imgin       source image, read through readImageus4
// pos         texel coordinates of the centre
// ofs         offset subtracted from all four components of the result
// lumaSqtThr  scale applied to sqrt(luma) to obtain the base threshold
// isRedRow    true when `pos` lies on a red row (samples in .x/.z), false when
//             the samples are in .y/.w
//
// Returns the centre texel with the two filtered components replaced by the
// average of the matching samples (when more than one sample matched), minus
// `ofs`. The other two components are passed through (minus `ofs`).
int4 doBNR32RedBlue(__read_only image2d_t imgin, int2 pos, const int ofs,
	const float lumaSqtThr, bool isRedRow)
{
	// Thresholds relative to the base threshold sqrt(luma) * lumaSqtThr.
	const float crossRelThr = .5f;
	const float diagRelThr = .5f;
	const float crossDiagDiffRelThr = 3.f;
	const float crossValDiffRelThr = 3.f;
	// Search window: vertical reach in rows, horizontal reach in texels.
	const int reachY = 32;
	const int reachX = 32 / 4;
	const int stepY = 2;	// stepping by 2 keeps the row parity of `pos`
	const int stepX = 1;

	const ushort4 rgrgCenter = readImageus4(imgin, pos);
	const BNR32RedBlueSite c = bnr32RedBlueSiteAt(imgin, pos, isRedRow);

	const float2 lumaThr = sqrt(c.luma) * lumaSqtThr;
	const float2 crossThr = lumaThr * crossRelThr;
	const float2 diagThr = lumaThr * diagRelThr;
	const float2 crossDiagDiffThr = lumaThr * crossDiagDiffRelThr;
	const float2 crossValDiffThr = lumaThr * crossValDiffRelThr;

	float sum1 = 0;
	float sum2 = 0;
	int count1 = 0;
	int count2 = 0;
	for (int j = -reachY; j <= reachY; j += stepY)
	{
		for (int i = -reachX; i <= reachX; i += stepX)
		{
			const BNR32RedBlueSite s = bnr32RedBlueSiteAt(imgin, (int2)(pos.x + i, pos.y + j), isRedRow);

			// Vector comparisons yield -1 (true) / 0 (false) per component.
			// direct: search sample k against centre sample k.
			const int2 direct =
				(fabs(s.luma - c.luma) <= lumaThr) &
				(fabs(s.crossAve - c.crossAve) <= crossThr) &
				(fabs(s.diagAve - c.diagAve) <= diagThr) &
				(fabs(s.crossDiagDiff - c.crossDiagDiff) <= crossDiagDiffThr) &
				(fabs(s.crossValDiff - c.crossValDiff) <= crossValDiffThr);
			// crossed: search sample 1 against centre sample 0 (.x) and
			// search sample 0 against centre sample 1 (.y).
			const int2 crossed =
				(fabs(s.luma.yx - c.luma) <= lumaThr) &
				(fabs(s.crossAve.yx - c.crossAve) <= crossThr) &
				(fabs(s.diagAve.yx - c.diagAve) <= diagThr) &
				(fabs(s.crossDiagDiff.yx - c.crossDiagDiff) <= crossDiagDiffThr) &
				(fabs(s.crossValDiff.yx - c.crossValDiff) <= crossValDiffThr);

			if (direct.x)
			{
				sum1 += s.val.x;
				++count1;
			}
			if (direct.y)
			{
				sum2 += s.val.y;
				++count2;
			}
			if (crossed.x)
			{
				sum1 += s.val.y;
				++count1;
			}
			if (crossed.y)
			{
				sum2 += s.val.x;
				++count2;
			}
		}
	}

	// Start from the unfiltered texel and overwrite only the filtered
	// components. The centre site is part of the window, so a count of 1
	// normally means that nothing but the centre itself matched; the sample
	// is then kept.
	int4 opix = convert_int4(rgrgCenter);
	if (isRedRow)
	{
		if (count1 > 1)
			opix.x = (int)(sum1 / count1);
		if (count2 > 1)
			opix.z = (int)(sum2 / count2);
	}
	else
	{
		if (count1 > 1)
			opix.y = (int)(sum1 / count1);
		if (count2 > 1)
			opix.w = (int)(sum2 / count2);
	}
	return opix - ofs;
}


// Neighbourhood statistics of the two green samples held by one texel.
// "hor" / "ver" / "diag" name where the neighbours sit relative to the sample.
typedef struct
{
	float2 val;      // the two green samples themselves
	float2 horAve;   // mean of the left and right neighbour of each sample
	float2 verAve;   // mean of the top and bottom neighbour of each sample
	float2 diagAve;  // mean of the four diagonal neighbours of each sample
	float2 luma;     // horAve + val + verAve
	float2 horDiff;  // horAve - diagAve
	float2 verDiff;  // verAve - diagAve
} BNR16GreenSite;

// Gathers the green-sample statistics for the texel at `p`.
//
// On a red row the green samples are components .y and .w of the texel; on
// the other rows they are .x and .z. (Presumably the image is a Bayer mosaic
// packed four samples per texel, RGRG on red rows - confirm against the host
// code.) Only six texels are needed: the texel itself, the ones directly above
// and below it, and the same three in ONE adjacent column - the right column
// on red rows (for the .w sample), the left column otherwise (for the .x
// sample).
BNR16GreenSite bnr16GreenSiteAt(__read_only image2d_t imgin, int2 p, bool isRedRow)
{
	const int side = isRedRow ? 1 : -1;
	const ushort4 mid = readImageus4(imgin, p);
	const ushort4 top = readImageus4(imgin, (int2)(p.x, p.y - 1));
	const ushort4 bot = readImageus4(imgin, (int2)(p.x, p.y + 1));
	const ushort4 sideMid = readImageus4(imgin, (int2)(p.x + side, p.y));
	const ushort4 sideTop = readImageus4(imgin, (int2)(p.x + side, p.y - 1));
	const ushort4 sideBot = readImageus4(imgin, (int2)(p.x + side, p.y + 1));

	BNR16GreenSite s;
	float2 horSum, verSum, diagSum;
	if (isRedRow)
	{
		// Diagonal pair in column .z is shared by both green samples (.y and .w).
		const float sharedDiag = (float)top.z + bot.z;
		s.val = (float2)(mid.y, mid.w);
		horSum = (float2)(mid.x + mid.z, mid.z + sideMid.x);
		verSum = (float2)(bot.y + top.y, bot.w + top.w);
		diagSum = (float2)(top.x + bot.x + sharedDiag, sideTop.x + sideBot.x + sharedDiag);
	}
	else
	{
		// Diagonal pair in column .y is shared by both green samples (.x and .z).
		const float sharedDiag = (float)top.y + bot.y;
		s.val = (float2)(mid.x, mid.z);
		horSum = (float2)(sideMid.w + mid.y, mid.y + mid.w);
		verSum = (float2)(bot.x + top.x, bot.z + top.z);
		diagSum = (float2)(sideTop.w + sideBot.w + sharedDiag, top.w + bot.w + sharedDiag);
	}
	s.horAve = horSum / 2;
	s.verAve = verSum / 2;
	s.diagAve = diagSum / 4;
	s.luma = s.horAve + s.val + s.verAve;
	s.horDiff = s.horAve - s.diagAve;
	s.verDiff = s.verAve - s.diagAve;
	return s;
}

// Noise-reduces the two green samples of the texel at `pos`.
//
// Every site in a search window around `pos` (rows -16..+16 in steps of 2,
// texel columns -4..+4) is compared against the centre site. A search sample
// whose luma, neighbour means and neighbour differences all lie within
// luma-dependent thresholds of a centre sample contributes to that centre
// sample's average. Each search texel holds two green samples and both are
// tested against both centre samples.
//
// imgin       source image, read through readImageus4
// pos         texel coordinates of the centre
// ofs         offset subtracted from all four components of the result
// lumaSqtThr  scale applied to sqrt(luma) to obtain the base threshold
// isRedRow    true when `pos` lies on a red row (green in .y/.w), false when
//             green is in .x/.z
//
// Returns the centre texel with its two green components replaced by the
// average of the matching samples (when more than one sample matched), minus
// `ofs`. The other two components are passed through (minus `ofs`).
int4 doBNR16Green(__read_only image2d_t imgin, int2 pos, const int ofs,
	const float lumaSqtThr, bool isRedRow)
{
	// Thresholds relative to the base threshold sqrt(luma) * lumaSqtThr.
	const float horRelThr = 1.f;
	const float verRelThr = 1.f;
	const float diagRelThr = .5f;
	const float diffRelThr = 3.f;
	// Search window: vertical reach in rows, horizontal reach in texels.
	const int reachY = 16;
	const int reachX = 16 / 4;
	const int stepY = 2;	// stepping by 2 keeps the row parity of `pos`
	const int stepX = 1;

	const ushort4 rgrgCenter = readImageus4(imgin, pos);
	const BNR16GreenSite c = bnr16GreenSiteAt(imgin, pos, isRedRow);

	const float2 lumaThr = sqrt(c.luma) * lumaSqtThr;
	const float2 horThr = lumaThr * horRelThr;
	const float2 verThr = lumaThr * verRelThr;
	const float2 diagThr = lumaThr * diagRelThr;
	const float2 diffThr = lumaThr * diffRelThr;

	float sum1 = 0;
	float sum2 = 0;
	int count1 = 0;
	int count2 = 0;
	for (int j = -reachY; j <= reachY; j += stepY)
	{
		for (int i = -reachX; i <= reachX; i += stepX)
		{
			const BNR16GreenSite s = bnr16GreenSiteAt(imgin, (int2)(pos.x + i, pos.y + j), isRedRow);

			// Vector comparisons yield -1 (true) / 0 (false) per component.
			// direct: search sample k against centre sample k.
			const int2 direct =
				(fabs(s.luma - c.luma) <= lumaThr) &
				(fabs(s.horAve - c.horAve) <= horThr) &
				(fabs(s.verAve - c.verAve) <= verThr) &
				(fabs(s.horDiff - c.horDiff) <= diffThr) &
				(fabs(s.verDiff - c.verDiff) <= diffThr) &
				(fabs(s.diagAve - c.diagAve) <= diagThr);
			// crossed: search sample 1 against centre sample 0 (.x) and
			// search sample 0 against centre sample 1 (.y).
			const int2 crossed =
				(fabs(s.luma.yx - c.luma) <= lumaThr) &
				(fabs(s.horAve.yx - c.horAve) <= horThr) &
				(fabs(s.verAve.yx - c.verAve) <= verThr) &
				(fabs(s.horDiff.yx - c.horDiff) <= diffThr) &
				(fabs(s.verDiff.yx - c.verDiff) <= diffThr) &
				(fabs(s.diagAve.yx - c.diagAve) <= diagThr);

			if (direct.x)
			{
				sum1 += s.val.x;
				++count1;
			}
			if (direct.y)
			{
				sum2 += s.val.y;
				++count2;
			}
			if (crossed.x)
			{
				sum1 += s.val.y;
				++count1;
			}
			if (crossed.y)
			{
				sum2 += s.val.x;
				++count2;
			}
		}
	}

	// Start from the unfiltered texel and overwrite only the green components.
	// The centre site is part of the window, so a count of 1 normally means
	// that nothing but the centre itself matched; the sample is then kept.
	int4 opix = convert_int4(rgrgCenter);
	if (isRedRow)
	{
		if (count1 > 1)
			opix.y = (int)(sum1 / count1);
		if (count2 > 1)
			opix.w = (int)(sum2 / count2);
	}
	else
	{
		if (count1 > 1)
			opix.x = (int)(sum1 / count1);
		if (count2 > 1)
			opix.z = (int)(sum2 / count2);
	}
	return opix - ofs;
}


// Neighbourhood statistics of the two non-green samples held by one texel.
// "cross" = the four horizontal + vertical neighbours, "diag" = the four
// diagonal neighbours of a sample.
typedef struct
{
	float2 val;            // the two samples themselves
	float2 crossAve;       // mean of the four horizontal/vertical neighbours
	float2 diagAve;        // mean of the four diagonal neighbours
	float2 luma;           // val + crossAve + diagAve
	float2 crossDiagDiff;  // cross SUM - diag SUM (not the means)
	float2 crossValDiff;   // crossAve - val
} BNR16RedBlueSite;

// Gathers the statistics of the two non-green samples of the texel at `p`.
//
// On a red row the samples are components .x and .z of the texel (the red
// ones); on the other rows they are .y and .w (presumably blue - confirm
// against the host code). Only six texels are needed: the texel itself, the
// ones directly above and below it, and the same three in ONE adjacent
// column - the left column on red rows (for the .x sample), the right column
// otherwise (for the .w sample).
BNR16RedBlueSite bnr16RedBlueSiteAt(__read_only image2d_t imgin, int2 p, bool isRedRow)
{
	const int side = isRedRow ? -1 : 1;
	const ushort4 mid = readImageus4(imgin, p);
	const ushort4 top = readImageus4(imgin, (int2)(p.x, p.y - 1));
	const ushort4 bot = readImageus4(imgin, (int2)(p.x, p.y + 1));
	const ushort4 sideMid = readImageus4(imgin, (int2)(p.x + side, p.y));
	const ushort4 sideTop = readImageus4(imgin, (int2)(p.x + side, p.y - 1));
	const ushort4 sideBot = readImageus4(imgin, (int2)(p.x + side, p.y + 1));

	BNR16RedBlueSite s;
	float2 horSum, verSum, diagSum;
	if (isRedRow)
	{
		// Diagonal pair in column .y is shared by both samples (.x and .z).
		const float sharedDiag = (float)(top.y + bot.y);
		s.val = (float2)(mid.x, mid.z);
		horSum = (float2)(sideMid.w + mid.y, mid.y + mid.w);
		verSum = (float2)(top.x + bot.x, top.z + bot.z);
		diagSum = (float2)(sideTop.w + sideBot.w + sharedDiag, top.w + bot.w + sharedDiag);
	}
	else
	{
		// Diagonal pair in column .z is shared by both samples (.y and .w).
		const float sharedDiag = (float)(top.z + bot.z);
		s.val = (float2)(mid.y, mid.w);
		horSum = (float2)(mid.x + mid.z, mid.z + sideMid.x);
		verSum = (float2)(bot.y + top.y, bot.w + top.w);
		diagSum = (float2)(top.x + bot.x + sharedDiag, sideTop.x + sideBot.x + sharedDiag);
	}
	const float2 crossSum = horSum + verSum;
	s.crossAve = crossSum / 4;
	s.diagAve = diagSum / 4;
	s.luma = s.val + s.crossAve + s.diagAve;
	s.crossDiagDiff = crossSum - diagSum;
	s.crossValDiff = s.crossAve - s.val;
	return s;
}

// Noise-reduces the two non-green samples of the texel at `pos`.
//
// Every site in a search window around `pos` (rows -16..+16 in steps of 2,
// texel columns -4..+4) is compared against the centre site. A search sample
// whose luma, neighbour means and neighbour differences all lie within
// luma-dependent thresholds of a centre sample contributes to that centre
// sample's average. Each search texel holds two candidate samples and both
// are tested against both centre samples.
//
// imgin       source image, read through readImageus4
// pos         texel coordinates of the centre
// ofs         offset subtracted from all four components of the result
// lumaSqtThr  scale applied to sqrt(luma) to obtain the base threshold
// isRedRow    true when `pos` lies on a red row (samples in .x/.z), false when
//             the samples are in .y/.w
//
// Returns the centre texel with the two filtered components replaced by the
// average of the matching samples (when more than one sample matched), minus
// `ofs`. The other two components are passed through (minus `ofs`).
int4 doBNR16RedBlue(__read_only image2d_t imgin, int2 pos, const int ofs,
	const float lumaSqtThr, bool isRedRow)
{
	// Thresholds relative to the base threshold sqrt(luma) * lumaSqtThr.
	const float crossRelThr = .5f;
	const float diagRelThr = .5f;
	const float crossDiagDiffRelThr = 3.f;
	const float crossValDiffRelThr = 3.f;
	// Search window: vertical reach in rows, horizontal reach in texels.
	const int reachY = 16;
	const int reachX = 16 / 4;
	const int stepY = 2;	// stepping by 2 keeps the row parity of `pos`
	const int stepX = 1;

	const ushort4 rgrgCenter = readImageus4(imgin, pos);
	const BNR16RedBlueSite c = bnr16RedBlueSiteAt(imgin, pos, isRedRow);

	const float2 lumaThr = sqrt(c.luma) * lumaSqtThr;
	const float2 crossThr = lumaThr * crossRelThr;
	const float2 diagThr = lumaThr * diagRelThr;
	const float2 crossDiagDiffThr = lumaThr * crossDiagDiffRelThr;
	const float2 crossValDiffThr = lumaThr * crossValDiffRelThr;

	float sum1 = 0;
	float sum2 = 0;
	int count1 = 0;
	int count2 = 0;
	for (int j = -reachY; j <= reachY; j += stepY)
	{
		for (int i = -reachX; i <= reachX; i += stepX)
		{
			const BNR16RedBlueSite s = bnr16RedBlueSiteAt(imgin, (int2)(pos.x + i, pos.y + j), isRedRow);

			// Vector comparisons yield -1 (true) / 0 (false) per component.
			// direct: search sample k against centre sample k.
			const int2 direct =
				(fabs(s.luma - c.luma) <= lumaThr) &
				(fabs(s.crossAve - c.crossAve) <= crossThr) &
				(fabs(s.diagAve - c.diagAve) <= diagThr) &
				(fabs(s.crossDiagDiff - c.crossDiagDiff) <= crossDiagDiffThr) &
				(fabs(s.crossValDiff - c.crossValDiff) <= crossValDiffThr);
			// crossed: search sample 1 against centre sample 0 (.x) and
			// search sample 0 against centre sample 1 (.y).
			const int2 crossed =
				(fabs(s.luma.yx - c.luma) <= lumaThr) &
				(fabs(s.crossAve.yx - c.crossAve) <= crossThr) &
				(fabs(s.diagAve.yx - c.diagAve) <= diagThr) &
				(fabs(s.crossDiagDiff.yx - c.crossDiagDiff) <= crossDiagDiffThr) &
				(fabs(s.crossValDiff.yx - c.crossValDiff) <= crossValDiffThr);

			if (direct.x)
			{
				sum1 += s.val.x;
				++count1;
			}
			if (direct.y)
			{
				sum2 += s.val.y;
				++count2;
			}
			if (crossed.x)
			{
				sum1 += s.val.y;
				++count1;
			}
			if (crossed.y)
			{
				sum2 += s.val.x;
				++count2;
			}
		}
	}

	// Start from the unfiltered texel and overwrite only the filtered
	// components. The centre site is part of the window, so a count of 1
	// normally means that nothing but the centre itself matched; the sample
	// is then kept.
	int4 opix = convert_int4(rgrgCenter);
	if (isRedRow)
	{
		if (count1 > 1)
			opix.x = (int)(sum1 / count1);
		if (count2 > 1)
			opix.z = (int)(sum2 / count2);
	}
	else
	{
		if (count1 > 1)
			opix.y = (int)(sum1 / count1);
		if (count2 > 1)
			opix.w = (int)(sum2 / count2);
	}
	return opix - ofs;
}



// Similarity-weighted averaging of one 4-channel texel over a window of
// +/-16 rows (every second row) by +/-4 texels.
//
// The function reads the 3x3 texel neighbourhood of pos and derives, for each of
// the four samples of the centre texel, a small feature set (averages of the
// neighbouring channels, a "luma" sum and channel differences). It then computes
// the same features for every candidate texel in the window and accumulates a
// candidate sample whenever all of its features lie within thresholds that are
// proportional to sqrt(luma) * lumaSqtThr.
//
// imgin       source image, read via readImageus4.
// pos         texel coordinate of the centre.
// ofs         value subtracted from every channel of the result.
// lumaSqtThr  scale applied to sqrt(luma) to obtain the similarity thresholds.
// isRedRow    true:  channels x,z are the "red"-type samples, y,w the "green"-type ones;
//             false: the roles are swapped (y,w "red"-type, x,z "green"-type).
//             NOTE(review): presumably this tracks the Bayer row parity - confirm with the caller.
//
// Returns the filtered texel minus ofs. A channel with fewer than two accepted
// candidates keeps the value of the centre texel.
int4 doBNR16FLOAT(__read_only image2d_t imgin, int2 pos, const int ofs,
	const float lumaSqtThr, bool isRedRow)
{
	// Coordinates and texels of the 3x3 neighbourhood around pos.
	int2 pLeft = { pos.x - 1, pos.y };
	int2 pRight = { pos.x + 1, pos.y };
	int2 pTop = { pos.x, pos.y - 1 };
	int2 pBot = { pos.x, pos.y + 1 };
	int2 pLT = { pos.x - 1, pos.y - 1 };
	int2 pRT = { pos.x + 1, pos.y - 1 };
	int2 pLBot = { pos.x - 1, pos.y + 1 };
	int2 pRBot = { pos.x + 1, pos.y + 1 };
	ushort4 rgrgCenter = readImageus4(imgin, pos);
	ushort4 rgrgLeft = readImageus4(imgin, pLeft);
	ushort4 rgrgRight = readImageus4(imgin, pRight);
	ushort4 rgrgTop = readImageus4(imgin, pTop);
	ushort4 rgrgBot = readImageus4(imgin, pBot);
	ushort4 rgrgLT = readImageus4(imgin, pLT);
	ushort4 rgrgRT = readImageus4(imgin, pRT);
	ushort4 rgrgLBot = readImageus4(imgin, pLBot);
	ushort4 rgrgRBot = readImageus4(imgin, pRBot);
	// Every float2 below holds one value per sample of the same type in the centre
	// texel: .x for the first sample, .y for the second one.
	float2 redvalRedC, grSumHRedC, grSumVRedC, bluegreenSumRedMC, blueSumRedC, green1ValG1C, redSumG1C, blueSumG1C, greenSumG1C;
	if (isRedRow)
	{
// "Red"-type samples of the centre texel
		redvalRedC = (float2)(rgrgCenter.x, rgrgCenter.z); // x, z are the centre "red"-type samples
		grSumHRedC = (float2)(rgrgLeft.w + rgrgCenter.y, rgrgCenter.y + rgrgCenter.w); // sum of the two horizontal neighbours
		grSumVRedC = (float2)(rgrgTop.x + rgrgBot.x, rgrgTop.z + rgrgBot.z);	// sum of the two vertical neighbours
		// Shared partial sums from the rows above and below: .x = Top.y + Bot.y, .y = Top.z + Bot.z.
		bluegreenSumRedMC = (float2)(rgrgTop.y + rgrgBot.y, rgrgTop.z + rgrgBot.z);
		blueSumRedC = (float2)(rgrgLT.w + rgrgLBot.w + bluegreenSumRedMC.x, rgrgTop.w + rgrgBot.w + bluegreenSumRedMC.x); // sum of the four diagonal neighbours
// "Green"-type samples of the centre texel
		green1ValG1C = (float2)(rgrgCenter.y, rgrgCenter.w); // y, w are the centre "green"-type samples
		redSumG1C = (float2)(rgrgCenter.x + rgrgCenter.z, rgrgCenter.z + rgrgRight.x); // horizontal neighbour sum
		blueSumG1C = (float2)(rgrgBot.y + rgrgTop.y, rgrgBot.w + rgrgTop.w); // vertical neighbour sum
		greenSumG1C = (float2)(rgrgTop.x + rgrgBot.x + bluegreenSumRedMC.y, rgrgRT.x + rgrgRBot.x + bluegreenSumRedMC.y); // diagonal neighbour sum
	}
	else
	{
// "Red"-type samples of the centre texel (roles swapped on this row type)
		redvalRedC = (float2)(rgrgCenter.y, rgrgCenter.w); // here y, w are the centre "red"-type samples
		grSumHRedC = (float2)(rgrgCenter.x + rgrgCenter.z, rgrgCenter.z + rgrgRight.x); // sum of the two horizontal neighbours
		grSumVRedC = (float2)(rgrgBot.y + rgrgTop.y, rgrgBot.w + rgrgTop.w);	// sum of the two vertical neighbours
		// Shared partial sums from the rows above and below: .x = Top.z + Bot.z, .y = Top.y + Bot.y.
		bluegreenSumRedMC = (float2)(rgrgTop.z + rgrgBot.z, rgrgTop.y + rgrgBot.y);
		blueSumRedC = (float2)(rgrgTop.x + rgrgBot.x + bluegreenSumRedMC.x, rgrgRT.x + rgrgRBot.x + bluegreenSumRedMC.x); // sum of the four diagonal neighbours
// "Green"-type samples of the centre texel
		green1ValG1C = (float2)(rgrgCenter.x, rgrgCenter.z); // here x, z are the centre "green"-type samples
		redSumG1C = (float2)(rgrgLeft.w + rgrgCenter.y, rgrgCenter.y + rgrgCenter.w); // horizontal neighbour sum
		blueSumG1C = (float2)(rgrgBot.x + rgrgTop.x, rgrgBot.z + rgrgTop.z); // vertical neighbour sum
		greenSumG1C = (float2)(rgrgLT.w + rgrgLBot.w + bluegreenSumRedMC.y, rgrgTop.w + rgrgBot.w + bluegreenSumRedMC.y); // diagonal neighbour sum
	}
	// Centre features for the "red"-type samples.
	float2 grSumRedC = grSumHRedC + grSumVRedC;
	float2 grAveRedC = grSumRedC / 4;
	float2 blueAveRedC = blueSumRedC / 4;
	float2 lumaRedC = redvalRedC + grAveRedC + blueAveRedC;
	float2 grBlueDiffRedC = grSumRedC - blueSumRedC;
	float2 redGrDiffRedC = grAveRedC - redvalRedC;
	// Centre features for the "green"-type samples.
	float2 redAveG1C = redSumG1C / 2;
	float2 greenAveG1C = greenSumG1C / 4;
	float2 blueAveG1C = blueSumG1C / 2;
	float2 lumaG1C = redAveG1C + green1ValG1C + blueAveG1C;
	float2 grRedDiffG1C = redAveG1C - greenAveG1C;
	float2 grBlueDiffG1C = blueAveG1C - greenAveG1C;
// Thresholds: every limit is sqrt(luma) * lumaSqtThr scaled by one of the fixed factors below.
	float fLumaSqtThr = lumaSqtThr;
	float fLumaGrSqtThr = lumaSqtThr;
	float fGrGrAtRedRelThr = .5;
	float fBlBlAtRedRelThr = .5;
	float fGrGrAtGrRelThr = .5;
	float fRdRdAtGreenRelThr = 1.;
	float fBlBlAtGreenRelThr = 1.;
	float fDiff4Thr = 3.;
	float fDiff24Thr = 3.;
	float fDiff14Thr = 3.;
	float2 lumaRThr = { sqrt(lumaRedC.x)* fLumaSqtThr, sqrt(lumaRedC.y)* fLumaSqtThr };
	float2 patternGRThr = { lumaRThr.x * fGrGrAtRedRelThr, lumaRThr.y * fGrGrAtRedRelThr };
	float2 patternBRThr = { lumaRThr.x * fBlBlAtRedRelThr, lumaRThr.y * fBlBlAtRedRelThr };
	float2 gBDiffRThr = { lumaRThr.x * fDiff4Thr,		lumaRThr.y * fDiff4Thr };
	float2 redGrDiffRThr = { lumaRThr.x * fDiff14Thr,	lumaRThr.y * fDiff14Thr };
	float2 lumaG1Thr = { sqrt(lumaG1C.x) * fLumaGrSqtThr,sqrt(lumaG1C.y) * fLumaGrSqtThr };
	float2 patternRG1Thr2 = { lumaG1Thr.x * fRdRdAtGreenRelThr,lumaG1Thr.y * fRdRdAtGreenRelThr };
	float2 patternBG1Thr2 = { lumaG1Thr.x * fBlBlAtGreenRelThr,lumaG1Thr.y * fBlBlAtGreenRelThr };
	float2 patternGG1Thr = { lumaG1Thr.x * fGrGrAtGrRelThr,	lumaG1Thr.y * fGrGrAtGrRelThr };
	float2 gRDiffG1Thr = { lumaG1Thr.x * fDiff24Thr,		lumaG1Thr.y * fDiff24Thr };
	float2 gBDiffG1Thr = { lumaG1Thr.x * fDiff24Thr,		lumaG1Thr.y * fDiff24Thr };
	// Accumulators: .x/.y belong to the two "red"-type samples, .z/.w to the two "green"-type ones.
	float4 sum = { 0,0,0,0 };
	int4  count = { 0,0,0,0 };
	// Search window: rows -16..16 in steps of 2, texel columns -4..4 in steps of 1.
	// The centre texel itself (i = 0, j = 0) is part of the scan.
	int skipY = 2;
	int skipX = 1;
	int cKernSizeY = 16;
	int cKernSizeX = 16 / 4;
	for (int j = -cKernSizeY; j <= cKernSizeY; j += skipY)
	{
		for (int i = -cKernSizeX; i <= cKernSizeX; i += skipX)
		{
			// 3x3 texel neighbourhood of the candidate position.
			int2 spos = { pos.x + i, pos.y + j };
			int2 pSLeft = { spos.x - 1, spos.y };
			int2 pSRight = { spos.x + 1, spos.y };
			int2 pSTop = { spos.x, spos.y - 1 };
			int2 pSBot = { spos.x, spos.y + 1 };
			int2 pSLT = { spos.x - 1, spos.y - 1 };
			int2 pSRT = { spos.x + 1, spos.y - 1 };
			int2 pSLBot = { spos.x - 1, spos.y + 1 };
			int2 pSRBot = { spos.x + 1, spos.y + 1 };
			ushort4 rgrgSearch = readImageus4(imgin, spos);
			ushort4 rgrgSLeft = readImageus4(imgin, pSLeft);
			ushort4 rgrgSRight = readImageus4(imgin, pSRight);
			ushort4 rgrgSTop = readImageus4(imgin, pSTop);
			ushort4 rgrgSBot = readImageus4(imgin, pSBot);
			ushort4 rgrgSLT = readImageus4(imgin, pSLT);
			ushort4 rgrgSRT = readImageus4(imgin, pSRT);
			ushort4 rgrgSLBot = readImageus4(imgin, pSLBot);
			ushort4 rgrgSRBot = readImageus4(imgin, pSRBot);
			// Candidate features, built with the same channel selection as for the centre.
			float2 redvalRedS, grSumHRedS, grSumVRedS, bluegreenSumRedMS, blueSumRedS, green1ValG1S, redSumG1S, blueSumG1S, greenSumG1S;
			if (isRedRow)
			{
				redvalRedS = (float2)(rgrgSearch.x, rgrgSearch.z); // x, z are the candidate "red"-type samples
				grSumHRedS = (float2)(rgrgSLeft.w + rgrgSearch.y, rgrgSearch.y + rgrgSearch.w);
				grSumVRedS = (float2)(rgrgSTop.x + rgrgSBot.x, rgrgSTop.z + rgrgSBot.z);
				bluegreenSumRedMS = (float2)(rgrgSTop.y + rgrgSBot.y, rgrgSTop.z + rgrgSBot.z);
				blueSumRedS = (float2)(rgrgSLT.w + rgrgSLBot.w + bluegreenSumRedMS.x,  // diagonal neighbours
					rgrgSTop.w + rgrgSBot.w + bluegreenSumRedMS.x);
				green1ValG1S = (float2)(rgrgSearch.y, rgrgSearch.w); // y, w are the candidate "green"-type samples
				redSumG1S = (float2)(rgrgSearch.x + rgrgSearch.z, rgrgSearch.z + rgrgSRight.x); // horizontal neighbour sum
				blueSumG1S = (float2)(rgrgSBot.y + rgrgSTop.y, rgrgSBot.w + rgrgSTop.w); // vertical neighbour sum
				greenSumG1S = (float2)(rgrgSTop.x + rgrgSBot.x + bluegreenSumRedMS.y,
					rgrgSRT.x + rgrgSRBot.x + bluegreenSumRedMS.y);
			}
			else
			{
				redvalRedS = (float2)(rgrgSearch.y, rgrgSearch.w); // here y, w are the candidate "red"-type samples
				grSumHRedS = (float2)(rgrgSearch.x + rgrgSearch.z, rgrgSearch.z + rgrgSRight.x);
				grSumVRedS = (float2)(rgrgSBot.y + rgrgSTop.y, rgrgSBot.w + rgrgSTop.w);
				bluegreenSumRedMS = (float2)(rgrgSTop.z + rgrgSBot.z, rgrgSTop.y + rgrgSBot.y);
				blueSumRedS = (float2)(rgrgSTop.x + rgrgSBot.x + bluegreenSumRedMS.x,
					rgrgSRT.x + rgrgSRBot.x + bluegreenSumRedMS.x);
				green1ValG1S = (float2)(rgrgSearch.x, rgrgSearch.z); // here x, z are the candidate "green"-type samples
				redSumG1S = (float2)(rgrgSLeft.w + rgrgSearch.y, rgrgSearch.y + rgrgSearch.w); // horizontal neighbour sum
				blueSumG1S = (float2)(rgrgSBot.x + rgrgSTop.x, rgrgSBot.z + rgrgSTop.z); // vertical neighbour sum
				greenSumG1S = (float2)(rgrgSLT.w + rgrgSLBot.w + bluegreenSumRedMS.y,
					rgrgSTop.w + rgrgSBot.w + bluegreenSumRedMS.y);
			}
			float2 grSumRedS = grSumHRedS + grSumVRedS;
			float2 grAveRedS = grSumRedS / 4;
			float2 blueAveRedS = blueSumRedS / 4;
			float2 lumaRedS = redvalRedS + grAveRedS + blueAveRedS;
			float2 grBlueDiffRedS = grSumRedS - blueSumRedS;
			float2 redGrDiffRedS = grAveRedS - redvalRedS;
			// "Red"-type search: candidate sample k against centre sample k (same slot).
			if ((fabs(lumaRedS.x - lumaRedC.x) <= lumaRThr.x) &&
				(fabs(grAveRedS.x - grAveRedC.x) <= patternGRThr.x) &&
				(fabs(blueAveRedS.x - blueAveRedC.x) <= patternBRThr.x) &&
				(fabs(grBlueDiffRedS.x - grBlueDiffRedC.x) <= gBDiffRThr.x) &&
				(fabs(redGrDiffRedS.x - redGrDiffRedC.x) <= redGrDiffRThr.x))
			{
				sum.x += redvalRedS.x;
				++count.x;
			}
			if ((fabs(lumaRedS.y - lumaRedC.y) <= lumaRThr.y) &&
				(fabs(grAveRedS.y - grAveRedC.y) <= patternGRThr.y) &&
				(fabs(blueAveRedS.y - blueAveRedC.y) <= patternBRThr.y) &&
				(fabs(grBlueDiffRedS.y - grBlueDiffRedC.y) <= gBDiffRThr.y) &&
				(fabs(redGrDiffRedS.y - redGrDiffRedC.y) <= redGrDiffRThr.y))
			{
				sum.y += redvalRedS.y;
				++count.y;
			}
// permutation: the other candidate slot is also tested (S.y against C.x, S.x against C.y)
			if ((fabs(lumaRedS.y - lumaRedC.x) <= lumaRThr.x) &&
				(fabs(grAveRedS.y - grAveRedC.x) <= patternGRThr.x) &&
				(fabs(blueAveRedS.y - blueAveRedC.x) <= patternBRThr.x) &&
				(fabs(grBlueDiffRedS.y - grBlueDiffRedC.x) <= gBDiffRThr.x) &&
				(fabs(redGrDiffRedS.y - redGrDiffRedC.x) <= redGrDiffRThr.x))
			{
				sum.x += redvalRedS.y;
				++count.x;
			}
			if ((fabs(lumaRedS.x - lumaRedC.y) <= lumaRThr.y) &&
				(fabs(grAveRedS.x - grAveRedC.y) <= patternGRThr.y) &&
				(fabs(blueAveRedS.x - blueAveRedC.y) <= patternBRThr.y) &&
				(fabs(grBlueDiffRedS.x - grBlueDiffRedC.y) <= gBDiffRThr.y) &&
				(fabs(redGrDiffRedS.x - redGrDiffRedC.y) <= redGrDiffRThr.y))
			{
				sum.y += redvalRedS.x;
				++count.y;
			}
// "Green"-type search: same scheme, with one additional test on the diagonal average
			float2 redAveG1S = redSumG1S / 2;
			float2 blueAveG1S = blueSumG1S / 2;
			float2 greenAveG1S = greenSumG1S / 4;
			float2 lumaG1S = redAveG1S + green1ValG1S + blueAveG1S;
			float2 grRedDiffG1S = redAveG1S - greenAveG1S;
			float2 grBlueDiffG1S = blueAveG1S - greenAveG1S;
			if ((fabs(lumaG1S.x - lumaG1C.x) <= lumaG1Thr.x) &&
				(fabs(redAveG1S.x - redAveG1C.x) <= patternRG1Thr2.x) &&
				(fabs(blueAveG1S.x - blueAveG1C.x) <= patternBG1Thr2.x) &&
				(fabs(grRedDiffG1S.x - grRedDiffG1C.x) <= gRDiffG1Thr.x) &&
				(fabs(grBlueDiffG1S.x - grBlueDiffG1C.x) <= gBDiffG1Thr.x) &&
				(fabs(greenAveG1S.x - greenAveG1C.x) <= patternGG1Thr.x))
			{
				sum.z += green1ValG1S.x;
				++count.z;
			}
			if ((fabs(lumaG1S.y - lumaG1C.y) <= lumaG1Thr.y) &&
				(fabs(redAveG1S.y - redAveG1C.y) <= patternRG1Thr2.y) &&
				(fabs(blueAveG1S.y - blueAveG1C.y) <= patternBG1Thr2.y) &&
				(fabs(grRedDiffG1S.y - grRedDiffG1C.y) <= gRDiffG1Thr.y) &&
				(fabs(grBlueDiffG1S.y - grBlueDiffG1C.y) <= gBDiffG1Thr.y) &&
				(fabs(greenAveG1S.y - greenAveG1C.y) <= patternGG1Thr.y))
			{
				sum.w += green1ValG1S.y;
				++count.w;
			}
// permutation: the other candidate slot is also tested (S.y against C.x, S.x against C.y)
			if ((fabs(lumaG1S.y - lumaG1C.x) <= lumaG1Thr.x) &&
				(fabs(redAveG1S.y - redAveG1C.x) <= patternRG1Thr2.x) &&
				(fabs(blueAveG1S.y - blueAveG1C.x) <= patternBG1Thr2.x) &&
				(fabs(grRedDiffG1S.y - grRedDiffG1C.x) <= gRDiffG1Thr.x) &&
				(fabs(grBlueDiffG1S.y - grBlueDiffG1C.x) <= gBDiffG1Thr.x) &&
				(fabs(greenAveG1S.y - greenAveG1C.x) <= patternGG1Thr.x))
			{
				sum.z += green1ValG1S.y;
				++count.z;
			}
			if ((fabs(lumaG1S.x - lumaG1C.y) <= lumaG1Thr.y) &&
				(fabs(redAveG1S.x - redAveG1C.y) <= patternRG1Thr2.y) &&
				(fabs(blueAveG1S.x - blueAveG1C.y) <= patternBG1Thr2.y) &&
				(fabs(grRedDiffG1S.x - grRedDiffG1C.y) <= gRDiffG1Thr.y) &&
				(fabs(grBlueDiffG1S.x - grBlueDiffG1C.y) <= gBDiffG1Thr.y) &&
				(fabs(greenAveG1S.x - greenAveG1C.y) <= patternGG1Thr.y))
			{
				sum.w += green1ValG1S.x;
				++count.w;
			}
		}
	}
	// Write the averages back into the channel slots they were taken from.
	// A channel is replaced only when at least two candidates were accepted.
	int4 opix;
	if (isRedRow)
	{
		// "red"-type results go to x,z; "green"-type results go to y,w.
		if (count.x > 1)
			opix.x = (int)(sum.x / count.x);
		else
			opix.x = rgrgCenter.x;
		if (count.z > 1)
			opix.y = (int)(sum.z / count.z);
		else
			opix.y = rgrgCenter.y;
		if (count.y > 1)
			opix.z = (int)(sum.y / count.y);
		else
			opix.z = rgrgCenter.z;
		if (count.w > 1)
			opix.w = (int)(sum.w / count.w);
		else
			opix.w = rgrgCenter.w;
	}
	else
	{// r,g permutation (!): "green"-type results go to x,z; "red"-type results go to y,w.
		if (count.z > 1)
			opix.x = (int)(sum.z / count.z);
		else
			opix.x = rgrgCenter.x;
		if (count.x > 1)
			opix.y = (int)(sum.x / count.x);
		else
			opix.y = rgrgCenter.y;
		if (count.w > 1)
			opix.z = (int)(sum.w / count.w);
		else
			opix.z = rgrgCenter.z;
		if (count.y > 1)
			opix.w = (int)(sum.y / count.y);
		else
			opix.w = rgrgCenter.w;
	}
	// ofs is subtracted from every channel.
	return  opix - ofs;
}


// Returns the median (middle value) of the three entries of p.
// The array is only read, never modified.
uint domedian3(uint p[3])
{
	const uint first = p[0];
	const uint second = p[1];
	const uint third = p[2];

	// Order the first two values, then place the third relative to them.
	const uint low = (first > second) ? second : first;
	const uint high = (first > second) ? first : second;

	if (third <= low)
		return low;		// third is the smallest, so the middle value is low
	if (third >= high)
		return high;	// third is the largest, so the middle value is high
	return third;		// third lies strictly between low and high
}


// Returns the median of the five values in p using at most six comparisons.
//
// The array is used as scratch space: its contents are overwritten and are in no
// particular order afterwards.
//
// Method: sort the pairs (p[0],p[1]) and (p[3],p[4]); the smaller of the two pair
// minima is not above any of those four values, so it cannot be the median and is dropped.
// p[2] takes the free slot, the procedure is repeated once, and the smaller of
// the two remaining candidates is the median.
uint DoMedian5(uint p[5])
{
	uint temp;

	// Sort the pair (p[0], p[1]).
	if (p[0] > p[1])
	{
		temp = p[0];
		p[0] = p[1];
		p[1] = temp;
	}
	// Sort the pair (p[3], p[4]).
	if (p[3] > p[4])
	{
		temp = p[3];
		p[3] = p[4];
		p[4] = temp;
	}

	if (p[3] < p[0]) // eliminate the lowest
	{
		// Exchange the pair maxima so that (p[3], p[4]) becomes the surviving
		// sorted pair and p[1] keeps the partner of the eliminated value.
		// This has to be a real swap: the previous code only copied p[1] into p[4]
		// and lost the old p[4].
		temp = p[1];
		p[1] = p[4];
		p[4] = temp;
		p[3] = p[0];
	}
	// Bring in the fifth value and pair it with the leftover partner.
	p[0] = p[2];
	if (p[0] > p[1])
	{
		temp = p[0];
		p[0] = p[1];
		p[1] = temp;
	}
	if (p[0] < p[3]) // eliminate another lowest
	{
		temp = p[1];
		p[1] = p[4];
		p[4] = temp;
		p[0] = p[3];
	}
	// Two values below the median are gone; the median is the smaller of the two remaining candidates.
	if (p[4] < p[0])
		return p[4];
	else
		return p[0];
}


// Five-sample median around channel x of the texel at pos.
// The samples are: x two rows above, z of the left texel, x and z of the centre
// texel, and x two rows below. They are handed to DoMedian5.
uint Median5RedLeft(__read_only image2d_t imgin, int2 pos)
{
	const int2 leftPos = { pos.x - 1, pos.y };
	const int2 abovePos = { pos.x, pos.y - 2 };
	const int2 belowPos = { pos.x, pos.y + 2 };

	const ushort4 above = readImageus4(imgin, abovePos);
	const ushort4 left = readImageus4(imgin, leftPos);
	const ushort4 centre = readImageus4(imgin, pos);
	const ushort4 below = readImageus4(imgin, belowPos);

	uint samples[5] = { above.x, left.z, centre.x, centre.z, below.x };
	return DoMedian5(samples);
}


// Three-sample horizontal median around channel x of the texel at pos:
// z of the left texel, then x and z of the centre texel.
uint median3RedLeft(__read_only image2d_t imgin, int2 pos)
{
	const int2 leftPos = { pos.x - 1, pos.y };
	const ushort4 left = readImageus4(imgin, leftPos);
	const ushort4 centre = readImageus4(imgin, pos);

	uint samples[3] = { left.z, centre.x, centre.z };
	return domedian3(samples);
}


// Five-sample median around channel z of the texel at pos.
// The samples are: z two rows above, x and z of the centre texel, x of the right
// texel, and z two rows below. They are handed to DoMedian5.
uint Median5RedRight(__read_only image2d_t imgin, int2 pos)
{
	const int2 rightPos = { pos.x + 1, pos.y };
	const int2 abovePos = { pos.x, pos.y - 2 };
	const int2 belowPos = { pos.x, pos.y + 2 };

	const ushort4 above = readImageus4(imgin, abovePos);
	const ushort4 centre = readImageus4(imgin, pos);
	const ushort4 right = readImageus4(imgin, rightPos);
	const ushort4 below = readImageus4(imgin, belowPos);

	uint samples[5] = { above.z, centre.x, centre.z, right.x, below.z };
	return DoMedian5(samples);
}


// Three-sample horizontal median around channel z of the texel at pos:
// x and z of the centre texel, then x of the right texel.
uint median3RedRight(__read_only image2d_t imgin, int2 pos)
{
	const int2 rightPos = { pos.x + 1, pos.y };
	const ushort4 centre = readImageus4(imgin, pos);
	const ushort4 right = readImageus4(imgin, rightPos);

	uint samples[3] = { centre.x, centre.z, right.x };
	return domedian3(samples);
}


// Five-sample median around channel y of the texel at pos.
// The samples are: y two rows above, w of the left texel, y and w of the centre
// texel, and y two rows below. They are handed to DoMedian5.
uint Median5GrnLeft(__read_only image2d_t imgin, int2 pos)
{
	const int2 leftPos = { pos.x - 1, pos.y };
	const int2 abovePos = { pos.x, pos.y - 2 };
	const int2 belowPos = { pos.x, pos.y + 2 };

	const ushort4 above = readImageus4(imgin, abovePos);
	const ushort4 left = readImageus4(imgin, leftPos);
	const ushort4 centre = readImageus4(imgin, pos);
	const ushort4 below = readImageus4(imgin, belowPos);

	uint samples[5] = { above.y, left.w, centre.y, centre.w, below.y };
	return DoMedian5(samples);
}


// Three-sample horizontal median around channel y of the texel at pos:
// w of the left texel, then y and w of the centre texel.
uint median3GrnLeft(__read_only image2d_t imgin, int2 pos)
{
	const int2 leftPos = { pos.x - 1, pos.y };
	const ushort4 left = readImageus4(imgin, leftPos);
	const ushort4 centre = readImageus4(imgin, pos);

	uint samples[3] = { left.w, centre.y, centre.w };
	return domedian3(samples);
}


// Five-sample median around channel w of the texel at pos.
// The samples are: w two rows above, y and w of the centre texel, y of the right
// texel, and w two rows below. They are handed to DoMedian5.
uint Median5GrnRight(__read_only image2d_t imgin, int2 pos)
{
	const int2 rightPos = { pos.x + 1, pos.y };
	const int2 abovePos = { pos.x, pos.y - 2 };
	const int2 belowPos = { pos.x, pos.y + 2 };

	const ushort4 above = readImageus4(imgin, abovePos);
	const ushort4 centre = readImageus4(imgin, pos);
	const ushort4 right = readImageus4(imgin, rightPos);
	const ushort4 below = readImageus4(imgin, belowPos);

	uint samples[5] = { above.w, centre.y, centre.w, right.y, below.w };
	return DoMedian5(samples);
}


// Three-sample horizontal median around channel w of the texel at pos:
// y and w of the centre texel, then y of the right texel.
uint median3GrnRight(__read_only image2d_t imgin, int2 pos)
{
	const int2 rightPos = { pos.x + 1, pos.y };
	const ushort4 centre = readImageus4(imgin, pos);
	const ushort4 right = readImageus4(imgin, rightPos);

	uint samples[3] = { centre.y, centre.w, right.y };
	return domedian3(samples);
}


// Kernel: filters imgin with doBNR16FLOAT and writes one texel of imgout per
// output coordinate.
//
// imgin       source image; the texel for output (x, y) is read at
//             (x + 1 + kernSize / 4, y + kernSize + 4).
//             NOTE(review): this offset presumably skips a border that the host
//             adds around the input - confirm against the host code.
// ofs         forwarded to doBNR16FLOAT (subtracted from every channel).
// kernSize    only used for the input offset above.
// lumaSqtThr  forwarded to doBNR16FLOAT (similarity threshold scale).
// imgout      destination image; its width and height define the area to process.
//
// The output area is divided among the work-items of the 2D global range. Each
// work-item handles the half-open rectangle
//   [id * dim / size, (id + 1) * dim / size)   per axis,
// evaluated in integer arithmetic. The end of one work-item is therefore exactly
// the start of the next, and the last one ends exactly at the image size. The
// previous float computation (start + block vs. (id + 1) * block) could round
// differently and skip or repeat a row/column when the image size is not a
// multiple of the global size.
__kernel void BNR16(__read_only image2d_t imgin,
	const int ofs,
	const int kernSize,
	const float lumaSqtThr,
	__write_only image2d_t imgout)
{
	const int2 global_id = { (int)get_global_id(0), (int)get_global_id(1) };
	const int2 work_size = { (int)get_global_size(0), (int)get_global_size(1) };
	const int2 out_dim = { get_image_width(imgout), get_image_height(imgout) };

	// 64-bit intermediates keep id * dim from overflowing for large images.
	const int x_begin = (int)(((long)global_id.x * out_dim.x) / work_size.x);
	const int x_end = (int)(((long)(global_id.x + 1) * out_dim.x) / work_size.x);
	const int y_begin = (int)(((long)global_id.y * out_dim.y) / work_size.y);
	const int y_end = (int)(((long)(global_id.y + 1) * out_dim.y) / work_size.y);

	for (int y = y_begin; y < y_end; ++y)
	{
		bool leftmostisGreen = ((y & 1) == 1); // odd output rows: leftmost texel starts at green
		for (int x = x_begin; x < x_end; ++x)
		{
			int2 ipos = { x + 1 + kernSize / 4, y + kernSize + 4 };
			int4 opix = doBNR16FLOAT(imgin, ipos, ofs, lumaSqtThr, !leftmostisGreen);
			// Saturate so that negative results (after the ofs subtraction) clamp to 0.
			uint4 opixu = convert_uint4_sat(opix);
			int2 opos = { x, y };
			write_imageui(imgout, opos, opixu);
		}
	}
}


// Similarity-weighted averaging of one 4-channel texel over a window of
// +/-32 rows (every second row) by +/-8 texels.
//
// The function reads the 3x3 texel neighbourhood of pos and derives, for each of
// the four samples of the centre texel, a small feature set (averages of the
// neighbouring channels, a "luma" sum and channel differences). It then computes
// the same features for every candidate texel in the window and accumulates a
// candidate sample whenever all of its features lie within thresholds that are
// proportional to sqrt(luma) * lumaSqtThr.
//
// imgin       source image, read via readImageus4.
// pos         texel coordinate of the centre.
// ofs         value subtracted from every channel of the result.
// lumaSqtThr  scale applied to sqrt(luma) to obtain the similarity thresholds.
// isRedRow    true:  channels x,z are the "red"-type samples, y,w the "green"-type ones;
//             false: the roles are swapped (y,w "red"-type, x,z "green"-type).
//             NOTE(review): presumably this tracks the Bayer row parity - confirm with the caller.
//
// Returns the filtered texel minus ofs. A channel with fewer than two accepted
// candidates keeps the value of the centre texel.
int4 doBNR32FLOAT(__read_only image2d_t imgin, int2 pos, const int ofs,
	const float lumaSqtThr, bool isRedRow)
{
	// Coordinates and texels of the 3x3 neighbourhood around pos.
	int2 pLeft = { pos.x - 1, pos.y };
	int2 pRight = { pos.x + 1, pos.y };
	int2 pTop = { pos.x, pos.y - 1 };
	int2 pBot = { pos.x, pos.y + 1 };
	int2 pLT = { pos.x - 1, pos.y - 1 };
	int2 pRT = { pos.x + 1, pos.y - 1 };
	int2 pLBot = { pos.x - 1, pos.y + 1 };
	int2 pRBot = { pos.x + 1, pos.y + 1 };
	ushort4 rgrgCenter = readImageus4(imgin, pos);
	ushort4 rgrgLeft = readImageus4(imgin, pLeft);
	ushort4 rgrgRight = readImageus4(imgin, pRight);
	ushort4 rgrgTop = readImageus4(imgin, pTop);
	ushort4 rgrgBot = readImageus4(imgin, pBot);
	ushort4 rgrgLT = readImageus4(imgin, pLT);
	ushort4 rgrgRT = readImageus4(imgin, pRT);
	ushort4 rgrgLBot = readImageus4(imgin, pLBot);
	ushort4 rgrgRBot = readImageus4(imgin, pRBot);
	// Every float2 below holds one value per sample of the same type in the centre
	// texel: .x for the first sample, .y for the second one.
	float2 redvalRedC, grSumHRedC, grSumVRedC, bluegreenSumRedMC, blueSumRedC, green1ValG1C, redSumG1C, blueSumG1C, greenSumG1C;
	if (isRedRow)
	{
// "Red"-type samples of the centre texel
		redvalRedC = (float2)(rgrgCenter.x, rgrgCenter.z); // x, z are the centre "red"-type samples
		grSumHRedC = (float2)(rgrgLeft.w + rgrgCenter.y, rgrgCenter.y + rgrgCenter.w); // sum of the two horizontal neighbours
		grSumVRedC = (float2)(rgrgTop.x + rgrgBot.x, rgrgTop.z + rgrgBot.z);	// sum of the two vertical neighbours
		// Shared partial sums from the rows above and below: .x = Top.y + Bot.y, .y = Top.z + Bot.z.
		bluegreenSumRedMC = (float2)(rgrgTop.y + rgrgBot.y, rgrgTop.z + rgrgBot.z);
		blueSumRedC = (float2)(rgrgLT.w + rgrgLBot.w + bluegreenSumRedMC.x, rgrgTop.w + rgrgBot.w + bluegreenSumRedMC.x); // sum of the four diagonal neighbours

// "Green"-type samples of the centre texel
		green1ValG1C = (float2)(rgrgCenter.y, rgrgCenter.w); // y, w are the centre "green"-type samples
		redSumG1C = (float2)(rgrgCenter.x + rgrgCenter.z, rgrgCenter.z + rgrgRight.x); // horizontal neighbour sum
		blueSumG1C = (float2)(rgrgBot.y + rgrgTop.y, rgrgBot.w + rgrgTop.w); // vertical neighbour sum
		greenSumG1C = (float2)(rgrgTop.x + rgrgBot.x + bluegreenSumRedMC.y, rgrgRT.x + rgrgRBot.x + bluegreenSumRedMC.y); // diagonal neighbour sum
	}
	else
	{
// "Red"-type samples of the centre texel (roles swapped on this row type)
		redvalRedC = (float2)(rgrgCenter.y, rgrgCenter.w); // here y, w are the centre "red"-type samples
		grSumHRedC = (float2)(rgrgCenter.x + rgrgCenter.z, rgrgCenter.z + rgrgRight.x); // sum of the two horizontal neighbours
		grSumVRedC = (float2)(rgrgBot.y + rgrgTop.y, rgrgBot.w + rgrgTop.w);	// sum of the two vertical neighbours
		// Shared partial sums from the rows above and below: .x = Top.z + Bot.z, .y = Top.y + Bot.y.
		bluegreenSumRedMC = (float2)(rgrgTop.z + rgrgBot.z, rgrgTop.y + rgrgBot.y);
		blueSumRedC = (float2)(rgrgTop.x + rgrgBot.x + bluegreenSumRedMC.x, rgrgRT.x + rgrgRBot.x + bluegreenSumRedMC.x); // sum of the four diagonal neighbours
// "Green"-type samples of the centre texel
		green1ValG1C = (float2)(rgrgCenter.x, rgrgCenter.z); // here x, z are the centre "green"-type samples
		redSumG1C = (float2)(rgrgLeft.w + rgrgCenter.y, rgrgCenter.y + rgrgCenter.w); // horizontal neighbour sum
		blueSumG1C = (float2)(rgrgBot.x + rgrgTop.x, rgrgBot.z + rgrgTop.z); // vertical neighbour sum
		greenSumG1C = (float2)(rgrgLT.w + rgrgLBot.w + bluegreenSumRedMC.y, rgrgTop.w + rgrgBot.w + bluegreenSumRedMC.y); // diagonal neighbour sum
	}
	// Centre features for the "red"-type samples.
	float2 grSumRedC = grSumHRedC + grSumVRedC;
	float2 grAveRedC = grSumRedC / 4;
	float2 blueAveRedC = blueSumRedC / 4;
	float2 lumaRedC = redvalRedC + grAveRedC + blueAveRedC;
	float2 grBlueDiffRedC = grSumRedC - blueSumRedC;
	float2 redGrDiffRedC = grAveRedC - redvalRedC;
	// Centre features for the "green"-type samples.
	float2 redAveG1C = redSumG1C / 2;
	float2 greenAveG1C = greenSumG1C / 4;
	float2 blueAveG1C = blueSumG1C / 2;
	float2 lumaG1C = redAveG1C + green1ValG1C + blueAveG1C;
	float2 grRedDiffG1C = redAveG1C - greenAveG1C;
	float2 grBlueDiffG1C = blueAveG1C - greenAveG1C;
// Thresholds: every limit is sqrt(luma) * lumaSqtThr scaled by one of the fixed factors below.
	float fLumaSqtThr = lumaSqtThr;
	float fLumaGrSqtThr = lumaSqtThr;
	float fGrGrAtRedRelThr = .5;
	float fBlBlAtRedRelThr = .5;
	float fGrGrAtGrRelThr = .5;
	float fRdRdAtGreenRelThr = 1.;
	float fBlBlAtGreenRelThr = 1.;
	float fDiff4Thr = 3.;
	float fDiff24Thr = 3.;
	float fDiff14Thr = 3.;
	float2 lumaRThr = { sqrt(lumaRedC.x)* fLumaSqtThr, sqrt(lumaRedC.y)* fLumaSqtThr };
	float2 patternGRThr = { lumaRThr.x * fGrGrAtRedRelThr, lumaRThr.y * fGrGrAtRedRelThr };
	float2 patternBRThr = { lumaRThr.x * fBlBlAtRedRelThr, lumaRThr.y * fBlBlAtRedRelThr };
	float2 gBDiffRThr = { lumaRThr.x * fDiff4Thr,		lumaRThr.y * fDiff4Thr };
	float2 redGrDiffRThr = { lumaRThr.x * fDiff14Thr,	lumaRThr.y * fDiff14Thr };
	float2 lumaG1Thr = { sqrt(lumaG1C.x) * fLumaGrSqtThr,sqrt(lumaG1C.y) * fLumaGrSqtThr };
	float2 patternRG1Thr2 = { lumaG1Thr.x * fRdRdAtGreenRelThr,lumaG1Thr.y * fRdRdAtGreenRelThr };
	float2 patternBG1Thr2 = { lumaG1Thr.x * fBlBlAtGreenRelThr,lumaG1Thr.y * fBlBlAtGreenRelThr };
	float2 patternGG1Thr = { lumaG1Thr.x * fGrGrAtGrRelThr,	lumaG1Thr.y * fGrGrAtGrRelThr };
	float2 gRDiffG1Thr = { lumaG1Thr.x * fDiff24Thr,		lumaG1Thr.y * fDiff24Thr };
	float2 gBDiffG1Thr = { lumaG1Thr.x * fDiff24Thr,		lumaG1Thr.y * fDiff24Thr };
	// Accumulators: .x/.y belong to the two "red"-type samples, .z/.w to the two "green"-type ones.
	float4 sum = { 0,0,0,0 };
	int4  count = { 0,0,0,0 };
	// Search window: rows -32..32 in steps of 2, texel columns -8..8 in steps of 1.
	// The centre texel itself (i = 0, j = 0) is part of the scan.
	int skipY = 2;
	int skipX = 1;
	int cKernSizeY = 32;
	int cKernSizeX = 32 / 4;
	for (int j = -cKernSizeY;j <= cKernSizeY; j += skipY)
	{
		for (int i = -cKernSizeX;i <= cKernSizeX;i += skipX)
		{
			// 3x3 texel neighbourhood of the candidate position.
			int2 spos = { pos.x + i, pos.y + j };
			int2 pSLeft = { spos.x - 1, spos.y };
			int2 pSRight = { spos.x + 1, spos.y };
			int2 pSTop = { spos.x, spos.y - 1 };
			int2 pSBot = { spos.x, spos.y + 1 };
			int2 pSLT = { spos.x - 1, spos.y - 1 };
			int2 pSRT = { spos.x + 1, spos.y - 1 };
			int2 pSLBot = { spos.x - 1, spos.y + 1 };
			int2 pSRBot = { spos.x + 1, spos.y + 1 };

			ushort4 rgrgSearch = readImageus4(imgin, spos);
			ushort4 rgrgSLeft = readImageus4(imgin, pSLeft);
			ushort4 rgrgSRight = readImageus4(imgin, pSRight);
			ushort4 rgrgSTop = readImageus4(imgin, pSTop);
			ushort4 rgrgSBot = readImageus4(imgin, pSBot);
			ushort4 rgrgSLT = readImageus4(imgin, pSLT);
			ushort4 rgrgSRT = readImageus4(imgin, pSRT);
			ushort4 rgrgSLBot = readImageus4(imgin, pSLBot);
			ushort4 rgrgSRBot = readImageus4(imgin, pSRBot);
			// Candidate features, built with the same channel selection as for the centre.
			float2 redvalRedS, grSumHRedS, grSumVRedS, bluegreenSumRedMS, blueSumRedS, green1ValG1S, redSumG1S, blueSumG1S, greenSumG1S;
			if (isRedRow)
			{
				redvalRedS = (float2)(rgrgSearch.x, rgrgSearch.z); // x, z are the candidate "red"-type samples
				grSumHRedS = (float2)(rgrgSLeft.w + rgrgSearch.y, rgrgSearch.y + rgrgSearch.w);
				grSumVRedS = (float2)(rgrgSTop.x + rgrgSBot.x, rgrgSTop.z + rgrgSBot.z);
				bluegreenSumRedMS = (float2)(rgrgSTop.y + rgrgSBot.y, rgrgSTop.z + rgrgSBot.z);
				blueSumRedS = (float2)(rgrgSLT.w + rgrgSLBot.w + bluegreenSumRedMS.x,  // diagonal neighbours
										rgrgSTop.w + rgrgSBot.w + bluegreenSumRedMS.x);
				green1ValG1S = (float2)(rgrgSearch.y, rgrgSearch.w); // y, w are the candidate "green"-type samples
				redSumG1S = (float2)(rgrgSearch.x + rgrgSearch.z, rgrgSearch.z + rgrgSRight.x); // horizontal neighbour sum
				blueSumG1S = (float2)(rgrgSBot.y + rgrgSTop.y, rgrgSBot.w + rgrgSTop.w); // vertical neighbour sum
				greenSumG1S = (float2)(rgrgSTop.x + rgrgSBot.x + bluegreenSumRedMS.y,
					rgrgSRT.x + rgrgSRBot.x + bluegreenSumRedMS.y);
			}
			else
			{
				redvalRedS = (float2)(rgrgSearch.y, rgrgSearch.w); // here y, w are the candidate "red"-type samples
				grSumHRedS = (float2)(rgrgSearch.x + rgrgSearch.z, rgrgSearch.z + rgrgSRight.x);
				grSumVRedS = (float2)(rgrgSBot.y + rgrgSTop.y, rgrgSBot.w + rgrgSTop.w);
				bluegreenSumRedMS = (float2)(rgrgSTop.z + rgrgSBot.z, rgrgSTop.y + rgrgSBot.y);
				blueSumRedS = (float2)(rgrgSTop.x + rgrgSBot.x + bluegreenSumRedMS.x,
					rgrgSRT.x + rgrgSRBot.x + bluegreenSumRedMS.x);
				green1ValG1S = (float2)(rgrgSearch.x, rgrgSearch.z); // here x, z are the candidate "green"-type samples
				redSumG1S = (float2)(rgrgSLeft.w + rgrgSearch.y, rgrgSearch.y + rgrgSearch.w); // horizontal neighbour sum
				blueSumG1S = (float2)(rgrgSBot.x + rgrgSTop.x, rgrgSBot.z + rgrgSTop.z); // vertical neighbour sum
				greenSumG1S = (float2)(rgrgSLT.w + rgrgSLBot.w + bluegreenSumRedMS.y,
					rgrgSTop.w + rgrgSBot.w + bluegreenSumRedMS.y);
			}
			float2 grSumRedS = grSumHRedS + grSumVRedS;
			float2 grAveRedS = grSumRedS / 4;
			float2 blueAveRedS = blueSumRedS / 4;
			float2 lumaRedS = redvalRedS + grAveRedS + blueAveRedS;
			float2 grBlueDiffRedS = grSumRedS - blueSumRedS;
			float2 redGrDiffRedS = grAveRedS - redvalRedS;
			// "Red"-type search: candidate sample k against centre sample k (same slot).
			if ((fabs(lumaRedS.x- lumaRedC.x) <= lumaRThr.x) &&
				(fabs(grAveRedS.x- grAveRedC.x) <= patternGRThr.x) &&
				(fabs(blueAveRedS.x- blueAveRedC.x) <= patternBRThr.x) &&
				(fabs(grBlueDiffRedS.x- grBlueDiffRedC.x) <= gBDiffRThr.x) &&
				(fabs(redGrDiffRedS.x- redGrDiffRedC.x) <= redGrDiffRThr.x))
			{
				sum.x += redvalRedS.x;
				++count.x;
			}
			if ((fabs(lumaRedS.y- lumaRedC.y) <= lumaRThr.y) &&
				(fabs(grAveRedS.y- grAveRedC.y) <= patternGRThr.y) &&
				(fabs(blueAveRedS.y- blueAveRedC.y) <= patternBRThr.y) &&
				(fabs(grBlueDiffRedS.y- grBlueDiffRedC.y) <= gBDiffRThr.y) &&
				(fabs(redGrDiffRedS.y- redGrDiffRedC.y) <= redGrDiffRThr.y))
			{
				sum.y += redvalRedS.y;
				++count.y;
			}
// permutation: the other candidate slot is also tested (S.y against C.x, S.x against C.y)
			if ((fabs(lumaRedS.y- lumaRedC.x) <= lumaRThr.x) &&
				(fabs(grAveRedS.y- grAveRedC.x) <= patternGRThr.x) &&
				(fabs(blueAveRedS.y- blueAveRedC.x) <= patternBRThr.x) &&
				(fabs(grBlueDiffRedS.y- grBlueDiffRedC.x) <= gBDiffRThr.x) &&
				(fabs(redGrDiffRedS.y- redGrDiffRedC.x) <= redGrDiffRThr.x))
			{
				sum.x += redvalRedS.y;
				++count.x;
			}
			if ((fabs(lumaRedS.x- lumaRedC.y) <= lumaRThr.y) &&
				(fabs(grAveRedS.x- grAveRedC.y) <= patternGRThr.y) &&
				(fabs(blueAveRedS.x- blueAveRedC.y) <= patternBRThr.y) &&
				(fabs(grBlueDiffRedS.x- grBlueDiffRedC.y) <= gBDiffRThr.y) &&
				(fabs(redGrDiffRedS.x- redGrDiffRedC.y) <= redGrDiffRThr.y))
			{
				sum.y += redvalRedS.x;
				++count.y;
			}
// "Green"-type search: same scheme, with one additional test on the diagonal average
			float2 redAveG1S = redSumG1S / 2;
			float2 blueAveG1S = blueSumG1S / 2;
			float2 greenAveG1S = greenSumG1S / 4;
			float2 lumaG1S = redAveG1S + green1ValG1S + blueAveG1S;
			float2 grRedDiffG1S = redAveG1S - greenAveG1S;
			float2 grBlueDiffG1S = blueAveG1S - greenAveG1S;
			if ((fabs(lumaG1S.x- lumaG1C.x) <= lumaG1Thr.x) &&
				(fabs(redAveG1S.x- redAveG1C.x) <= patternRG1Thr2.x) &&
				(fabs(blueAveG1S.x- blueAveG1C.x) <= patternBG1Thr2.x) &&
				(fabs(grRedDiffG1S.x- grRedDiffG1C.x) <= gRDiffG1Thr.x) &&
				(fabs(grBlueDiffG1S.x- grBlueDiffG1C.x) <= gBDiffG1Thr.x) &&
				(fabs(greenAveG1S.x- greenAveG1C.x) <= patternGG1Thr.x))
			{
				sum.z += green1ValG1S.x;
				++count.z;
			}
			if ((fabs(lumaG1S.y- lumaG1C.y) <= lumaG1Thr.y) &&
				(fabs(redAveG1S.y- redAveG1C.y) <= patternRG1Thr2.y) &&
				(fabs(blueAveG1S.y- blueAveG1C.y) <= patternBG1Thr2.y) &&
				(fabs(grRedDiffG1S.y- grRedDiffG1C.y) <= gRDiffG1Thr.y) &&
				(fabs(grBlueDiffG1S.y- grBlueDiffG1C.y) <= gBDiffG1Thr.y) &&
				(fabs(greenAveG1S.y- greenAveG1C.y) <= patternGG1Thr.y))
			{
				sum.w += green1ValG1S.y;
				++count.w;
			}
// permutation: the other candidate slot is also tested (S.y against C.x, S.x against C.y)
			if ((fabs(lumaG1S.y - lumaG1C.x) <= lumaG1Thr.x) &&
				(fabs(redAveG1S.y - redAveG1C.x) <= patternRG1Thr2.x) &&
				(fabs(blueAveG1S.y - blueAveG1C.x) <= patternBG1Thr2.x) &&
				(fabs(grRedDiffG1S.y - grRedDiffG1C.x) <= gRDiffG1Thr.x) &&
				(fabs(grBlueDiffG1S.y - grBlueDiffG1C.x) <= gBDiffG1Thr.x) &&
				(fabs(greenAveG1S.y - greenAveG1C.x) <= patternGG1Thr.x))
			{
				sum.z += green1ValG1S.y;
				++count.z;
			}
			if ((fabs(lumaG1S.x - lumaG1C.y) <= lumaG1Thr.y) &&
				(fabs(redAveG1S.x - redAveG1C.y) <= patternRG1Thr2.y) &&
				(fabs(blueAveG1S.x - blueAveG1C.y) <= patternBG1Thr2.y) &&
				(fabs(grRedDiffG1S.x - grRedDiffG1C.y) <= gRDiffG1Thr.y) &&
				(fabs(grBlueDiffG1S.x - grBlueDiffG1C.y) <= gBDiffG1Thr.y) &&
				(fabs(greenAveG1S.x - greenAveG1C.y) <= patternGG1Thr.y))
			{
				sum.w += green1ValG1S.x;
				++count.w;
			}
		}
	}
	// Write the averages back into the channel slots they were taken from.
	// A channel is replaced only when at least two candidates were accepted.
	int4 opix;
	if (isRedRow)
	{
		// "red"-type results go to x,z; "green"-type results go to y,w.
		if (count.x > 1)
			opix.x = (int)(sum.x / count.x);
		else
			opix.x = rgrgCenter.x;
		if (count.z > 1)
			opix.y = (int)(sum.z / count.z);
		else
			opix.y = rgrgCenter.y;
		if (count.y > 1)
			opix.z = (int)(sum.y / count.y);
		else
			opix.z = rgrgCenter.z;
		if (count.w > 1)
			opix.w = (int)(sum.w / count.w);
		else
			opix.w = rgrgCenter.w;
	}
	else
	{// r,g permutation (!): "green"-type results go to x,z; "red"-type results go to y,w.
		if (count.z > 1)
			opix.x = (int)(sum.z / count.z);
		else
			opix.x = rgrgCenter.x;
		if (count.x > 1)
			opix.y = (int)(sum.x / count.x);
		else
			opix.y = rgrgCenter.y;
		if (count.w > 1)
			opix.z = (int)(sum.w / count.w);
		else
			opix.z = rgrgCenter.z;
		if (count.y > 1)
			opix.w = (int)(sum.y / count.y);
		else
			opix.w = rgrgCenter.w;
	}
	// ofs is subtracted from every channel.
	return  opix - ofs;
}


int2 doBNR32_2CH(__read_only image2d_t imgin, int2 pos, const int ofs,
				const float lumaSqtThr, bool isRedRow)
{
	int2 pLeft = { pos.x - 1, pos.y };
	int2 pRight = { pos.x + 1, pos.y };
	int2 pTop = { pos.x, pos.y - 1 };
	int2 pBot = { pos.x, pos.y + 1 };
	int2 pLT = { pos.x - 1, pos.y - 1 };
	int2 pRT = { pos.x + 1, pos.y - 1 };
	int2 pLBot = { pos.x - 1, pos.y + 1 };
	int2 pRBot = { pos.x + 1, pos.y + 1 };
	ushort2 rgrgCenter = readImageus2(imgin, pos);
	ushort2 rgrgLeft = readImageus2(imgin, pLeft);
	ushort2 rgrgRight = readImageus2(imgin, pRight);
	ushort2 rgrgTop = readImageus2(imgin, pTop);
	ushort2 rgrgBot = readImageus2(imgin, pBot);
	ushort2 rgrgLT = readImageus2(imgin, pLT);
	ushort2 rgrgRT = readImageus2(imgin, pRT);
	ushort2 rgrgLBot = readImageus2(imgin, pLBot);
	ushort2 rgrgRBot = readImageus2(imgin, pRBot);
	float redvalRedC, grSumHRedC, grSumVRedC, blueSumRedC, green1ValG1C, redSumG1C, blueSumG1C, greenSumG1C;
	if (isRedRow)
	{
// Red center
		redvalRedC = (float)rgrgCenter.x; // x, z are center R's
		grSumHRedC = (float)(rgrgLeft.y + rgrgCenter.y); // Horizontal 
		grSumVRedC = (float)(rgrgTop.x + rgrgBot.x);	// vertical 
		blueSumRedC = (float)(rgrgLT.y + rgrgLBot.y + rgrgTop.y + rgrgBot.y);
// Green center
		green1ValG1C = (float)rgrgCenter.y; // y, w are center G's
		redSumG1C = (float)(rgrgCenter.x + rgrgRight.x); // horizontal neigb sum
		blueSumG1C = (float)(rgrgBot.y + rgrgTop.y); // vertical neigb  sum
		greenSumG1C = (float)(rgrgTop.x + rgrgBot.x + rgrgRT.x + rgrgRBot.x);
	}
	else
	{
// Green center
		green1ValG1C = (float)rgrgCenter.x; // y, w are center G's
		redSumG1C = (float)(rgrgLeft.y + rgrgCenter.y); // horizontal neigb sum
		blueSumG1C = (float)(rgrgBot.x + rgrgTop.x); // vertical neigb  sum
		greenSumG1C = (float)(rgrgLT.y + rgrgLBot.y + rgrgTop.y + rgrgBot.y);
// "Red" (BLue) 
		redvalRedC = (float)rgrgCenter.y; // x, z are center R's
		grSumHRedC = (float)(rgrgCenter.x + rgrgRight.x); // Horizontal 
		grSumVRedC = (float)(rgrgBot.y + rgrgTop.y);	// vertical 
		blueSumRedC = (float)(rgrgTop.x + rgrgBot.x + rgrgRT.x + rgrgRBot.x);
	}
	float grSumRedC = grSumHRedC + grSumVRedC;
	float grAveRedC = grSumRedC / 4;
	float blueAveRedC = blueSumRedC / 4;
	float lumaRedC = redvalRedC + grAveRedC + blueAveRedC;
	float grBlueDiffRedC = grSumRedC - blueSumRedC;
	float redGrDiffRedC = grAveRedC - redvalRedC;
	float redAveG1C = redSumG1C / 2;
	float greenAveG1C = greenSumG1C / 4;
	float blueAveG1C = blueSumG1C / 2;
	float lumaG1C = redAveG1C + green1ValG1C + blueAveG1C;
	float grRedDiffG1C = redAveG1C - greenAveG1C;
	float grBlueDiffG1C = blueAveG1C - greenAveG1C;
//  Thresholds fill in 
	float fLumaSqtThr = lumaSqtThr;
	float fLumaGrSqtThr = lumaSqtThr;
	float fGrGrAtRedRelThr = .5;
	float fBlBlAtRedRelThr = .5;
	float fGrGrAtGrRelThr = .5;
	float fRdRdAtGreenRelThr = 1.;
	float fBlBlAtGreenRelThr = 1.;
	float fDiff4Thr = 3.;
	float fDiff24Thr = 3.;
	float fDiff14Thr = 3.;
	float lumaRThr = sqrt(lumaRedC)* fLumaSqtThr;
	float patternGRThr = lumaRThr * fGrGrAtRedRelThr;
	float patternBRThr = lumaRThr * fBlBlAtRedRelThr;
	float gBDiffRThr = lumaRThr * fDiff4Thr;
	float redGrDiffRThr = lumaRThr * fDiff14Thr;
	float lumaG1Thr = sqrt(lumaG1C) * fLumaGrSqtThr;
	float patternRG1Thr2 = lumaG1Thr * fRdRdAtGreenRelThr;
	float patternBG1Thr2 = lumaG1Thr * fBlBlAtGreenRelThr;
	float patternGG1Thr = lumaG1Thr * fGrGrAtGrRelThr;
	float gRDiffG1Thr = lumaG1Thr * fDiff24Thr;
	float gBDiffG1Thr = lumaG1Thr * fDiff24Thr;
	float2 sum = { 0,0 };
	int2  count = { 0,0 };
	int skipY = 2;
	int skipX = 1;
	int cKernSizeY = 32;
	int cKernSizeX = 32 / 2;
	for (int j = -cKernSizeY; j <= cKernSizeY; j += skipY)
	{
		for (int i = -cKernSizeX; i <= cKernSizeX; i += skipX)
		{
			int2 spos = { pos.x + i, pos.y + j };
			int2 pSLeft = { spos.x - 1, spos.y };
			int2 pSRight = { spos.x + 1, spos.y };
			int2 pSTop = { spos.x, spos.y - 1 };
			int2 pSBot = { spos.x, spos.y + 1 };
			int2 pSLT = { spos.x - 1, spos.y - 1 };
			int2 pSRT = { spos.x + 1, spos.y - 1 };
			int2 pSLBot = { spos.x - 1, spos.y + 1 };
			int2 pSRBot = { spos.x + 1, spos.y + 1 };

			ushort2 rgrgSearch = readImageus2(imgin, spos);
			ushort2 rgrgSLeft = readImageus2(imgin, pSLeft);
			ushort2 rgrgSRight = readImageus2(imgin, pSRight);
			ushort2 rgrgSTop = readImageus2(imgin, pSTop);
			ushort2 rgrgSBot = readImageus2(imgin, pSBot);
			ushort2 rgrgSLT = readImageus2(imgin, pSLT);
			ushort2 rgrgSRT = readImageus2(imgin, pSRT);
			ushort2 rgrgSLBot = readImageus2(imgin, pSLBot);
			ushort2 rgrgSRBot = readImageus2(imgin, pSRBot);
			float redvalRedS, grSumHRedS, grSumVRedS,  blueSumRedS, green1ValG1S, redSumG1S, blueSumG1S, greenSumG1S;
			if (isRedRow)
			{
// Red center
				redvalRedS = (float)rgrgSearch.x; // x, z are center R's
				grSumHRedS = (float)(rgrgSLeft.y + rgrgSearch.y); // Horizontal 
				grSumVRedS = (float)(rgrgSTop.x + rgrgSBot.x);	// vertical 
				blueSumRedS = (float)(rgrgSLT.y + rgrgSLBot.y + rgrgSTop.y + rgrgSBot.y);
// Green center
				green1ValG1S = (float)rgrgSearch.y; // y, w are center G's
				redSumG1S = (float)(rgrgSearch.x + rgrgSRight.x); // horizontal neigb sum
				blueSumG1S = (float)(rgrgSBot.y + rgrgSTop.y); // vertical neigb  sum
				greenSumG1S = (float)(rgrgSTop.x + rgrgSBot.x + rgrgSRT.x + rgrgSRBot.x);
			}
			else
			{
// Green center
				green1ValG1S = (float)rgrgSearch.x; // y, w are center G's
				redSumG1S = (float)(rgrgSLeft.y + rgrgSearch.y); // horizontal neigb sum
				blueSumG1S = (float)(rgrgSBot.x + rgrgSTop.x); // vertical neigb  sum
				greenSumG1S = (float)(rgrgSLT.y + rgrgSLBot.y + rgrgSTop.y + rgrgSBot.y);
// "Red" (BLue) 
				redvalRedS = (float)rgrgSearch.y; // x, z are center R's
				grSumHRedS = (float)(rgrgSearch.x + rgrgSRight.x); // Horizontal 
				grSumVRedS = (float)(rgrgSBot.y + rgrgSTop.y);	// vertical 
				blueSumRedS = (float)(rgrgSTop.x + rgrgSBot.x + rgrgSRT.x + rgrgSRBot.x);
			}
			float grSumRedS = grSumHRedS + grSumVRedS;
			float grAveRedS = grSumRedS / 4;
			float blueAveRedS = blueSumRedS / 4;
			float lumaRedS = redvalRedS + grAveRedS + blueAveRedS;
			float grBlueDiffRedS = grSumRedS - blueSumRedS;
			float redGrDiffRedS = grAveRedS - redvalRedS;
			if ((fabs(lumaRedS - lumaRedC) <= lumaRThr) &&
				(fabs(grAveRedS - grAveRedC) <= patternGRThr) &&
				(fabs(blueAveRedS - blueAveRedC) <= patternBRThr) &&
				(fabs(grBlueDiffRedS - grBlueDiffRedC) <= gBDiffRThr) &&
				(fabs(redGrDiffRedS - redGrDiffRedC) <= redGrDiffRThr))
			{
				sum.x += redvalRedS;
				++count.x;
			}
			// Green Search
			float redAveG1S = redSumG1S / 2;
			float blueAveG1S = blueSumG1S / 2;
			float greenAveG1S = greenSumG1S / 4;
			float lumaG1S = redAveG1S + green1ValG1S + blueAveG1S;
			float grRedDiffG1S = redAveG1S - greenAveG1S;
			float grBlueDiffG1S = blueAveG1S - greenAveG1S;
			if ((fabs(lumaG1S - lumaG1C) <= lumaG1Thr) &&
				(fabs(redAveG1S - redAveG1C) <= patternRG1Thr2) &&
				(fabs(blueAveG1S - blueAveG1C) <= patternBG1Thr2) &&
				(fabs(grRedDiffG1S - grRedDiffG1C) <= gRDiffG1Thr) &&
				(fabs(grBlueDiffG1S - grBlueDiffG1C) <= gBDiffG1Thr) &&
				(fabs(greenAveG1S - greenAveG1C) <= patternGG1Thr))
			{
				sum.y += green1ValG1S;
				++count.y;
			}
		}
	}
	int2 opix;
	if (isRedRow)
	{
		if (count.x > 1)
			opix.x = (int)(sum.x / count.x);
		else
			opix.x = rgrgCenter.x;
		if (count.y > 1)
			opix.y = (int)(sum.y / count.y);
		else
			opix.y = rgrgCenter.y;
	}
	else
	{// r,g permutation (!)
		if (count.y > 1)
			opix.x = (int)(sum.y / count.y);
		else
			opix.x = rgrgCenter.x;
		if (count.x > 1)
			opix.y = (int)(sum.x / count.x);
		else
			opix.y = rgrgCenter.y;
	}
	return opix - ofs;
}


/********************* SYNTHESIS **************************************************/
// Bayer noise reduction kernel: filters every output tuple with doBNR32FLOAT.
//   imgin      - source image, sampled through bnrSampler by the filter
//   ofs        - forwarded unchanged to doBNR32FLOAT
//   kernSize   - only used here to shift the read position inside imgin
//   lumaSqtThr - forwarded unchanged to doBNR32FLOAT
//   imgout     - destination image; its size defines the area to process
//
// The output is split into get_global_size(0) x get_global_size(1) tiles and
// each work-item processes one tile. Tile bounds are computed with integer
// arithmetic so that neighbouring tiles meet exactly and the last tile ends on
// the image edge. (Float tiling can round a boundary down, e.g. width 3604
// with 7 work-items ends at 3603.9998 and the last column is never written.)
__kernel void BNR32(__read_only image2d_t imgin,
	const int ofs,
	const int kernSize,
	const float lumaSqtThr,
	__write_only image2d_t imgout)
{
	const long tileX = (long)get_global_id(0);
	const long tileY = (long)get_global_id(1);
	const long tilesX = (long)get_global_size(0);
	const long tilesY = (long)get_global_size(1);
	const long outW = (long)get_image_width(imgout);
	const long outH = (long)get_image_height(imgout);
	// 64-bit products: id * dimension cannot overflow.
	const int xBegin = (int)((tileX * outW) / tilesX);
	const int xEnd = (int)(((tileX + 1) * outW) / tilesX);
	const int yBegin = (int)((tileY * outH) / tilesY);
	const int yEnd = (int)(((tileY + 1) * outH) / tilesY);
	for (int y = yBegin; y < yEnd; ++y)
	{
		const bool leftmostisGreen = ((y & 1) == 1); // leftmost rgba tuple starts at green ?
		for (int x = xBegin; x < xEnd; ++x)
		{
			// Read position is shifted right/down relative to the output position.
			const int2 ipos = (int2)(x + 1 + kernSize / 4, y + kernSize + 4);
			const int4 opix = doBNR32FLOAT(imgin, ipos, ofs, lumaSqtThr, !leftmostisGreen);		// 15.6Mpix/s
			// Saturate so negative results clamp to 0 instead of wrapping.
			write_imageui(imgout, (int2)(x, y), convert_uint4_sat(opix));
		}
	}
}


// First pass of the two-pass Bayer noise reduction: filters every output tuple
// with doBNR32Green.
//   imgin      - source image, sampled through bnrSampler by the filter
//   ofs        - forwarded unchanged to doBNR32Green
//   kernSize   - only used here to shift the read position inside imgin
//   lumaSqtThr - forwarded unchanged to doBNR32Green
//   imgOut     - destination image; its size defines the area to process
//
// The output is split into get_global_size(0) x get_global_size(1) tiles and
// each work-item processes one tile. Tile bounds use integer arithmetic so
// neighbouring tiles meet exactly and the last tile ends on the image edge
// (float tiling can round a boundary down and leave a row/column unwritten).
__kernel void BNR32Green(__read_only image2d_t imgin,
	const int ofs,
	const int kernSize,
	const float lumaSqtThr,
	__write_only image2d_t imgOut)
{
	const long tileX = (long)get_global_id(0);
	const long tileY = (long)get_global_id(1);
	const long tilesX = (long)get_global_size(0);
	const long tilesY = (long)get_global_size(1);
	const long outW = (long)get_image_width(imgOut);
	const long outH = (long)get_image_height(imgOut);
	// 64-bit products: id * dimension cannot overflow.
	const int xBegin = (int)((tileX * outW) / tilesX);
	const int xEnd = (int)(((tileX + 1) * outW) / tilesX);
	const int yBegin = (int)((tileY * outH) / tilesY);
	const int yEnd = (int)(((tileY + 1) * outH) / tilesY);
	for (int y = yBegin; y < yEnd; ++y)
	{
		const bool leftmostisGreen = ((y & 1) == 1); // leftmost rgba tuple starts at green ?
		for (int x = xBegin; x < xEnd; ++x)
		{
			// Read position is shifted right/down relative to the output position.
			const int2 ipos = (int2)(x + 1 + kernSize / 4, y + kernSize + 4);
			// Author's timing notes: doBNR32Green 40.4Mpix/s -> 2 passes needed -> 20.2Mpix/s;
			// the alternative doBNR32SingleGreen 57Mpix/s -> 4 passes needed -> 14.25Mpix/s.
			const int4 opix = doBNR32Green(imgin, ipos, ofs, lumaSqtThr, !leftmostisGreen);
			// Saturate so negative results clamp to 0 instead of wrapping.
			write_imageui(imgOut, (int2)(x, y), convert_uint4_sat(opix));
		}
	}
}


// Second pass of the two-pass Bayer noise reduction: filters every output
// tuple with doBNR32RedBlue and then overwrites the green samples with the
// ones stored in imgGreen.
//   imgin      - source image, sampled through bnrSampler by the filter
//   ofs        - forwarded unchanged to doBNR32RedBlue
//   kernSize   - only used here to shift the read position inside imgin
//   lumaSqtThr - forwarded unchanged to doBNR32RedBlue
//   imgGreen   - previously computed green result, read at the output position
//   imgOut     - destination image; its size defines the area to process
//
// The output is split into get_global_size(0) x get_global_size(1) tiles and
// each work-item processes one tile. Tile bounds use integer arithmetic so
// neighbouring tiles meet exactly and the last tile ends on the image edge
// (float tiling can round a boundary down and leave a row/column unwritten).
__kernel void BNR32RedBlue(__read_only image2d_t imgin,
	const int ofs,
	const int kernSize,
	const float lumaSqtThr,
	__read_only image2d_t imgGreen, // green result
	__write_only image2d_t imgOut)
{
	const long tileX = (long)get_global_id(0);
	const long tileY = (long)get_global_id(1);
	const long tilesX = (long)get_global_size(0);
	const long tilesY = (long)get_global_size(1);
	const long outW = (long)get_image_width(imgOut);
	const long outH = (long)get_image_height(imgOut);
	// 64-bit products: id * dimension cannot overflow.
	const int xBegin = (int)((tileX * outW) / tilesX);
	const int xEnd = (int)(((tileX + 1) * outW) / tilesX);
	const int yBegin = (int)((tileY * outH) / tilesY);
	const int yEnd = (int)(((tileY + 1) * outH) / tilesY);
	for (int y = yBegin; y < yEnd; ++y)
	{
		const bool leftmostisGreen = ((y & 1) == 1); // leftmost rgba tuple starts at green ?
		for (int x = xBegin; x < xEnd; ++x)
		{
			// Read position is shifted right/down relative to the output position.
			const int2 ipos = (int2)(x + 1 + kernSize / 4, y + kernSize + 4);
			const int2 opos = (int2)(x, y);
			// Saturate so negative results clamp to 0 instead of wrapping.
			uint4 opixu = convert_uint4_sat(doBNR32RedBlue(imgin, ipos, ofs, lumaSqtThr, !leftmostisGreen));
			const uint4 savedGreen = read_imageui(imgGreen, bnrSampler, opos); // OpenCl 1.1 ok
			// Greens sit in .x/.z when the row starts with green, otherwise in .y/.w.
			if (leftmostisGreen)
			{
				opixu.x = savedGreen.x;
				opixu.z = savedGreen.z;
			}
			else
			{
				opixu.y = savedGreen.y;
				opixu.w = savedGreen.w;
			}
			write_imageui(imgOut, opos, opixu);
		}
	}
}


// First pass of the two-pass Bayer noise reduction (16 variant): filters every
// output tuple with doBNR16Green.
//   imgin      - source image, sampled through bnrSampler by the filter
//   ofs        - forwarded unchanged to doBNR16Green
//   kernSize   - only used here to shift the read position inside imgin
//   lumaSqtThr - forwarded unchanged to doBNR16Green
//   imgOut     - destination image; its size defines the area to process
//
// The output is split into get_global_size(0) x get_global_size(1) tiles and
// each work-item processes one tile. Tile bounds use integer arithmetic so
// neighbouring tiles meet exactly and the last tile ends on the image edge
// (float tiling can round a boundary down and leave a row/column unwritten).
__kernel void BNR16Green(__read_only image2d_t imgin,
	const int ofs,
	const int kernSize,
	const float lumaSqtThr,
	__write_only image2d_t imgOut)
{
	const long tileX = (long)get_global_id(0);
	const long tileY = (long)get_global_id(1);
	const long tilesX = (long)get_global_size(0);
	const long tilesY = (long)get_global_size(1);
	const long outW = (long)get_image_width(imgOut);
	const long outH = (long)get_image_height(imgOut);
	// 64-bit products: id * dimension cannot overflow.
	const int xBegin = (int)((tileX * outW) / tilesX);
	const int xEnd = (int)(((tileX + 1) * outW) / tilesX);
	const int yBegin = (int)((tileY * outH) / tilesY);
	const int yEnd = (int)(((tileY + 1) * outH) / tilesY);
	for (int y = yBegin; y < yEnd; ++y)
	{
		const bool leftmostisGreen = ((y & 1) == 1); // leftmost rgba tuple starts at green ?
		for (int x = xBegin; x < xEnd; ++x)
		{
			// Read position is shifted right/down relative to the output position.
			const int2 ipos = (int2)(x + 1 + kernSize / 4, y + kernSize + 4);
			// Author's timing notes: doBNR16Green 40.4Mpix/s -> 2 passes needed -> 20.2Mpix/s;
			// the alternative doBNR16SingleGreen 57Mpix/s -> 4 passes needed -> 14.25Mpix/s.
			const int4 opix = doBNR16Green(imgin, ipos, ofs, lumaSqtThr, !leftmostisGreen);
			// Saturate so negative results clamp to 0 instead of wrapping.
			write_imageui(imgOut, (int2)(x, y), convert_uint4_sat(opix));
		}
	}
}


// Second pass of the two-pass Bayer noise reduction (16 variant): filters
// every output tuple with doBNR16RedBlue and then overwrites the green samples
// with the ones stored in imgGreen.
//   imgin      - source image, sampled through bnrSampler by the filter
//   ofs        - forwarded unchanged to doBNR16RedBlue
//   kernSize   - only used here to shift the read position inside imgin
//   lumaSqtThr - forwarded unchanged to doBNR16RedBlue
//   imgGreen   - previously computed green result, read at the output position
//   imgOut     - destination image; its size defines the area to process
//
// The output is split into get_global_size(0) x get_global_size(1) tiles and
// each work-item processes one tile. Tile bounds use integer arithmetic so
// neighbouring tiles meet exactly and the last tile ends on the image edge
// (float tiling can round a boundary down and leave a row/column unwritten).
__kernel void BNR16RedBlue(__read_only image2d_t imgin,
	const int ofs,
	const int kernSize,
	const float lumaSqtThr,
	__read_only image2d_t imgGreen, // green result
	__write_only image2d_t imgOut)
{
	const long tileX = (long)get_global_id(0);
	const long tileY = (long)get_global_id(1);
	const long tilesX = (long)get_global_size(0);
	const long tilesY = (long)get_global_size(1);
	const long outW = (long)get_image_width(imgOut);
	const long outH = (long)get_image_height(imgOut);
	// 64-bit products: id * dimension cannot overflow.
	const int xBegin = (int)((tileX * outW) / tilesX);
	const int xEnd = (int)(((tileX + 1) * outW) / tilesX);
	const int yBegin = (int)((tileY * outH) / tilesY);
	const int yEnd = (int)(((tileY + 1) * outH) / tilesY);
	for (int y = yBegin; y < yEnd; ++y)
	{
		const bool leftmostisGreen = ((y & 1) == 1); // leftmost rgba tuple starts at green ?
		for (int x = xBegin; x < xEnd; ++x)
		{
			// Read position is shifted right/down relative to the output position.
			const int2 ipos = (int2)(x + 1 + kernSize / 4, y + kernSize + 4);
			const int2 opos = (int2)(x, y);
			// Saturate so negative results clamp to 0 instead of wrapping.
			uint4 opixu = convert_uint4_sat(doBNR16RedBlue(imgin, ipos, ofs, lumaSqtThr, !leftmostisGreen));
			const uint4 savedGreen = read_imageui(imgGreen, bnrSampler, opos); // OpenCl 1.1 ok
			// Greens sit in .x/.z when the row starts with green, otherwise in .y/.w.
			if (leftmostisGreen)
			{
				opixu.x = savedGreen.x;
				opixu.z = savedGreen.z;
			}
			else
			{
				opixu.y = savedGreen.y;
				opixu.w = savedGreen.w;
			}
			write_imageui(imgOut, opos, opixu);
		}
	}
}


// Bayer noise reduction for two-sample pixels: filters every output pixel
// with doBNR32_2CH and stores the two results in .x/.y (with .z/.w = 0).
//   imgin      - source image, sampled through bnrSampler by the filter
//   ofs        - forwarded unchanged to doBNR32_2CH
//   kernSize   - only used here to shift the read position inside imgin
//   lumaSqtThr - forwarded unchanged to doBNR32_2CH
//   imgout     - destination image; its size defines the area to process
//
// The output is split into get_global_size(0) x get_global_size(1) tiles and
// each work-item processes one tile. Tile bounds use integer arithmetic so
// neighbouring tiles meet exactly and the last tile ends on the image edge
// (float tiling can round a boundary down and leave a row/column unwritten).
__kernel void BNR32_2CH(__read_only image2d_t imgin,
	const int ofs,
	const int kernSize,
	const float lumaSqtThr,
	__write_only image2d_t imgout)
{
	const long tileX = (long)get_global_id(0);
	const long tileY = (long)get_global_id(1);
	const long tilesX = (long)get_global_size(0);
	const long tilesY = (long)get_global_size(1);
	const long outW = (long)get_image_width(imgout);
	const long outH = (long)get_image_height(imgout);
	// 64-bit products: id * dimension cannot overflow.
	const int xBegin = (int)((tileX * outW) / tilesX);
	const int xEnd = (int)(((tileX + 1) * outW) / tilesX);
	const int yBegin = (int)((tileY * outH) / tilesY);
	const int yEnd = (int)(((tileY + 1) * outH) / tilesY);
	for (int y = yBegin; y < yEnd; ++y)
	{
		const bool leftmostisGreen = ((y & 1) == 1); // leftmost tuple starts at green ?
		for (int x = xBegin; x < xEnd; ++x)
		{
			// Note: this kernel shifts by kernSize / 2 and +2, unlike the 4-sample kernels.
			const int2 ipos = (int2)(x + 1 + kernSize / 2, y + kernSize + 2);
			const int2 opix = doBNR32_2CH(imgin, ipos, ofs, lumaSqtThr, !leftmostisGreen);	// 40.4Mpix/s -> 2 passes needed -> 20.2Mpix/s
			// Saturate so negative results clamp to 0; unused channels are written as 0.
			const uint4 opixu = (uint4)(convert_uint_sat(opix.x), convert_uint_sat(opix.y), 0, 0);
			write_imageui(imgout, (int2)(x, y), opixu);
		}
	}
}



// Subtracts the constant ofs from every channel of the input and writes the
// result, clamped to the unsigned range, to imgout.
//   imgin  - source image; read at (x + 1, y + 4) for output pixel (x, y)
//   ofs    - value subtracted from all four channels
//   imgout - destination image; its size defines the area to process
//
// The output is split into get_global_size(0) x get_global_size(1) tiles and
// each work-item processes one tile. Tile bounds use integer arithmetic so
// neighbouring tiles meet exactly and the last tile ends on the image edge
// (float tiling can round a boundary down and leave a row/column unwritten).
__kernel void Subtract_Black_Ofs(__read_only image2d_t imgin,
	const int ofs,
	__write_only image2d_t imgout)
{
	const long tileX = (long)get_global_id(0);
	const long tileY = (long)get_global_id(1);
	const long tilesX = (long)get_global_size(0);
	const long tilesY = (long)get_global_size(1);
	const long outW = (long)get_image_width(imgout);
	const long outH = (long)get_image_height(imgout);
	// 64-bit products: id * dimension cannot overflow.
	const int xBegin = (int)((tileX * outW) / tilesX);
	const int xEnd = (int)(((tileX + 1) * outW) / tilesX);
	const int yBegin = (int)((tileY * outH) / tilesY);
	const int yEnd = (int)(((tileY + 1) * outH) / tilesY);
	for (int y = yBegin; y < yEnd; ++y)
	{
		for (int x = xBegin; x < xEnd; ++x)
		{
			// NOTE(review): the fixed (1, 4) shift presumably skips an input margin - confirm.
			const int2 ipos = (int2)(x + 1, y + 4);
			const int4 opix = readImagei4(imgin, ipos) - ofs;
			// Saturate so values below the offset clamp to 0 instead of wrapping.
			write_imageui(imgout, (int2)(x, y), convert_uint4_sat(opix));
		}
	}
}


/********************* C C D **************************************************/
int4 doBNR32Ccd(__read_only image2d_t imgin, int2 pos, const int ofs,
	const float lumaThr, const float  channelThr, const float  patternThr, const float patternThr2, bool isRedRow)
{
	int2 pLeft = { pos.x - 1, pos.y };
	int2 pRight = { pos.x + 1, pos.y };
	int2 pTop = { pos.x, pos.y - 1 };
	int2 pBot = { pos.x, pos.y + 1 };
	int2 pLT = { pos.x - 1, pos.y - 1 };
	int2 pRT = { pos.x + 1, pos.y - 1 };
	int2 pLBot = { pos.x - 1, pos.y + 1 };
	int2 pRBot = { pos.x + 1, pos.y + 1 };
	ushort4 rgrgCenter = readImageus4(imgin, pos);
	ushort4 rgrgLeft = readImageus4(imgin, pLeft);
	ushort4 rgrgRight = readImageus4(imgin, pRight);
	ushort4 rgrgTop = readImageus4(imgin, pTop);
	ushort4 rgrgBot = readImageus4(imgin, pBot);
	ushort4 rgrgLT = readImageus4(imgin, pLT);
	ushort4 rgrgRT = readImageus4(imgin, pRT);
	ushort4 rgrgLBot = readImageus4(imgin, pLBot);
	ushort4 rgrgRBot = readImageus4(imgin, pRBot);
	float2 redvalRedC, grSumHRedC, grSumVRedC, bluegreenSumRedMC, blueSumRedC, green1ValG1C, redSumG1C, blueSumG1C, greenSumG1C;
	if (isRedRow)
	{
		redvalRedC = (float2)(rgrgCenter.x, rgrgCenter.z); // x, z are center R's
		grSumHRedC = (float2)(rgrgLeft.w + rgrgCenter.y, rgrgCenter.y + rgrgCenter.w); // Horizontal 
		grSumVRedC = (float2)(rgrgTop.x + rgrgBot.x, rgrgTop.z + rgrgBot.z);	// vertical 
		bluegreenSumRedMC = (float2)(rgrgTop.y + rgrgBot.y, rgrgTop.z + rgrgBot.z);
		blueSumRedC = (float2)(rgrgLT.w + rgrgLBot.w + bluegreenSumRedMC.x, rgrgTop.w + rgrgBot.w + bluegreenSumRedMC.x);
		green1ValG1C = (float2)(rgrgCenter.y, rgrgCenter.w); // y, w are center G's
		redSumG1C = (float2)(rgrgCenter.x + rgrgCenter.z, rgrgCenter.z + rgrgRight.x); // horizontal neigb sum
		blueSumG1C = (float2)(rgrgBot.y + rgrgTop.y, rgrgBot.w + rgrgTop.w); // vertical neigb  sum
		greenSumG1C = (float2)(rgrgTop.x + rgrgBot.x + bluegreenSumRedMC.y, rgrgRT.x + rgrgRBot.x + bluegreenSumRedMC.y);
	}
	else
	{
		redvalRedC = (float2)(rgrgCenter.y, rgrgCenter.w); // x, z are center R's
		grSumHRedC = (float2)(rgrgCenter.x + rgrgCenter.z, rgrgCenter.z + rgrgRight.x); // Horizontal 
		grSumVRedC = (float2)(rgrgBot.y + rgrgTop.y, rgrgBot.w + rgrgTop.w);	// vertical 
		bluegreenSumRedMC = (float2)(rgrgTop.z + rgrgBot.z, rgrgTop.y + rgrgBot.y); // green
		blueSumRedC = (float2)(rgrgTop.x + rgrgBot.x + bluegreenSumRedMC.x, rgrgRT.x + rgrgRBot.x + bluegreenSumRedMC.x);
		green1ValG1C = (float2)(rgrgCenter.x, rgrgCenter.z); // y, w are center G's
		redSumG1C = (float2)(rgrgLeft.w + rgrgCenter.y, rgrgCenter.y + rgrgCenter.w); // horizontal neigb sum
		blueSumG1C = (float2)(rgrgBot.x + rgrgTop.x, rgrgBot.z + rgrgTop.z); // vertical neigb  sum
		greenSumG1C = (float2)(rgrgLT.w + rgrgLBot.w + bluegreenSumRedMC.y, rgrgTop.w + rgrgBot.w + bluegreenSumRedMC.y);
	}
	float2 grSumRedC = grSumHRedC + grSumVRedC;
	float2 grAveRedC = grSumRedC / 4;
	float2 blueAveRedC = blueSumRedC / 4;
//	float2 lumaRedC = (redvalRedC + grAveRedC + blueAveRedC) / 3.;
	float2 lumaRedC = redvalRedC + grAveRedC + blueAveRedC;
	float2 grBlueDiffRedC = grSumRedC - blueSumRedC;
	float2 redGrDiffRedC = grAveRedC - redvalRedC;
	float2 redAveG1C = redSumG1C / 2;
//	float2 greenAveG1C = greenSumG1C / 4;
	float2 blueAveG1C = blueSumG1C / 2;
//	float2 lumaG1C = (redAveG1C + green1ValG1C + blueAveG1C) / 3.;
	float2 lumaG1C = redAveG1C + green1ValG1C + blueAveG1C;
	float4 sum = { 0,0,0,0 };
	int4  count = { 0,0,0,0 };
	int skipY = 2;
	int skipX = 1;
	int cKernSizeY = 32;
	int cKernSizeX = 32 / 4;
	float lumaThr_43 = (4. * lumaThr) / 3.;
	for (int j = -cKernSizeY; j <= cKernSizeY; j += skipY)
	{
		for (int i = -cKernSizeX; i <= cKernSizeX; i += skipX)
		{
			int2 spos = { pos.x + i, pos.y + j };
			int2 pSLeft = { spos.x - 1, spos.y };
			int2 pSRight = { spos.x + 1, spos.y };
			int2 pSTop = { spos.x, spos.y - 1 };
			int2 pSBot = { spos.x, spos.y + 1 };
			int2 pSLT = { spos.x - 1, spos.y - 1 };
			int2 pSRT = { spos.x + 1, spos.y - 1 };
			int2 pSLBot = { spos.x - 1, spos.y + 1 };
			int2 pSRBot = { spos.x + 1, spos.y + 1 };
			ushort4 rgrgSearch = readImageus4(imgin, spos);
			ushort4 rgrgSLeft = readImageus4(imgin, pSLeft);
			ushort4 rgrgSRight = readImageus4(imgin, pSRight);
			ushort4 rgrgSTop = readImageus4(imgin, pSTop);
			ushort4 rgrgSBot = readImageus4(imgin, pSBot);
			ushort4 rgrgSLT = readImageus4(imgin, pSLT);
			ushort4 rgrgSRT = readImageus4(imgin, pSRT);
			ushort4 rgrgSLBot = readImageus4(imgin, pSLBot);
			ushort4 rgrgSRBot = readImageus4(imgin, pSRBot);
			float2 redvalRedS, grSumHRedS, grSumVRedS, bluegreenSumRedMS, blueSumRedS, green1ValG1S, redSumG1S, blueSumG1S, greenSumG1S;
			if (isRedRow)
			{
				redvalRedS = (float2)(rgrgSearch.x, rgrgSearch.z); // x, z are center R's
				grSumHRedS = (float2)(rgrgSLeft.w + rgrgSearch.y, rgrgSearch.y + rgrgSearch.w);
				grSumVRedS = (float2)(rgrgSTop.x + rgrgSBot.x, rgrgSTop.z + rgrgSBot.z);
				bluegreenSumRedMS = (float2)(rgrgSTop.y + rgrgSBot.y, rgrgSTop.z + rgrgSBot.z);
				blueSumRedS = (float2)(rgrgSLT.w + rgrgSLBot.w + bluegreenSumRedMS.x,  // corners
									rgrgSTop.w + rgrgSBot.w + bluegreenSumRedMS.x);
				green1ValG1S = (float2)(rgrgSearch.y, rgrgSearch.w); // y, w are center G's
				redSumG1S = (float2)(rgrgSearch.x + rgrgSearch.z, rgrgSearch.z + rgrgSRight.x); // horizontal neigb sum
				blueSumG1S = (float2)(rgrgSBot.y + rgrgSTop.y, rgrgSBot.w + rgrgSTop.w); // vertical neigb  sum
				greenSumG1S = (float2)(rgrgSTop.x + rgrgSBot.x + bluegreenSumRedMS.y,
										rgrgSRT.x + rgrgSRBot.x + bluegreenSumRedMS.y);
			}
			else
			{
				redvalRedS = (float2)(rgrgSearch.y, rgrgSearch.w); // x, z are center R's
				grSumHRedS = (float2)(rgrgSearch.x + rgrgSearch.z, rgrgSearch.z + rgrgSRight.x);
				grSumVRedS = (float2)(rgrgSBot.y + rgrgSTop.y, rgrgSBot.w + rgrgSTop.w);
				bluegreenSumRedMS = (float2)(rgrgSTop.z + rgrgSBot.z, rgrgSTop.y + rgrgSBot.y);
				blueSumRedS = (float2)(rgrgSTop.x + rgrgSBot.x + bluegreenSumRedMS.x,
									rgrgSRT.x + rgrgSRBot.x + bluegreenSumRedMS.x);
				green1ValG1S = (float2)(rgrgSearch.x, rgrgSearch.z); // y, w are center G's
				redSumG1S = (float2)(rgrgSLeft.w + rgrgSearch.y, rgrgSearch.y + rgrgSearch.w); // horizontal neigb sum
				blueSumG1S = (float2)(rgrgSBot.x + rgrgSTop.x, rgrgSBot.z + rgrgSTop.z); // vertical neigb  sum
				greenSumG1S = (float2)(rgrgSLT.w + rgrgSLBot.w + bluegreenSumRedMS.y,
									rgrgSTop.w + rgrgSBot.w + bluegreenSumRedMS.y);
			}
			float2 grSumRedS = grSumHRedS + grSumVRedS;
			float2 grAveRedS = grSumRedS / 4;
			float2 blueAveRedS = blueSumRedS / 4;
//			float2 lumaRedS = (redvalRedS + grAveRedS + blueAveRedS) / 3.;
			float2 lumaRedS = redvalRedS + grAveRedS + blueAveRedS;
			if ((fabs(lumaRedS.x - lumaRedC.x) <= lumaThr) &&
				(fabs(redvalRedS.x - redvalRedC.x) <= channelThr) &&
				(fabs(grAveRedS.x - grAveRedC.x) <= patternThr) &&
				(fabs(blueAveRedS.x - blueAveRedC.x) <= patternThr))
			{
				sum.x += redvalRedS.x;
				++count.x;
			}
			if ((fabs(lumaRedS.y - lumaRedC.y) <= lumaThr) &&
				(fabs(redvalRedS.y - redvalRedC.y) <= channelThr) &&
				(fabs(grAveRedS.y - grAveRedC.y) <= patternThr) &&
				(fabs(blueAveRedS.y - blueAveRedC.y) <= patternThr))
			{
				sum.y += redvalRedS.y;
				++count.y;
			}
			if ((fabs(lumaRedS.y - lumaRedC.x) <= lumaThr) &&
				(fabs(redvalRedS.y - redvalRedC.x) <= channelThr) &&
				(fabs(grAveRedS.y - grAveRedC.x) <= patternThr) &&
				(fabs(blueAveRedS.y - blueAveRedC.x) <= patternThr))
			{
				sum.x += redvalRedS.y;
				++count.x;
			}
			if ((fabs(lumaRedS.x - lumaRedC.y) <= lumaThr) &&
				(fabs(redvalRedS.x - redvalRedC.y) <= channelThr) &&
				(fabs(grAveRedS.x - grAveRedC.y) <= patternThr) &&
				(fabs(blueAveRedS.x - blueAveRedC.y) <= patternThr))
			{
				sum.y += redvalRedS.x;
				++count.y;
			}
			float2 redAveG1S = redSumG1S / 2;
			float2 blueAveG1S = blueSumG1S / 2;
//			float2 greenAveG1S = greenSumG1S / 4;
//			float2 lumaG1S = (redAveG1S + green1ValG1S + blueAveG1S) / 3.;
			float2 lumaG1S = redAveG1S + green1ValG1S + blueAveG1S;
			if ((fabs(lumaG1S.x - lumaG1C.x) <= lumaThr) &&
				(fabs(green1ValG1S.x - green1ValG1C.x) <= channelThr) &&
				(fabs(redAveG1S.x - redAveG1C.x) <= patternThr2) &&
				(fabs(blueAveG1S.x - blueAveG1C.x) <= patternThr2) &&
				(fabs(greenSumG1S.x - greenSumG1C.x) <= lumaThr_43))
//				(fabs(greenAveG1S.x - greenAveG1C.x) <= lumaThr))
			{
				sum.z += green1ValG1S.x;
				++count.z;
			}
			if ((fabs(lumaG1S.y - lumaG1C.y) <= lumaThr) &&
				(fabs(green1ValG1S.y - green1ValG1C.y) <= channelThr) &&
				(fabs(redAveG1S.y - redAveG1C.y) <= patternThr2) &&
				(fabs(blueAveG1S.y - blueAveG1C.y) <= patternThr2) &&
				(fabs(greenSumG1S.y - greenSumG1C.y) <= lumaThr_43))
//				(fabs(greenAveG1S.y - greenAveG1C.y) <= lumaThr))
			{
				sum.w += green1ValG1S.y;
				++count.w;
			}
			if ((fabs(lumaG1S.y - lumaG1C.x) <= lumaThr) &&
				(fabs(green1ValG1S.y - green1ValG1C.x) <= channelThr) &&
				(fabs(redAveG1S.y - redAveG1C.x) <= patternThr2) &&
				(fabs(blueAveG1S.y - blueAveG1C.x) <= patternThr2) &&
				(fabs(greenSumG1S.y - greenSumG1C.x) <= lumaThr_43))
//				(fabs(greenAveG1S.y - greenAveG1C.x) <= lumaThr))
			{
				sum.z += green1ValG1S.y;
				++count.z;
			}
			if ((fabs(lumaG1S.x - lumaG1C.y) <= lumaThr) &&
				(fabs(green1ValG1S.x - green1ValG1C.y) <= channelThr) &&
				(fabs(redAveG1S.x - redAveG1C.y) <= patternThr2) &&
				(fabs(blueAveG1S.x - blueAveG1C.y) <= patternThr2) &&
				(fabs(greenSumG1S.x - greenSumG1C.y) <= lumaThr_43))
//				(fabs(greenAveG1S.x - greenAveG1C.y) <= lumaThr))
			{
				sum.w += green1ValG1S.x;
				++count.w;
			}
		}
	}
	int4 opix;
	if (isRedRow)
	{
		if (count.x > 1)
			opix.x = (int)(sum.x / count.x);
		else
			opix.x = rgrgCenter.x;
		if (count.z > 1)
			opix.y = (int)(sum.z / count.z);
		else
			opix.y = rgrgCenter.y;
		if (count.y > 1)
			opix.z = (int)(sum.y / count.y);
		else
			opix.z = rgrgCenter.z;
		if (count.w > 1)
			opix.w = (int)(sum.w / count.w);
		else
			opix.w = rgrgCenter.w;
	}
	else
	{// r,g permutation (!)
		if (count.z > 1)
			opix.x = (int)(sum.z / count.z);
		else
			opix.x = rgrgCenter.x;
		if (count.x > 1)
			opix.y = (int)(sum.x / count.x);
		else
			opix.y = rgrgCenter.y;
		if (count.w > 1)
			opix.z = (int)(sum.w / count.w);
		else
			opix.z = rgrgCenter.z;
		if (count.y > 1)
			opix.w = (int)(sum.y / count.y);
		else
			opix.w = rgrgCenter.w;
	}
	return  opix - ofs;
}


__kernel void BNR32CCD(__read_only image2d_t imgin,
	const int ofs,
	const int kernSize,
	const float lumaThr,
	const float channelThr,
	const float patternThr,
	const float patternThr2,
	__write_only image2d_t imgout)
{
	float2 global_id = { (float)get_global_id(0),	(float)get_global_id(1) };
	int2 work_size = { get_global_size(0),	get_global_size(1) };
	int2 out_dim = { get_image_width(imgout), get_image_height(imgout) };
	float2 block_size = { (float)out_dim.x / (float)work_size.x, (float)out_dim.y / (float)work_size.y };
	float2 work_start = global_id * block_size;
	float2 work_end = work_start + block_size;
	for (int y = (int)work_start.y; y < (int)work_end.y; ++y)
	{
		bool leftmostisGreen = ((y & 1) == 1); // lefmost rgba tuplet starts at green ?
		for (int x = (int)work_start.x; x < (int)work_end.x; ++x)
		{
			int2 ipos = { x + 1 + kernSize / 4, y + kernSize + 4 };
			int4 opix = doBNR32Ccd(imgin, ipos, ofs, lumaThr, channelThr, patternThr, patternThr2, !leftmostisGreen);// 15.6Mpix/s
			uint4 opixu = convert_uint4_sat(opix);
			int2 opos = { x,y };
			write_imageui(imgout, opos, opixu);
		}
	}
}


int4 doBNR16Ccd(__read_only image2d_t imgin, int2 pos, const int ofs,
	const float lumaThr, const float  channelThr, const float  patternThr, const float patternThr2, bool isRedRow)
{
	int2 pLeft = { pos.x - 1, pos.y };
	int2 pRight = { pos.x + 1, pos.y };
	int2 pTop = { pos.x, pos.y - 1 };
	int2 pBot = { pos.x, pos.y + 1 };
	int2 pLT = { pos.x - 1, pos.y - 1 };
	int2 pRT = { pos.x + 1, pos.y - 1 };
	int2 pLBot = { pos.x - 1, pos.y + 1 };
	int2 pRBot = { pos.x + 1, pos.y + 1 };
	ushort4 rgrgCenter = readImageus4(imgin, pos);
	ushort4 rgrgLeft = readImageus4(imgin, pLeft);
	ushort4 rgrgRight = readImageus4(imgin, pRight);
	ushort4 rgrgTop = readImageus4(imgin, pTop);
	ushort4 rgrgBot = readImageus4(imgin, pBot);
	ushort4 rgrgLT = readImageus4(imgin, pLT);
	ushort4 rgrgRT = readImageus4(imgin, pRT);
	ushort4 rgrgLBot = readImageus4(imgin, pLBot);
	ushort4 rgrgRBot = readImageus4(imgin, pRBot);
	float2 redvalRedC, grSumHRedC, grSumVRedC, bluegreenSumRedMC, blueSumRedC, green1ValG1C, redSumG1C, blueSumG1C, greenSumG1C;
	if (isRedRow)
	{
		redvalRedC = (float2)(rgrgCenter.x, rgrgCenter.z); // x, z are center R's
		grSumHRedC = (float2)(rgrgLeft.w + rgrgCenter.y, rgrgCenter.y + rgrgCenter.w); // Horizontal 
		grSumVRedC = (float2)(rgrgTop.x + rgrgBot.x, rgrgTop.z + rgrgBot.z);	// vertical 
		bluegreenSumRedMC = (float2)(rgrgTop.y + rgrgBot.y, rgrgTop.z + rgrgBot.z);
		blueSumRedC = (float2)(rgrgLT.w + rgrgLBot.w + bluegreenSumRedMC.x, rgrgTop.w + rgrgBot.w + bluegreenSumRedMC.x);
		green1ValG1C = (float2)(rgrgCenter.y, rgrgCenter.w); // y, w are center G's
		redSumG1C = (float2)(rgrgCenter.x + rgrgCenter.z, rgrgCenter.z + rgrgRight.x); // horizontal neigb sum
		blueSumG1C = (float2)(rgrgBot.y + rgrgTop.y, rgrgBot.w + rgrgTop.w); // vertical neigb  sum
		greenSumG1C = (float2)(rgrgTop.x + rgrgBot.x + bluegreenSumRedMC.y, rgrgRT.x + rgrgRBot.x + bluegreenSumRedMC.y);
	}
	else
	{
		redvalRedC = (float2)(rgrgCenter.y, rgrgCenter.w); // x, z are center R's
		grSumHRedC = (float2)(rgrgCenter.x + rgrgCenter.z, rgrgCenter.z + rgrgRight.x); // Horizontal 
		grSumVRedC = (float2)(rgrgBot.y + rgrgTop.y, rgrgBot.w + rgrgTop.w);	// vertical 
		bluegreenSumRedMC = (float2)(rgrgTop.z + rgrgBot.z, rgrgTop.y + rgrgBot.y); // green
		blueSumRedC = (float2)(rgrgTop.x + rgrgBot.x + bluegreenSumRedMC.x, rgrgRT.x + rgrgRBot.x + bluegreenSumRedMC.x);
		green1ValG1C = (float2)(rgrgCenter.x, rgrgCenter.z); // y, w are center G's
		redSumG1C = (float2)(rgrgLeft.w + rgrgCenter.y, rgrgCenter.y + rgrgCenter.w); // horizontal neigb sum
		blueSumG1C = (float2)(rgrgBot.x + rgrgTop.x, rgrgBot.z + rgrgTop.z); // vertical neigb  sum
		greenSumG1C = (float2)(rgrgLT.w + rgrgLBot.w + bluegreenSumRedMC.y, rgrgTop.w + rgrgBot.w + bluegreenSumRedMC.y);
	}
	float2 grSumRedC = grSumHRedC + grSumVRedC;
	float2 grAveRedC = grSumRedC / 4;
	float2 blueAveRedC = blueSumRedC / 4;
//	float2 lumaRedC = (redvalRedC + grAveRedC + blueAveRedC) / 3.;
	float2 lumaRedC = redvalRedC + grAveRedC + blueAveRedC;
	float2 grBlueDiffRedC = grSumRedC - blueSumRedC;
	float2 redGrDiffRedC = grAveRedC - redvalRedC;
	float2 redAveG1C = redSumG1C / 2;
//	float2 greenAveG1C = greenSumG1C / 4;
	float2 blueAveG1C = blueSumG1C / 2;
//	float2 lumaG1C = (redAveG1C + green1ValG1C + blueAveG1C) / 3.;
	float2 lumaG1C = redAveG1C + green1ValG1C + blueAveG1C;
	float4 sum = { 0,0,0,0 };
	int4  count = { 0,0,0,0 };
	int skipY = 2;
	int skipX = 1;
	int cKernSizeY = 16;
	int cKernSizeX = 16 / 4;
	float lumaThr_43 = (4. * lumaThr) / 3.;
	for (int j = -cKernSizeY; j <= cKernSizeY; j += skipY)
	{
		for (int i = -cKernSizeX; i <= cKernSizeX; i += skipX)
		{
			int2 spos = { pos.x + i, pos.y + j };
			int2 pSLeft = { spos.x - 1, spos.y };
			int2 pSRight = { spos.x + 1, spos.y };
			int2 pSTop = { spos.x, spos.y - 1 };
			int2 pSBot = { spos.x, spos.y + 1 };
			int2 pSLT = { spos.x - 1, spos.y - 1 };
			int2 pSRT = { spos.x + 1, spos.y - 1 };
			int2 pSLBot = { spos.x - 1, spos.y + 1 };
			int2 pSRBot = { spos.x + 1, spos.y + 1 };
			ushort4 rgrgSearch = readImageus4(imgin, spos);
			ushort4 rgrgSLeft = readImageus4(imgin, pSLeft);
			ushort4 rgrgSRight = readImageus4(imgin, pSRight);
			ushort4 rgrgSTop = readImageus4(imgin, pSTop);
			ushort4 rgrgSBot = readImageus4(imgin, pSBot);
			ushort4 rgrgSLT = readImageus4(imgin, pSLT);
			ushort4 rgrgSRT = readImageus4(imgin, pSRT);
			ushort4 rgrgSLBot = readImageus4(imgin, pSLBot);
			ushort4 rgrgSRBot = readImageus4(imgin, pSRBot);
			float2 redvalRedS, grSumHRedS, grSumVRedS, bluegreenSumRedMS, blueSumRedS, green1ValG1S, redSumG1S, blueSumG1S, greenSumG1S;
			if (isRedRow)
			{
				redvalRedS = (float2)(rgrgSearch.x, rgrgSearch.z); // x, z are center R's
				grSumHRedS = (float2)(rgrgSLeft.w + rgrgSearch.y, rgrgSearch.y + rgrgSearch.w);
				grSumVRedS = (float2)(rgrgSTop.x + rgrgSBot.x, rgrgSTop.z + rgrgSBot.z);
				bluegreenSumRedMS = (float2)(rgrgSTop.y + rgrgSBot.y, rgrgSTop.z + rgrgSBot.z);
				blueSumRedS = (float2)(rgrgSLT.w + rgrgSLBot.w + bluegreenSumRedMS.x,  // corners
					rgrgSTop.w + rgrgSBot.w + bluegreenSumRedMS.x);
				green1ValG1S = (float2)(rgrgSearch.y, rgrgSearch.w); // y, w are center G's
				redSumG1S = (float2)(rgrgSearch.x + rgrgSearch.z, rgrgSearch.z + rgrgSRight.x); // horizontal neigb sum
				blueSumG1S = (float2)(rgrgSBot.y + rgrgSTop.y, rgrgSBot.w + rgrgSTop.w); // vertical neigb  sum
				greenSumG1S = (float2)(rgrgSTop.x + rgrgSBot.x + bluegreenSumRedMS.y,
					rgrgSRT.x + rgrgSRBot.x + bluegreenSumRedMS.y);
			}
			else
			{
				redvalRedS = (float2)(rgrgSearch.y, rgrgSearch.w); // x, z are center R's
				grSumHRedS = (float2)(rgrgSearch.x + rgrgSearch.z, rgrgSearch.z + rgrgSRight.x);
				grSumVRedS = (float2)(rgrgSBot.y + rgrgSTop.y, rgrgSBot.w + rgrgSTop.w);
				bluegreenSumRedMS = (float2)(rgrgSTop.z + rgrgSBot.z, rgrgSTop.y + rgrgSBot.y);
				blueSumRedS = (float2)(rgrgSTop.x + rgrgSBot.x + bluegreenSumRedMS.x,
					rgrgSRT.x + rgrgSRBot.x + bluegreenSumRedMS.x);
				green1ValG1S = (float2)(rgrgSearch.x, rgrgSearch.z); // y, w are center G's
				redSumG1S = (float2)(rgrgSLeft.w + rgrgSearch.y, rgrgSearch.y + rgrgSearch.w); // horizontal neigb sum
				blueSumG1S = (float2)(rgrgSBot.x + rgrgSTop.x, rgrgSBot.z + rgrgSTop.z); // vertical neigb  sum
				greenSumG1S = (float2)(rgrgSLT.w + rgrgSLBot.w + bluegreenSumRedMS.y,
					rgrgSTop.w + rgrgSBot.w + bluegreenSumRedMS.y);
			}
			float2 grSumRedS = grSumHRedS + grSumVRedS;
			float2 grAveRedS = grSumRedS / 4;
			float2 blueAveRedS = blueSumRedS / 4;
//			float2 lumaRedS = (redvalRedS + grAveRedS + blueAveRedS) / 3.;
			float2 lumaRedS = redvalRedS + grAveRedS + blueAveRedS;
			if ((fabs(lumaRedS.x - lumaRedC.x) <= lumaThr) &&
				(fabs(redvalRedS.x - redvalRedC.x) <= channelThr) &&
				(fabs(grAveRedS.x - grAveRedC.x) <= patternThr) &&
				(fabs(blueAveRedS.x - blueAveRedC.x) <= patternThr))
			{
				sum.x += redvalRedS.x;
				++count.x;
			}
			if ((fabs(lumaRedS.y - lumaRedC.y) <= lumaThr) &&
				(fabs(redvalRedS.y - redvalRedC.y) <= channelThr) &&
				(fabs(grAveRedS.y - grAveRedC.y) <= patternThr) &&
				(fabs(blueAveRedS.y - blueAveRedC.y) <= patternThr))
			{
				sum.y += redvalRedS.y;
				++count.y;
			}
			if ((fabs(lumaRedS.y - lumaRedC.x) <= lumaThr) &&
				(fabs(redvalRedS.y - redvalRedC.x) <= channelThr) &&
				(fabs(grAveRedS.y - grAveRedC.x) <= patternThr) &&
				(fabs(blueAveRedS.y - blueAveRedC.x) <= patternThr))
			{
				sum.x += redvalRedS.y;
				++count.x;
			}
			if ((fabs(lumaRedS.x - lumaRedC.y) <= lumaThr) &&
				(fabs(redvalRedS.x - redvalRedC.y) <= channelThr) &&
				(fabs(grAveRedS.x - grAveRedC.y) <= patternThr) &&
				(fabs(blueAveRedS.x - blueAveRedC.y) <= patternThr))
			{
				sum.y += redvalRedS.x;
				++count.y;
			}
			float2 redAveG1S = redSumG1S / 2;
			float2 blueAveG1S = blueSumG1S / 2;
//			float2 greenAveG1S = greenSumG1S / 4;
//			float2 lumaG1S = (redAveG1S + green1ValG1S + blueAveG1S) / 3.;
			float2 lumaG1S = redAveG1S + green1ValG1S + blueAveG1S;
			if ((fabs(lumaG1S.x - lumaG1C.x) <= lumaThr) &&
				(fabs(green1ValG1S.x - green1ValG1C.x) <= channelThr) &&
				(fabs(redAveG1S.x - redAveG1C.x) <= patternThr2) &&
				(fabs(blueAveG1S.x - blueAveG1C.x) <= patternThr2) &&
				(fabs(greenSumG1S.x - greenSumG1C.x) <= lumaThr_43))
//				(fabs(greenAveG1S.x - greenAveG1C.x) <= lumaThr))
			{
				sum.z += green1ValG1S.x;
				++count.z;
			}
			if ((fabs(lumaG1S.y - lumaG1C.y) <= lumaThr) &&
				(fabs(green1ValG1S.y - green1ValG1C.y) <= channelThr) &&
				(fabs(redAveG1S.y - redAveG1C.y) <= patternThr2) &&
				(fabs(blueAveG1S.y - blueAveG1C.y) <= patternThr2) &&
				(fabs(greenSumG1S.y - greenSumG1C.y) <= lumaThr_43))
//				(fabs(greenAveG1S.y - greenAveG1C.y) <= lumaThr))
			{
				sum.w += green1ValG1S.y;
				++count.w;
			}
			if ((fabs(lumaG1S.y - lumaG1C.x) <= lumaThr) &&
				(fabs(green1ValG1S.y - green1ValG1C.x) <= channelThr) &&
				(fabs(redAveG1S.y - redAveG1C.x) <= patternThr2) &&
				(fabs(blueAveG1S.y - blueAveG1C.x) <= patternThr2) &&
				(fabs(greenSumG1S.y - greenSumG1C.x) <= lumaThr_43))
//				(fabs(greenAveG1S.y - greenAveG1C.x) <= lumaThr))
			{
				sum.z += green1ValG1S.y;
				++count.z;
			}
			if ((fabs(lumaG1S.x - lumaG1C.y) <= lumaThr) &&
				(fabs(green1ValG1S.x - green1ValG1C.y) <= channelThr) &&
				(fabs(redAveG1S.x - redAveG1C.y) <= patternThr2) &&
				(fabs(blueAveG1S.x - blueAveG1C.y) <= patternThr2) &&
				(fabs(greenSumG1S.x - greenSumG1C.y) <= lumaThr_43))
//				(fabs(greenAveG1S.x - greenAveG1C.y) <= lumaThr))
			{
				sum.w += green1ValG1S.x;
				++count.w;
			}
		}
	}
	int4 opix;
	if (isRedRow)
	{
		if (count.x > 1)
			opix.x = (int)(sum.x / count.x);
		else
			opix.x = rgrgCenter.x;
		if (count.z > 1)
			opix.y = (int)(sum.z / count.z);
		else
			opix.y = rgrgCenter.y;
		if (count.y > 1)
			opix.z = (int)(sum.y / count.y);
		else
			opix.z = rgrgCenter.z;
		if (count.w > 1)
			opix.w = (int)(sum.w / count.w);
		else
			opix.w = rgrgCenter.w;
	}
	else
	{// r,g permutation (!)
		if (count.z > 1)
			opix.x = (int)(sum.z / count.z);
		else
			opix.x = rgrgCenter.x;
		if (count.x > 1)
			opix.y = (int)(sum.x / count.x);
		else
			opix.y = rgrgCenter.y;
		if (count.w > 1)
			opix.z = (int)(sum.w / count.w);
		else
			opix.z = rgrgCenter.z;
		if (count.y > 1)
			opix.w = (int)(sum.y / count.y);
		else
			opix.w = rgrgCenter.w;
	}
	return  opix - ofs;
}


/*
 * Kernel entry point that runs doBNR16Ccd over the output image.
 *
 * The output image is divided evenly between the work-items of the 2D global
 * range; each work-item walks its own tile. For output pixel (x, y) the input
 * is sampled at (x + kernSize/4 + 1, y + kernSize + 4), the result is
 * saturated to unsigned and stored at (x, y).
 *
 * ofs, lumaThr, channelThr, patternThr and patternThr2 are forwarded
 * unchanged to doBNR16Ccd.
 */
__kernel void BNR16CCD(__read_only image2d_t imgin,
	const int ofs,
	const int kernSize,
	const float lumaThr,
	const float channelThr,
	const float patternThr,
	const float patternThr2,
	__write_only image2d_t imgout)
{
	// Tile geometry: fractional tile size in float, truncated to pixel indices.
	const int2 outSize = (int2)(get_image_width(imgout), get_image_height(imgout));
	const int2 gridSize = (int2)((int)get_global_size(0), (int)get_global_size(1));
	const float2 tile = convert_float2(outSize) / convert_float2(gridSize);
	const float2 tileOrigin = (float2)((float)get_global_id(0), (float)get_global_id(1)) * tile;
	const float2 tileLimit = tileOrigin + tile;
	const int2 first = convert_int2(tileOrigin);
	const int2 stop = convert_int2(tileLimit);
	// Constant displacement from an output coordinate to its input coordinate.
	const int2 srcShift = (int2)((kernSize / 4) + 1, kernSize + 4);

	for (int y = first.y; y < stop.y; ++y)
	{
		// Even output rows are flagged as red rows; on odd rows the leftmost
		// tuplet is taken to start at green.
		const bool rowIsRed = ((y & 1) == 0);
		for (int x = first.x; x < stop.x; ++x)
		{
			const int2 dst = (int2)(x, y);
			const int4 filtered = doBNR16Ccd(imgin, dst + srcShift, ofs, lumaThr, channelThr,
				patternThr, patternThr2, rowIsRed);// 15.6Mpix/s
			write_imageui(imgout, dst, convert_uint4_sat(filtered));
		}
	}
}


/* Returns the largest of the four components of v. */
float max4(float4 v)
{
	// v.even = (x, z), v.odd = (y, w): pairwise maxima of (x,y) and (z,w).
	const float2 pairMax = fmax(v.even, v.odd);
	return fmax(pairMax.x, pairMax.y);
}


/* Component-wise maximum of four vectors. */
float4 max4v(float4 v0, float4 v1, float4 v2, float4 v3 )
{
	return fmax(fmax(v0, v1), fmax(v2, v3));
}


/*
 * Linear extrapolation from two image tuplets.
 *
 * The tuplet at pos is divided component-wise by posPos2Ratio; the difference
 * between that scaled tuplet and the raw tuplet at pos2 is multiplied by
 * extraPolFact and added back to the scaled tuplet.
 */
float4 GetDiff(__read_only image2d_t imgin, int2 pos2, int2 pos, float4 posPos2Ratio, const float extraPolFact)
{
	const float4 nearScaled = convert_float4(readImageus4(imgin, pos)) / posPos2Ratio;
	const float4 farRaw = convert_float4(readImageus4(imgin, pos2));
	const float4 slope = nearScaled - farRaw;
	return nearScaled + extraPolFact * slope;
}


/*
 * Returns the four samples that start one sample to the left of the tuplet at
 * pos: the last sample of the tuplet at x-1 followed by the first three
 * samples of the tuplet at pos.
 */
ushort4 GetLeft(__read_only image2d_t imgin, int2 pos)
{
	const ushort4 here = readImageus4(imgin, pos);
	const ushort4 before = readImageus4(imgin, (int2)(pos.x - 1, pos.y));
	return (ushort4)(before.w, here.x, here.y, here.z);
}


/*
 * Returns the four samples that start one sample to the right of the tuplet
 * at pos: the last three samples of the tuplet at pos followed by the first
 * sample of the tuplet at x+1.
 */
ushort4 GetRight(__read_only image2d_t imgin, int2 pos)
{
	const ushort4 here = readImageus4(imgin, pos);
	const ushort4 after = readImageus4(imgin, (int2)(pos.x + 1, pos.y));
	return (ushort4)(here.y, here.z, here.w, after.x);
}


/*
 * Returns the four samples that start two samples to the left of the tuplet
 * at pos: the last two samples of the tuplet at x-1 followed by the first two
 * samples of the tuplet at pos.
 */
ushort4 GetLeft2(__read_only image2d_t imgin, int2 pos)
{
	const ushort4 here = readImageus4(imgin, pos);
	const ushort4 before = readImageus4(imgin, (int2)(pos.x - 1, pos.y));
	return (ushort4)(before.z, before.w, here.x, here.y);
}


/*
 * Returns the four samples that start two samples to the right of the tuplet
 * at pos: the last two samples of the tuplet at pos followed by the first two
 * samples of the tuplet at x+1.
 */
ushort4 GetRight2(__read_only image2d_t imgin, int2 pos)
{
	const ushort4 here = readImageus4(imgin, pos);
	const ushort4 after = readImageus4(imgin, (int2)(pos.x + 1, pos.y));
	return (ushort4)(here.z, here.w, after.x, after.y);
}


/*
 * Same extrapolation as GetDiff, but on horizontally shifted sample windows.
 *
 * With left == true the near window is GetLeft(pos) (shifted by one sample)
 * and the far window is GetLeft2(pos2) (shifted by two samples); otherwise
 * GetRight / GetRight2 are used. The near window is divided by posPos2Ratio,
 * and extraPolFact times (scaled near - far) is added to it.
 */
float4 GetDiffOfs(__read_only image2d_t imgin, int2 pos2, int2 pos, float4 posPos2Ratio, const float extraPolFact,
	bool left)
{
	ushort4 nearRaw;
	ushort4 farRaw;
	if (left)
	{
		nearRaw = GetLeft(imgin, pos);
		farRaw = GetLeft2(imgin, pos2);
	}
	else
	{
		nearRaw = GetRight(imgin, pos);
		farRaw = GetRight2(imgin, pos2);
	}
	const float4 nearScaled = convert_float4(nearRaw) / posPos2Ratio;
	const float4 slope = nearScaled - convert_float4(farRaw);
	return nearScaled + extraPolFact * slope;
}


/*
 * Extrapolates a value for each sample of the tuplet at pos from eight
 * directions (up, down, left, right and the four diagonals) and combines them.
 *
 * Every direction uses two windows (one and two steps away from the centre),
 * rescaled with ratios built from grRedRatio and greenBlueRatio; which ratio
 * goes to which component depends on isRedRow.
 *
 * cluster == false: the result is the component-wise maximum of all eight
 *                   extrapolations.
 * cluster == true : opposite directions are first combined with a minimum
 *                   (left/right, top/bottom, TL/BR, TR/BL) and the maximum of
 *                   those four minima is returned.
 */
float4 ExtraPolTest(__read_only image2d_t imgin, int2 pos, bool isRedRow, float2 grRedRatio, float2 greenBlueRatio,
	float extraPolFact, const bool cluster)
{
	// Per-component scale factors for the vertical, horizontal and diagonal
	// neighbours; the component order flips between the two row types.
	float4 verticalRatio;
	float4 horizontalRatio;
	float4 diagonalRatio;
	if (isRedRow)
	{
		verticalRatio = (float4)(grRedRatio.x, 1.f / greenBlueRatio.x,
								 grRedRatio.y, 1.f / greenBlueRatio.y);
		horizontalRatio = (float4)(grRedRatio.x, 1.f / grRedRatio.x,
								   grRedRatio.y, 1.f / grRedRatio.y);
		diagonalRatio = (float4)(verticalRatio.x / greenBlueRatio.x, 1.f,
								 verticalRatio.z / greenBlueRatio.y, 1.f);
	}
	else
	{
		verticalRatio = (float4)(1.f / greenBlueRatio.x, grRedRatio.x,
								 1.f / greenBlueRatio.y, grRedRatio.y);
		horizontalRatio = (float4)(1.f / greenBlueRatio.x, greenBlueRatio.x,
								   1.f / greenBlueRatio.y, greenBlueRatio.y);
		diagonalRatio = (float4)(1.f, verticalRatio.x / greenBlueRatio.x,
								 1.f, verticalRatio.z / greenBlueRatio.y);
	}

	const int2 above1 = (int2)(pos.x, pos.y - 1);
	const int2 above2 = (int2)(pos.x, pos.y - 2);
	const int2 below1 = (int2)(pos.x, pos.y + 1);
	const int2 below2 = (int2)(pos.x, pos.y + 2);

	// Straight directions.
	const float4 fromTop = GetDiff(imgin, above2, above1, verticalRatio, extraPolFact);
	const float4 fromBottom = GetDiff(imgin, below2, below1, verticalRatio, extraPolFact);
	const float4 fromLeft = GetDiffOfs(imgin, pos, pos, horizontalRatio, extraPolFact, true);
	const float4 fromRight = GetDiffOfs(imgin, pos, pos, horizontalRatio, extraPolFact, false);
	// Diagonal directions.
	const float4 fromTL = GetDiffOfs(imgin, above2, above1, diagonalRatio, extraPolFact, true);
	const float4 fromTR = GetDiffOfs(imgin, above2, above1, diagonalRatio, extraPolFact, false);
	const float4 fromBL = GetDiffOfs(imgin, below2, below1, diagonalRatio, extraPolFact, true);
	const float4 fromBR = GetDiffOfs(imgin, below2, below1, diagonalRatio, extraPolFact, false);

	if (!cluster)
	{
		return fmax(max4v(fromTop, fromBottom, fromLeft, fromRight),
					max4v(fromTL, fromTR, fromBL, fromBR));
	}

	// Cluster mode: both sides of an axis must agree, hence min() per axis.
	const float4 straightMax = fmax(fmin(fromLeft, fromRight), fmin(fromTop, fromBottom));
	const float4 diagonalMax = fmax(fmin(fromTL, fromBR), fmin(fromTR, fromBL));
	return fmax(straightMax, diagonalMax);
}


// Hot-pixel statistics window. The hot-pixel functions step HOTKERNSIZEX/4
// four-sample tuplets to each side and HOTKERNSIZEY rows up and down.
#define HOTKERNSIZEX (4) 
#define HOTKERNSIZEY (4) 
// The filter window is 3*HOTKERNSIZEX = 12 pixels wide and 2*HOTKERNSIZEY + 1 = 9 pixels high.
// Red centre: 6*5 - 1 = 29 samples (centre pixel excluded).
#define NREDHORIZTOT (HOTKERNSIZEX + 2) 
#define NREDVERTICTOT (HOTKERNSIZEY + 1)
#define NREDS ((NREDHORIZTOT *NREDVERTICTOT ) -1)
// Green count on the red/green rows only.
#define NGRNSSMALL (30-1)
// Green centre, full 12x9 neighbourhood: 6*9 - 1 = 54 - 1 = 53 samples.
#define NGRNHORIZTOT (HOTKERNSIZEX + 2) 
#define NGRNVERTICTOT (2*HOTKERNSIZEY + 1)
#define NGRNS ((NGRNHORIZTOT * NGRNVERTICTOT)-1)
// Blue: 6 * 4 = 24 samples (there is no centre pixel to subtract).
#define NBLUS (((HOTKERNSIZEX + 2) * HOTKERNSIZEY))
/*
 * Hot/dead pixel test for the four samples of the tuplet at pos.
 *
 * Statistics (mean and standard deviation per colour) are gathered over a
 * window of +-HOTKERNSIZEX/4 tuplets and +-HOTKERNSIZEY rows, with the centre
 * tuplet's own samples removed from the sums. A sample is kept when it lies
 * inside [mean - d, max(mean + d, extrapolated value)], where
 * d = max(sdFact * largest standard deviation, trivialDiffThr) and the
 * extrapolated value comes from ExtraPolTest; otherwise it is replaced by
 * the matching Median5* result.
 *
 * isRedRow selects which tuplet positions (x/z or y/w) hold the row's first
 * colour; cluster is forwarded to ExtraPolTest.
 * Returns the four (possibly replaced) samples in image order.
 */
uint4 doHotPix(__read_only image2d_t imgin, int2 pos, const float sdFact, const float trivialDiffThr, 
	const float extraPolFact, bool isRedRow, const bool cluster)
{
	const int halfRows = HOTKERNSIZEY;
	const int halfTuplets = HOTKERNSIZEX / 4;
	// Accumulators in permuted order: .x = colour on the centre pixel's
	// position type, .y = green, .z = colour found only on the other rows.
	float3 chanSum = (float3)(0.f, 0.f, 0.f);
	float3 chanSqSum = (float3)(0.f, 0.f, 0.f);
	for (int dy = -halfRows; dy <= halfRows; ++dy)
	{
		// Rows alternate, starting from the top of the window, between the
		// centre row's layout and the complementary layout.
		const bool centreLayoutRow = (((dy + halfRows) & 1) == 0);
		for (int dx = -halfTuplets; dx <= halfTuplets; ++dx)
		{
			const int2 tapPos = (int2)(pos.x + dx, pos.y + dy);
			const float4 tap = convert_float4(readImageus4(imgin, tapPos));
			const float evenSum = tap.x + tap.z;
			const float oddSum = tap.y + tap.w;
			const float evenSq = (tap.x*tap.x) + (tap.z * tap.z);
			const float oddSq = (tap.y*tap.y) + (tap.w * tap.w);
			// "lead" samples feed the lower accumulator slot of the row,
			// "trail" samples the higher one.
			const float leadSum = isRedRow ? evenSum : oddSum;
			const float trailSum = isRedRow ? oddSum : evenSum;
			const float leadSq = isRedRow ? evenSq : oddSq;
			const float trailSq = isRedRow ? oddSq : evenSq;
			if (centreLayoutRow)
			{
				chanSum.x += leadSum;
				chanSum.y += trailSum;
				chanSqSum.x += leadSq;
				chanSqSum.y += trailSq;
			}
			else
			{
				chanSum.y += leadSum;
				chanSum.z += trailSum;
				chanSqSum.y += leadSq;
				chanSqSum.z += trailSq;
			}
		}
	}

	const ushort4 rgrgCenter = readImageus4(imgin, pos);
	// Centre samples in the same permuted order as the accumulators.
	float4 centrePerm = convert_float4(rgrgCenter);
	if (!isRedRow)
		centrePerm = centrePerm.yxwz;

	// Leave the centre sample out of its own statistics.
	const float4 sampleCount = (float4)((float)NREDS, (float)NGRNS, (float)NREDS, (float)NGRNS);
	const float4 meanRG = (chanSum.xyxy - centrePerm) / sampleCount;
	const float4 ex2RG = (chanSqSum.xyxy - (centrePerm * centrePerm)) / sampleCount;
	const float4 sdRG = sqrt(fmax(ex2RG - (meanRG * meanRG), .0f));
	const float2 greenRedRatio = (meanRG.yw + 1.f) / (meanRG.xz + 1.f);

	const float meanB = chanSum.z / NBLUS;
	const float ex2B = chanSqSum.z / NBLUS;
	const float sdB = sqrt(fmax(ex2B - (meanB * meanB), .0f));
	const float2 greenBlueRatio = (meanRG.yw + 1.f) / (meanB + 1.f);

	// One common tolerance derived from the noisiest colour.
	const float sdMax = fmax(max4(sdRG), sdB);
	const float tolerance = fmax(sdFact * sdMax, trivialDiffThr);
	const float4 upperMean = meanRG + tolerance;
	const float4 lowerMean = meanRG - tolerance;

	// ExtraPolTest answers in image order; bring it into permuted order.
	float4 extraPerm = ExtraPolTest(imgin, pos, isRedRow, greenRedRatio, greenBlueRatio, extraPolFact,cluster);
	if (!isRedRow)
		extraPerm = extraPerm.yxwz;

	ushort4 upper = convert_ushort4_sat(fmax(upperMean, extraPerm));
	ushort4 lower = convert_ushort4_sat(lowerMean);
	// Back to image order so the limits line up with rgrgCenter.
	if (!isRedRow)
	{
		upper = upper.yxwz;
		lower = lower.yxwz;
	}

	uint4 opix;
	if ((rgrgCenter.x <= upper.x) && (rgrgCenter.x >= lower.x))
		opix.x = rgrgCenter.x;
	else
		opix.x = Median5RedLeft(imgin, pos);
	if ((rgrgCenter.y <= upper.y) && (rgrgCenter.y >= lower.y))
		opix.y = rgrgCenter.y;
	else
		opix.y = Median5GrnLeft(imgin, pos);
	if ((rgrgCenter.z <= upper.z) && (rgrgCenter.z >= lower.z))
		opix.z = rgrgCenter.z;
	else
		opix.z = Median5RedRight(imgin, pos);
	if ((rgrgCenter.w <= upper.w) && (rgrgCenter.w >= lower.w))
		opix.w = rgrgCenter.w;
	else
		opix.w = Median5GrnRight(imgin, pos);
	return  opix;
}


/*
 * Cluster variant of the hot/dead pixel test for the tuplet at pos.
 *
 * Means are gathered for all three colours over a window of +-HOTKERNSIZEX/4
 * tuplets and +-HOTKERNSIZEY rows, squared sums only for the two colours of
 * the centre row; the centre tuplet's own samples are removed from the sums.
 * Each sample gets its own tolerance
 * d = max(sdFact * its standard deviation, trivialDiffThr) and is kept when
 * it lies inside [mean - d, max(mean + d, extrapolated value)]; otherwise it
 * is replaced by the matching Median5* result. ExtraPolTest is always called
 * in cluster mode.
 *
 * isRedRow selects which tuplet positions (x/z or y/w) hold the row's first
 * colour. Returns the four (possibly replaced) samples in image order.
 */
uint4 doHotPixCluster(__read_only image2d_t imgin, int2 pos, const float sdFact, const float trivialDiffThr, const float extraPolFact, bool isRedRow)
{
	const int halfRows = HOTKERNSIZEY;
	const int halfTuplets = HOTKERNSIZEX / 4;
	// Permuted order: .x = colour on the centre pixel's position type,
	// .y = green, .z (sums only) = colour found only on the other rows.
	float3 chanSum = (float3)(0.f, 0.f, 0.f);
	float2 chanSqSum = (float2)(0.f, 0.f);
	for (int dy = -halfRows; dy <= halfRows; ++dy)
	{
		// Rows alternate, starting from the top of the window, between the
		// centre row's layout and the complementary layout.
		const bool centreLayoutRow = (((dy + halfRows) & 1) == 0);
		for (int dx = -halfTuplets; dx <= halfTuplets; ++dx)
		{
			const int2 tapPos = (int2)(pos.x + dx, pos.y + dy);
			const float4 tap = convert_float4(readImageus4(imgin, tapPos));
			const float evenSum = tap.x + tap.z;
			const float oddSum = tap.y + tap.w;
			const float evenSq = (tap.x*tap.x) + (tap.z * tap.z);
			const float oddSq = (tap.y*tap.y) + (tap.w * tap.w);
			const float leadSum = isRedRow ? evenSum : oddSum;
			const float trailSum = isRedRow ? oddSum : evenSum;
			const float leadSq = isRedRow ? evenSq : oddSq;
			const float trailSq = isRedRow ? oddSq : evenSq;
			if (centreLayoutRow)
			{
				chanSum.x += leadSum;
				chanSum.y += trailSum;
				chanSqSum += (float2)(leadSq, trailSq);
			}
			else
			{
				// No squared sum is kept for the third colour.
				chanSum.y += leadSum;
				chanSum.z += trailSum;
				chanSqSum.y += leadSq;
			}
		}
	}

	const ushort4 rgrgCenter = readImageus4(imgin, pos);
	// Centre samples in the same permuted order as the accumulators.
	float4 centrePerm = convert_float4(rgrgCenter);
	if (!isRedRow)
		centrePerm = centrePerm.yxwz;

	// Leave the centre sample out of its own statistics.
	const float4 sampleCount = (float4)((float)NREDS, (float)NGRNS, (float)NREDS, (float)NGRNS);
	const float4 meanRG = (chanSum.xyxy - centrePerm) / sampleCount;
	const float4 ex2RG = (chanSqSum.xyxy - (centrePerm * centrePerm)) / sampleCount;
	const float4 sdRG = sqrt(fmax(ex2RG - (meanRG * meanRG), .0f));
	const float2 greenRedRatio = (meanRG.yw + 1.f) / (meanRG.xz + 1.f);

	const float meanB = chanSum.z / NBLUS;
	const float2 greenBlueRatio = (meanRG.yw + 1.f) / (meanB + 1.f);

	// Per-sample tolerance.
	const float4 tolerance = fmax(sdFact * sdRG, trivialDiffThr);
	const float4 upperMean = meanRG + tolerance;
	const float4 lowerMean = meanRG - tolerance;

	// ExtraPolTest answers in image order; bring it into permuted order.
	float4 extraPerm = ExtraPolTest(imgin, pos, isRedRow, greenRedRatio, greenBlueRatio, extraPolFact,true);
	if (!isRedRow)
		extraPerm = extraPerm.yxwz;

	ushort4 upper = convert_ushort4_sat(fmax(upperMean, extraPerm));
	ushort4 lower = convert_ushort4_sat(lowerMean);
	// Back to image order so the limits line up with rgrgCenter.
	if (!isRedRow)
	{
		upper = upper.yxwz;
		lower = lower.yxwz;
	}

	uint4 opix;
	if ((rgrgCenter.x <= upper.x) && (rgrgCenter.x >= lower.x))
		opix.x = rgrgCenter.x;
	else
		opix.x = Median5RedLeft(imgin, pos);
	if ((rgrgCenter.y <= upper.y) && (rgrgCenter.y >= lower.y))
		opix.y = rgrgCenter.y;
	else
		opix.y = Median5GrnLeft(imgin, pos);
	if ((rgrgCenter.z <= upper.z) && (rgrgCenter.z >= lower.z))
		opix.z = rgrgCenter.z;
	else
		opix.z = Median5RedRight(imgin, pos);
	if ((rgrgCenter.w <= upper.w) && (rgrgCenter.w >= lower.w))
		opix.w = rgrgCenter.w;
	else
		opix.w = Median5GrnRight(imgin, pos);
	return  opix;
}


/*
 * Kernel entry point: hot-pixel removal (doHotPix, non-cluster mode) with a
 * pass-through border.
 *
 * The output image is divided evenly between the work-items of the 2D global
 * range. Tuplets with 0 < x < width-1 and 3 < y < height-4 are filtered;
 * everything else is copied from imgin unchanged.
 */
__kernel void HotPixCmosOfs(__read_only image2d_t imgin,
	const float sdFact, const float  trivialDiffThr, const float extraPolFact,
	__write_only image2d_t imgout)
{
	// Tile geometry: fractional tile size in float, truncated to pixel indices.
	const int2 outSize = (int2)(get_image_width(imgout), get_image_height(imgout));
	const int2 gridSize = (int2)((int)get_global_size(0), (int)get_global_size(1));
	const float2 tile = convert_float2(outSize) / convert_float2(gridSize);
	const float2 tileOrigin = (float2)((float)get_global_id(0), (float)get_global_id(1)) * tile;
	const float2 tileLimit = tileOrigin + tile;
	const int2 first = convert_int2(tileOrigin);
	const int2 stop = convert_int2(tileLimit);
	// Exclusive bounds of the filtered region.
	const int2 innerLo = (int2)(0, 3);
	const int2 innerHi = (int2)(outSize.x - 1, outSize.y - 4);

	for (int y = first.y; y < stop.y; ++y)
	{
		// Even rows are flagged as red rows; on odd rows the leftmost tuplet
		// is taken to start at green.
		const bool rowIsRed = ((y & 1) == 0);
		const bool rowInside = (y > innerLo.y) && (y < innerHi.y);
		for (int x = first.x; x < stop.x; ++x)
		{
			const int2 here = (int2)(x, y);
			const bool columnInside = (x > innerLo.x) && (x < innerHi.x);
			uint4 result;
			if (columnInside && rowInside)
				result = doHotPix(imgin, here, sdFact, trivialDiffThr, extraPolFact, rowIsRed,false);
			else
				result = read_imageui(imgin, bnrSampler, here);
			write_imageui(imgout, here, result);
		}
	}
}


/*
 * Kernel entry point: cluster hot-pixel removal (doHotPixCluster) with a
 * pass-through border.
 *
 * The output image is divided evenly between the work-items of the 2D global
 * range. Tuplets with 0 < x < width-1 and 3 < y < height-4 are filtered;
 * everything else is copied from imgin unchanged.
 */
__kernel void HotPixCmosClusterOfs(__read_only image2d_t imgin,
	const float sdFact, const float  trivialDiffThr, const float extraPolFact,
	__write_only image2d_t imgout)
{
	// Tile geometry: fractional tile size in float, truncated to pixel indices.
	const int2 outSize = (int2)(get_image_width(imgout), get_image_height(imgout));
	const int2 gridSize = (int2)((int)get_global_size(0), (int)get_global_size(1));
	const float2 tile = convert_float2(outSize) / convert_float2(gridSize);
	const float2 tileOrigin = (float2)((float)get_global_id(0), (float)get_global_id(1)) * tile;
	const float2 tileLimit = tileOrigin + tile;
	const int2 first = convert_int2(tileOrigin);
	const int2 stop = convert_int2(tileLimit);
	// Exclusive bounds of the filtered region.
	const int2 innerLo = (int2)(0, 3);
	const int2 innerHi = (int2)(outSize.x - 1, outSize.y - 4);

	for (int y = first.y; y < stop.y; ++y)
	{
		// Even rows are flagged as red rows; on odd rows the leftmost tuplet
		// is taken to start at green.
		const bool rowIsRed = ((y & 1) == 0);
		const bool rowInside = (y > innerLo.y) && (y < innerHi.y);
		for (int x = first.x; x < stop.x; ++x)
		{
			const int2 here = (int2)(x, y);
			const bool columnInside = (x > innerLo.x) && (x < innerHi.x);
			uint4 result;
			if (columnInside && rowInside)
				result = doHotPixCluster(imgin, here, sdFact, trivialDiffThr, extraPolFact, rowIsRed);
			else
				result = read_imageui(imgin, bnrSampler, here);
			write_imageui(imgout, here, result);
		}
	}
}


/*
 * Kernel entry point: hot-pixel removal (doHotPix, non-cluster mode) on every
 * tuplet of the output image, without a pass-through border.
 *
 * The output image is divided evenly between the work-items of the 2D global
 * range; each work-item filters its own tile.
 */
__kernel void HotPixCmos(__read_only image2d_t imgin,
	const float sdFact, const float  trivialDiffThr, const float extraPolFact,
	__write_only image2d_t imgout)
{
	// Tile geometry: fractional tile size in float, truncated to pixel indices.
	const int2 outSize = (int2)(get_image_width(imgout), get_image_height(imgout));
	const int2 gridSize = (int2)((int)get_global_size(0), (int)get_global_size(1));
	const float2 tile = convert_float2(outSize) / convert_float2(gridSize);
	const float2 tileOrigin = (float2)((float)get_global_id(0), (float)get_global_id(1)) * tile;
	const float2 tileLimit = tileOrigin + tile;
	const int2 first = convert_int2(tileOrigin);
	const int2 stop = convert_int2(tileLimit);

	for (int y = first.y; y < stop.y; ++y)
	{
		// Even rows are flagged as red rows; on odd rows the leftmost tuplet
		// is taken to start at green.
		const bool rowIsRed = ((y & 1) == 0);
		for (int x = first.x; x < stop.x; ++x)
		{
			const int2 here = (int2)(x, y);
			const uint4 result = doHotPix(imgin, here, sdFact, trivialDiffThr, extraPolFact, rowIsRed,false);
			write_imageui(imgout, here, result);
		}
	}
}


/*
 * Kernel entry point: cluster hot-pixel removal (doHotPixCluster) on every
 * tuplet of the output image, without a pass-through border.
 *
 * The output image is divided evenly between the work-items of the 2D global
 * range; each work-item filters its own tile.
 */
__kernel void HotPixCmosCluster(__read_only image2d_t imgin,
	const float sdFact, const float  trivialDiffThr, const float extraPolFact,
	__write_only image2d_t imgout)
{
	// Tile geometry: fractional tile size in float, truncated to pixel indices.
	const int2 outSize = (int2)(get_image_width(imgout), get_image_height(imgout));
	const int2 gridSize = (int2)((int)get_global_size(0), (int)get_global_size(1));
	const float2 tile = convert_float2(outSize) / convert_float2(gridSize);
	const float2 tileOrigin = (float2)((float)get_global_id(0), (float)get_global_id(1)) * tile;
	const float2 tileLimit = tileOrigin + tile;
	const int2 first = convert_int2(tileOrigin);
	const int2 stop = convert_int2(tileLimit);

	for (int y = first.y; y < stop.y; ++y)
	{
		// Even rows are flagged as red rows; on odd rows the leftmost tuplet
		// is taken to start at green.
		const bool rowIsRed = ((y & 1) == 0);
		for (int x = first.x; x < stop.x; ++x)
		{
			const int2 here = (int2)(x, y);
			const uint4 result = doHotPixCluster(imgin, here, sdFact, trivialDiffThr, extraPolFact, rowIsRed);
			write_imageui(imgout, here, result);
		}
	}
}

/*
 * Reads the first channel of the texel at pos and returns it as ushort.
 * Uses the sampler overload of read_imageui (fine for OpenCL 1.1); the
 * sampler-less overload would require OpenCL 1.2.
 */
ushort readImageus(__read_only image2d_t imgin, int2 pos)
{
	const uint4 texel = read_imageui(imgin, bnrSampler, pos);
	return convert_ushort(texel.x);
}


/*
 * Structure-aware averaging of one single-channel pixel.
 *
 * For the centre pixel and for every candidate in a 17x17 window (+-8 in x
 * and y) a small descriptor is built from the 3x3 neighbourhood: the averages
 * of the horizontal and vertical neighbour pairs, the sums of the two
 * diagonal pairs, and a luma value (mean of the four centre/pair means).
 * A candidate contributes to the average when its luma differs from the
 * centre luma by at most lumaThr = sqrt(centre luma) * lumaSqtThr and each of
 * its four pair values differs by at most lumaThr * pt2Thr.
 *
 * Returns the average of the accepted candidates (or the centre value when
 * no more than one candidate was accepted), minus ofs.
 *
 * All arithmetic is kept in single precision: the literal .25f avoids
 * promoting the luma computation to double where fp64 is available.
 */
int doGrayNR8(__read_only image2d_t imgin, int2 pos, const int ofs,
	const float lumaSqtThr, const float pt2Thr)
{
	int2 pLeft =  { pos.x - 1, pos.y };
	int2 pRight = { pos.x + 1, pos.y };
	int2 pTop =	  { pos.x,	   pos.y - 1 };
	int2 pBot =	  { pos.x,	   pos.y + 1 };
	int2 pLT =	  { pos.x - 1, pos.y - 1 };
	int2 pRT =	  { pos.x + 1, pos.y - 1 };
	int2 pLBot =  { pos.x - 1, pos.y + 1 };
	int2 pRBot =  { pos.x + 1, pos.y + 1 };
	ushort center= readImageus(imgin, pos);
	ushort left	 = readImageus(imgin, pLeft);
	ushort right = readImageus(imgin, pRight);
	ushort top	 = readImageus(imgin, pTop);
	ushort bot	 = readImageus(imgin, pBot);
	ushort lTop	 = readImageus(imgin, pLT);
	ushort rTop	 = readImageus(imgin, pRT);
	ushort lBot	 = readImageus(imgin, pLBot);
	ushort rBot	 = readImageus(imgin, pRBot);

	// Descriptor of the centre pixel ("N" = top-left/bottom-right diagonal,
	// "Z" = top-right/bottom-left diagonal).
	float horizsumC = left + right;
	float horizvalC = horizsumC/2;
	float verticsumC = top + bot;
	float verticvalC = verticsumC/2;
	float diagNsumC = lTop + rBot;
	float diagNValC = diagNsumC/2;
	float diagZsumC = rTop + lBot;
	float diagZValC = diagZsumC/2;
	float meanHC = (center + horizvalC)  / 2;
	float meanVC = (center + verticvalC) / 2;
	float meanNC = (center + diagNValC)  / 2;
	float meanZC = (center + diagZValC)  / 2;
	float lumC   = (meanHC + meanVC + meanNC + meanZC)*.25f;
	// The tolerance grows with the square root of the local brightness.
	float lumaThr= sqrt(lumC) * lumaSqtThr;
	float pattern2Thr = lumaThr * pt2Thr;
	const int skipY = 1;
	const int skipX = 1;
	const int cKernSizeY = 8;
	const int cKernSizeX = 8;
	float sum  = 0;
	int count= 0;
	for (int j = -cKernSizeY; j <= cKernSizeY; j += skipY)
	{
		for (int i = -cKernSizeX; i <= cKernSizeX; i += skipX)
		{
			int2 spos =		{ pos.x + i,  pos.y + j };
			int2 pSLeft =	{ spos.x - 1, spos.y };
			int2 pSRight =	{ spos.x + 1, spos.y };
			int2 pSTop =	{ spos.x,	  spos.y - 1 };
			int2 pSBot =	{ spos.x,	  spos.y + 1 };
			int2 pSLT =		{ spos.x - 1, spos.y - 1 };
			int2 pSRT =		{ spos.x + 1, spos.y - 1 };
			int2 pSLBot =	{ spos.x - 1, spos.y + 1 };
			int2 pSRBot =	{ spos.x + 1, spos.y + 1 };

			ushort search=	readImageus(imgin, spos);
			ushort sLeft =	readImageus(imgin, pSLeft);
			ushort sRight = readImageus(imgin, pSRight);
			ushort sTop =	readImageus(imgin, pSTop);
			ushort sBot =	readImageus(imgin, pSBot);
			ushort sLT =	readImageus(imgin, pSLT);
			ushort sRT =	readImageus(imgin, pSRT);
			ushort sLBot =	readImageus(imgin, pSLBot);
			ushort sRBot =	readImageus(imgin, pSRBot);
			// Same descriptor for the candidate pixel.
			float horizsumS = sLeft + sRight;
			float horizvalS = horizsumS / 2;
			float verticsumS= sTop + sBot;
			float verticvalS= verticsumS / 2;
			float diagNsumS = sLT + sRBot;
			float diagNValS = diagNsumS / 2;
			float diagZsumS = sRT + sLBot;
			float diagZValS = diagZsumS / 2;
			float meanHS = (search + horizvalS) / 2;
			float meanVS = (search + verticvalS)/ 2;
			float meanNS = (search + diagNValS) / 2;
			float meanZS = (search + diagZValS) / 2;
			float lumS = (meanHS + meanVS + meanNS + meanZS)*.25f;
			// NOTE(review): horizontal/vertical tests compare pair averages,
			// the diagonal tests compare pair sums against the same
			// threshold - confirm the tighter diagonal tolerance is intended.
			if ((fabs(lumS - lumC)			 <= lumaThr) &&
				(fabs(horizvalS - horizvalC) <= pattern2Thr) &&
				(fabs(verticvalS- verticvalC)<= pattern2Thr) &&
				(fabs(diagNsumS - diagNsumC) <= pattern2Thr) &&
				(fabs(diagZsumS - diagZsumC) <= pattern2Thr))
			{
				sum += search;
				++count;
			}
		}
	}
	// The centre pixel always matches itself, so count > 1 means at least one
	// other candidate was accepted.
	int opix;
	if (count > 1)
		opix = (int)(sum / count);
	else
		opix = center;
	return opix - ofs;
}


/*
 * Single pixel version - 344ms for 1/4 SonyIMX211
 *
 * Kernel entry point that runs doGrayNR8 over the output image. The output
 * image is divided evenly between the work-items of the 2D global range. For
 * output pixel (x, y) the input is sampled at
 * (x + kernSize + 1, y + kernSize + 1); the result is saturated to unsigned
 * and stored at (x, y). ofs, lumaSqtThr and pt2Thr are forwarded unchanged.
 */
__kernel void GrayNR8(__read_only image2d_t imgin,
	const int ofs,
	const int kernSize,
	const float lumaSqtThr,
	const float pt2Thr,
	__write_only image2d_t imgout)
{
	// Tile geometry: fractional tile size in float, truncated to pixel indices.
	const int2 outSize = (int2)(get_image_width(imgout), get_image_height(imgout));
	const int2 gridSize = (int2)((int)get_global_size(0), (int)get_global_size(1));
	const float2 tile = convert_float2(outSize) / convert_float2(gridSize);
	const float2 tileOrigin = (float2)((float)get_global_id(0), (float)get_global_id(1)) * tile;
	const float2 tileLimit = tileOrigin + tile;
	const int2 first = convert_int2(tileOrigin);
	const int2 stop = convert_int2(tileLimit);
	// Constant displacement from an output coordinate to its input coordinate.
	const int srcShift = kernSize + 1;

	for (int y = first.y; y < stop.y; ++y)
	{
		for (int x = first.x; x < stop.x; ++x)
		{
			const int2 src = (int2)(x + srcShift, y + srcShift);
			const int filtered = doGrayNR8(imgin, src, ofs, lumaSqtThr, pt2Thr);
			const uint stored = convert_uint_sat(filtered);
			const int2 dst = (int2)(x, y);
			write_imageui(imgout, dst, stored);
		}
	}
}


int4 doGray4NR(__read_only image2d_t imgin, int2 pos, const int ofs,
				const float lumaSqtThr, const float pt2Thr)
{
	int2 pLeft =  { pos.x - 1, pos.y };
	int2 pRight = { pos.x + 1, pos.y };
	int2 pTop =	  { pos.x,	   pos.y - 1 };
	int2 pBot =	  { pos.x,	   pos.y + 1 };
	int2 pLT =	  { pos.x - 1, pos.y - 1 };
	int2 pRT =	  { pos.x + 1, pos.y - 1 };
	int2 pLBot =  { pos.x - 1, pos.y + 1 };
	int2 pRBot =  { pos.x + 1, pos.y + 1 };
	ushort4 center= readImageus4(imgin, pos);
	ushort4 left  = readImageus4(imgin, pLeft);
	ushort4 right = readImageus4(imgin, pRight);
	ushort4 top	 = readImageus4(imgin, pTop);
	ushort4 bot	 = readImageus4(imgin, pBot);
	ushort4 lTop = readImageus4(imgin, pLT);
	ushort4 rTop = readImageus4(imgin, pRT);
	ushort4 lBot = readImageus4(imgin, pLBot);
	ushort4 rBot = readImageus4(imgin, pRBot);

	float4 horizsumC = {left.w + center.y, center.x + center.z, center.y + center.w, center.z + right.x};
	float4 horizvalC = horizsumC/2;
	float4 verticsumC= convert_float4(top) + convert_float4(bot);
	float4 verticvalC= verticsumC/2;
	float4 diagNsumC = {lTop.w + bot.y,	   top.x + bot.z,		top.y + bot.w,		top.z + rBot.x};
	float4 diagNValC = diagNsumC/2;
	float4 diagZsumC = {rBot.w + top.y,	   bot.x + top.z,		bot.y + top.w,		bot.z + lTop.x};
	float4 diagZValC = diagZsumC/2;
	float4 centerF = convert_float4(center);
	float4 meanHC = (centerF + horizvalC)  / 2;
	float4 meanVC = (centerF + verticvalC) / 2;
	float4 meanNC = (centerF + diagNValC)  / 2;
	float4 meanZC = (centerF + diagZValC)  / 2;
	float4 lumC   = (meanHC + meanVC + meanNC + meanZC)/4;
	float4 lumaThr= sqrt(lumC) * lumaSqtThr;
	float4 pattern2Thr = lumaThr * pt2Thr;
	const int skipY = 1;
	const int skipX = 1;
	const int cKernSizeY = 8;
	const int cKernSizeX = 8/4;
	float4 sum = {0,0,0,0};
	int4 count = {0,0,0,0};
	for (int j = -cKernSizeY; j <= cKernSizeY; j += skipY)
	{
		for (int i = -cKernSizeX; i <= cKernSizeX; i += skipX)
		{
			int2 spos =		{ pos.x + i,  pos.y + j };
			int2 pSLeft =	{ spos.x - 1, spos.y };
			int2 pSRight =	{ spos.x + 1, spos.y };
			int2 pSTop =	{ spos.x,	  spos.y - 1 };
			int2 pSBot =	{ spos.x,	  spos.y + 1 };
			int2 pSLT =		{ spos.x - 1, spos.y - 1 };
			int2 pSRT =		{ spos.x + 1, spos.y - 1 };
			int2 pSLBot =	{ spos.x - 1, spos.y + 1 };
			int2 pSRBot =	{ spos.x + 1, spos.y + 1 }; 

			ushort4 search=	readImageus4(imgin, spos);
			ushort4 sLeft =	readImageus4(imgin, pSLeft);
			ushort4 sRight = readImageus4(imgin, pSRight);
			ushort4 sTop =	readImageus4(imgin, pSTop);
			ushort4 sBot =	readImageus4(imgin, pSBot);
			ushort4 sLT =	readImageus4(imgin, pSLT);
			ushort4 sRT =	readImageus4(imgin, pSRT);
			ushort4 sLBot =	readImageus4(imgin, pSLBot);
			ushort4 sRBot =	readImageus4(imgin, pSRBot);
			float4 horizsumS = {sLeft.w + search.y,	search.x + search.z,search.y + search.w, search.z + sRight.x};
			float4 horizvalS = horizsumS / 2;
			float4 verticsumS= convert_float4(sTop) + convert_float4(sBot);
			float4 verticvalS= verticsumS / 2;
			float4 diagNsumS = {sLT.w + sBot.y,		sTop.x + sBot.z,	sTop.y + sBot.w,	sTop.z + sRBot.x};
			float4 diagNValS = diagNsumS / 2;
			float4 diagZsumS = {sRBot.w + sTop.y,	sBot.x + sTop.z,	sBot.y + sTop.w,	sBot.z + sLT.x};
			float4 diagZValS = diagZsumS / 2;
			float4 searchF = convert_float4(search);
			float4 meanHS = (searchF + horizvalS) / 2;
			float4 meanVS = (searchF + verticvalS)/ 2;
			float4 meanNS = (searchF + diagNValS) / 2;
			float4 meanZS = (searchF + diagZValS) / 2;
			float4 lumS = (meanHS + meanVS + meanNS + meanZS)/4;
			if ((fabs(lumS.x - lumC.x)			 <= lumaThr.x) &&
				(fabs(horizvalS.x - horizvalC.x) <= pattern2Thr.x) &&
				(fabs(verticvalS.x- verticvalC.x)<= pattern2Thr.x) &&
				(fabs(diagNsumS.x - diagNsumC.x) <= pattern2Thr.x) &&
				(fabs(diagZsumS.x - diagZsumC.x) <= pattern2Thr.x))
			{
				sum.x += search.x;
				++count.x;
			}
			if ((fabs(lumS.y - lumC.x)			 <= lumaThr.x) &&
				(fabs(horizvalS.y - horizvalC.x) <= pattern2Thr.x) &&
				(fabs(verticvalS.y- verticvalC.x)<= pattern2Thr.x) &&
				(fabs(diagNsumS.y - diagNsumC.x) <= pattern2Thr.x) &&
				(fabs(diagZsumS.y - diagZsumC.x) <= pattern2Thr.x))
			{
				sum.x += search.y;
				++count.x;
			}
			if ((fabs(lumS.z - lumC.x)			 <= lumaThr.x) &&
				(fabs(horizvalS.z - horizvalC.x) <= pattern2Thr.x) &&
				(fabs(verticvalS.z- verticvalC.x)<= pattern2Thr.x) &&
				(fabs(diagNsumS.z - diagNsumC.x) <= pattern2Thr.x) &&
				(fabs(diagZsumS.z - diagZsumC.x) <= pattern2Thr.x))
			{
				sum.x += search.z;
				++count.x;
			}
			if ((fabs(lumS.w - lumC.x)			 <= lumaThr.x) &&
				(fabs(horizvalS.w - horizvalC.x) <= pattern2Thr.x) &&
				(fabs(verticvalS.w- verticvalC.x)<= pattern2Thr.x) &&
				(fabs(diagNsumS.w - diagNsumC.x) <= pattern2Thr.x) &&
				(fabs(diagZsumS.w - diagZsumC.x) <= pattern2Thr.x))
			{
				sum.x += search.w;
				++count.x;
			}


			if ((fabs(lumS.x - lumC.y)			 <= lumaThr.y) &&
				(fabs(horizvalS.x - horizvalC.y) <= pattern2Thr.y) &&
				(fabs(verticvalS.x- verticvalC.y)<= pattern2Thr.y) &&
				(fabs(diagNsumS.x - diagNsumC.y) <= pattern2Thr.y) &&
				(fabs(diagZsumS.x - diagZsumC.y) <= pattern2Thr.y))
			{
				sum.y += search.x;
				++count.y;
			}
			if ((fabs(lumS.y - lumC.y)			 <= lumaThr.y) &&
				(fabs(horizvalS.y - horizvalC.y) <= pattern2Thr.y) &&
				(fabs(verticvalS.y- verticvalC.y)<= pattern2Thr.y) &&
				(fabs(diagNsumS.y - diagNsumC.y) <= pattern2Thr.y) &&
				(fabs(diagZsumS.y - diagZsumC.y) <= pattern2Thr.y))
			{
				sum.y += search.y;
				++count.y;
			}
			if ((fabs(lumS.z - lumC.y)			 <= lumaThr.y) &&
				(fabs(horizvalS.z - horizvalC.y) <= pattern2Thr.y) &&
				(fabs(verticvalS.z- verticvalC.y)<= pattern2Thr.y) &&
				(fabs(diagNsumS.z - diagNsumC.y) <= pattern2Thr.y) &&
				(fabs(diagZsumS.z - diagZsumC.y) <= pattern2Thr.y))
			{
				sum.y += search.z;
				++count.y;
			}
			if ((fabs(lumS.w - lumC.y)			 <= lumaThr.y) &&
				(fabs(horizvalS.w - horizvalC.y) <= pattern2Thr.y) &&
				(fabs(verticvalS.w- verticvalC.y)<= pattern2Thr.y) &&
				(fabs(diagNsumS.w - diagNsumC.y) <= pattern2Thr.y) &&
				(fabs(diagZsumS.w - diagZsumC.y) <= pattern2Thr.y))
			{
				sum.y += search.w;
				++count.y;
			}


			if ((fabs(lumS.x - lumC.z)			 <= lumaThr.z) &&
				(fabs(horizvalS.x - horizvalC.z) <= pattern2Thr.z) &&
				(fabs(verticvalS.x- verticvalC.z)<= pattern2Thr.z) &&
				(fabs(diagNsumS.x - diagNsumC.z) <= pattern2Thr.z) &&
				(fabs(diagZsumS.x - diagZsumC.z) <= pattern2Thr.z))
			{
				sum.z += search.x;
				++count.z;
			}
			if ((fabs(lumS.y - lumC.z)			 <= lumaThr.z) &&
				(fabs(horizvalS.y - horizvalC.z) <= pattern2Thr.z) &&
				(fabs(verticvalS.y- verticvalC.z)<= pattern2Thr.z) &&
				(fabs(diagNsumS.y - diagNsumC.z) <= pattern2Thr.z) &&
				(fabs(diagZsumS.y - diagZsumC.z) <= pattern2Thr.z))
			{
				sum.z += search.y;
				++count.z;
			}
			if ((fabs(lumS.z - lumC.z)			 <= lumaThr.z) &&
				(fabs(horizvalS.z - horizvalC.z) <= pattern2Thr.z) &&
				(fabs(verticvalS.z- verticvalC.z)<= pattern2Thr.z) &&
				(fabs(diagNsumS.z - diagNsumC.z) <= pattern2Thr.z) &&
				(fabs(diagZsumS.z - diagZsumC.z) <= pattern2Thr.z))
			{
				sum.z += search.z;
				++count.z;
			}
			if ((fabs(lumS.w - lumC.z)			 <= lumaThr.z) &&
				(fabs(horizvalS.w - horizvalC.z) <= pattern2Thr.z) &&
				(fabs(verticvalS.w- verticvalC.z)<= pattern2Thr.z) &&
				(fabs(diagNsumS.w - diagNsumC.z) <= pattern2Thr.z) &&
				(fabs(diagZsumS.w - diagZsumC.z) <= pattern2Thr.z))
			{
				sum.z += search.w;
				++count.z;
			}


			if ((fabs(lumS.x - lumC.w)			 <= lumaThr.w) &&
				(fabs(horizvalS.x - horizvalC.w) <= pattern2Thr.w) &&
				(fabs(verticvalS.x- verticvalC.w)<= pattern2Thr.w) &&
				(fabs(diagNsumS.x - diagNsumC.w) <= pattern2Thr.w) &&
				(fabs(diagZsumS.x - diagZsumC.w) <= pattern2Thr.w))
			{
				sum.w += search.x;
				++count.w;
			}
			if ((fabs(lumS.y - lumC.w)			 <= lumaThr.w) &&
				(fabs(horizvalS.y - horizvalC.w) <= pattern2Thr.w) &&
				(fabs(verticvalS.y- verticvalC.w)<= pattern2Thr.w) &&
				(fabs(diagNsumS.y - diagNsumC.w) <= pattern2Thr.w) &&
				(fabs(diagZsumS.y - diagZsumC.w) <= pattern2Thr.w))
			{
				sum.w += search.y;
				++count.w;
			}
			if ((fabs(lumS.z - lumC.w)			 <= lumaThr.w) &&
				(fabs(horizvalS.z - horizvalC.w) <= pattern2Thr.w) &&
				(fabs(verticvalS.z- verticvalC.w)<= pattern2Thr.w) &&
				(fabs(diagNsumS.z - diagNsumC.w) <= pattern2Thr.w) &&
				(fabs(diagZsumS.z - diagZsumC.w) <= pattern2Thr.w))
			{
				sum.w += search.z;
				++count.w;
			}
			if ((fabs(lumS.w - lumC.w)			 <= lumaThr.w) &&
				(fabs(horizvalS.w - horizvalC.w) <= pattern2Thr.w) &&
				(fabs(verticvalS.w- verticvalC.w)<= pattern2Thr.w) &&
				(fabs(diagNsumS.w - diagNsumC.w) <= pattern2Thr.w) &&
				(fabs(diagZsumS.w - diagZsumC.w) <= pattern2Thr.w))
			{
				sum.w += search.w;
				++count.w;
			}
		}
	}
	float4 opix = sum / convert_float4(count);
	return convert_int4(opix) - (int4)ofs;
}



/*
 * Pattern-matching averaging filter for one texel of a packed grayscale image,
 * "cross" variant: each output lane is only compared against every second
 * lane of the search texels (x/z for lanes x and z, y/w for lanes y and w).
 *
 * The four lanes of a texel are treated as four horizontally consecutive
 * pixels: the horizontal neighbours of lane x are left.w and center.y, the
 * neighbours of lane w are center.z and right.x.
 *
 * For every lane of the centre texel the function derives
 *   - the sum / mean of its horizontal, vertical and both diagonal neighbours,
 *   - a luma estimate lumC (mean of the four centre/direction means),
 *   - a luma tolerance   lumaThr     = sqrt(lumC) * lumaSqtThr,
 *   - a pattern tolerance pattern2Thr = lumaThr * pt2Thr.
 * It then scans a fixed window of +/-8 rows and +/-2 texels around pos and
 * averages all search pixels whose luma and directional patterns lie within
 * those tolerances.
 *
 * imgin       image to read; reads go through readImageus4 (bnrSampler clamps
 *             coordinates to the image edge).
 * pos         texel coordinate of the centre.
 * ofs         value subtracted from every lane of the result.
 * lumaSqtThr  scale of the sqrt(luma) based luma tolerance.
 * pt2Thr      scale of the pattern tolerance relative to the luma tolerance.
 *
 * Returns the per-lane mean of the accepted pixels, converted to int, minus ofs.
 *
 * With non-negative tolerances count is never zero: at i == 0, j == 0 the
 * search texel is the centre texel and each lane matches itself.
 *
 * NOTE(review): horizontal/vertical patterns are compared as halved values
 * while the diagonals are compared as raw sums against the same pattern2Thr,
 * which makes the diagonal tests twice as strict. Kept as in the original;
 * confirm whether that is intended.
 */

/* Adds lane `src` of the search texel to output lane `dst` when the luma and
 * all four directional patterns of that search pixel match centre lane `dst`. */
#define GRAY4_CROSS_NR_ACCUM(src, dst) \
	do { \
		if ((fabs(lumS.src - lumC.dst)             <= lumaThr.dst) && \
			(fabs(horizvalS.src - horizvalC.dst)   <= pattern2Thr.dst) && \
			(fabs(verticvalS.src - verticvalC.dst) <= pattern2Thr.dst) && \
			(fabs(diagNsumS.src - diagNsumC.dst)   <= pattern2Thr.dst) && \
			(fabs(diagZsumS.src - diagZsumC.dst)   <= pattern2Thr.dst)) \
		{ \
			sum.dst += search.src; \
			++count.dst; \
		} \
	} while (0)

int4 doGray4CrossNR(__read_only image2d_t imgin, int2 pos, const int ofs,
				const float lumaSqtThr, const float pt2Thr)
{
	// 3x3 texel neighbourhood of the centre.
	int2 pLeft =  { pos.x - 1, pos.y };
	int2 pRight = { pos.x + 1, pos.y };
	int2 pTop =	  { pos.x,	   pos.y - 1 };
	int2 pBot =	  { pos.x,	   pos.y + 1 };
	int2 pLT =	  { pos.x - 1, pos.y - 1 };
	int2 pRT =	  { pos.x + 1, pos.y - 1 };
	int2 pLBot =  { pos.x - 1, pos.y + 1 };
	int2 pRBot =  { pos.x + 1, pos.y + 1 };
	ushort4 center= readImageus4(imgin, pos);
	ushort4 left  = readImageus4(imgin, pLeft);
	ushort4 right = readImageus4(imgin, pRight);
	ushort4 top	 = readImageus4(imgin, pTop);
	ushort4 bot	 = readImageus4(imgin, pBot);
	ushort4 lTop = readImageus4(imgin, pLT);
	ushort4 rTop = readImageus4(imgin, pRT);
	ushort4 lBot = readImageus4(imgin, pLBot);
	ushort4 rBot = readImageus4(imgin, pRBot);

	// Per-lane neighbour sums of the centre texel.
	float4 horizsumC = {left.w + center.y, center.x + center.z, center.y + center.w, center.z + right.x};
	float4 horizvalC = horizsumC/2;
	float4 verticsumC= convert_float4(top) + convert_float4(bot);
	float4 verticvalC= verticsumC/2;
	// "N" diagonal (\): top-left + bottom-right pixel.
	float4 diagNsumC = {lTop.w + bot.y,	   top.x + bot.z,		top.y + bot.w,		top.z + rBot.x};
	float4 diagNValC = diagNsumC/2;
	// "Z" diagonal (/): bottom-left + top-right pixel. For lane x the
	// bottom-left pixel lives in the lower-left texel (lBot.w), for lane w the
	// top-right pixel lives in the upper-right texel (rTop.x).
	float4 diagZsumC = {lBot.w + top.y,	   bot.x + top.z,		bot.y + top.w,		bot.z + rTop.x};
	float4 diagZValC = diagZsumC/2;
	float4 centerF = convert_float4(center);
	float4 meanHC = (centerF + horizvalC)  / 2;
	float4 meanVC = (centerF + verticvalC) / 2;
	float4 meanNC = (centerF + diagNValC)  / 2;
	float4 meanZC = (centerF + diagZValC)  / 2;
	float4 lumC   = (meanHC + meanVC + meanNC + meanZC)/4;
	float4 lumaThr= sqrt(lumC) * lumaSqtThr;
	float4 pattern2Thr = lumaThr * pt2Thr;

	// Fixed search window: +/-8 rows, +/-2 texels (= +/-8 pixels) horizontally.
	const int skipY = 1;
	const int skipX = 1;
	const int cKernSizeY = 8;
	const int cKernSizeX = 8/4;
	float4 sum = {0,0,0,0};
	int4 count = {0,0,0,0};
	for (int j = -cKernSizeY; j <= cKernSizeY; j += skipY)
	{
		for (int i = -cKernSizeX; i <= cKernSizeX; i += skipX)
		{
			// 3x3 texel neighbourhood of the search position.
			int2 spos =		{ pos.x + i,  pos.y + j };
			int2 pSLeft =	{ spos.x - 1, spos.y };
			int2 pSRight =	{ spos.x + 1, spos.y };
			int2 pSTop =	{ spos.x,	  spos.y - 1 };
			int2 pSBot =	{ spos.x,	  spos.y + 1 };
			int2 pSLT =		{ spos.x - 1, spos.y - 1 };
			int2 pSRT =		{ spos.x + 1, spos.y - 1 };
			int2 pSLBot =	{ spos.x - 1, spos.y + 1 };
			int2 pSRBot =	{ spos.x + 1, spos.y + 1 };

			ushort4 search=	readImageus4(imgin, spos);
			ushort4 sLeft =	readImageus4(imgin, pSLeft);
			ushort4 sRight = readImageus4(imgin, pSRight);
			ushort4 sTop =	readImageus4(imgin, pSTop);
			ushort4 sBot =	readImageus4(imgin, pSBot);
			ushort4 sLT =	readImageus4(imgin, pSLT);
			ushort4 sRT =	readImageus4(imgin, pSRT);
			ushort4 sLBot =	readImageus4(imgin, pSLBot);
			ushort4 sRBot =	readImageus4(imgin, pSRBot);

			// Same directional quantities as for the centre, for the search texel.
			float4 horizsumS = {sLeft.w + search.y,	search.x + search.z,search.y + search.w, search.z + sRight.x};
			float4 horizvalS = horizsumS / 2;
			float4 verticsumS= convert_float4(sTop) + convert_float4(sBot);
			float4 verticvalS= verticsumS / 2;
			float4 diagNsumS = {sLT.w + sBot.y,		sTop.x + sBot.z,	sTop.y + sBot.w,	sTop.z + sRBot.x};
			float4 diagNValS = diagNsumS / 2;
			float4 diagZsumS = {sLBot.w + sTop.y,	sBot.x + sTop.z,	sBot.y + sTop.w,	sBot.z + sRT.x};
			float4 diagZValS = diagZsumS / 2;
			float4 searchF = convert_float4(search);
			float4 meanHS = (searchF + horizvalS) / 2;
			float4 meanVS = (searchF + verticvalS)/ 2;
			float4 meanNS = (searchF + diagNValS) / 2;
			float4 meanZS = (searchF + diagZValS) / 2;
			float4 lumS = (meanHS + meanVS + meanNS + meanZS)/4;

			// Output lanes x and z sample search lanes x and z,
			// output lanes y and w sample search lanes y and w.
			GRAY4_CROSS_NR_ACCUM(x, x);
			GRAY4_CROSS_NR_ACCUM(z, x);

			GRAY4_CROSS_NR_ACCUM(y, y);
			GRAY4_CROSS_NR_ACCUM(w, y);

			GRAY4_CROSS_NR_ACCUM(x, z);
			GRAY4_CROSS_NR_ACCUM(z, z);

			GRAY4_CROSS_NR_ACCUM(y, w);
			GRAY4_CROSS_NR_ACCUM(w, w);
		}
	}
	float4 opix = sum / convert_float4(count);
	return convert_int4(opix) - (int4)ofs;
}
#undef GRAY4_CROSS_NR_ACCUM


/* This version computes 4 pixels in sequence in one go! 157ms for 1/4 SonyIMX211
 *
 * Each work item filters one rectangular tile of imgout. The output is split
 * into get_global_size(0) x get_global_size(1) tiles; work item (gx, gy)
 * handles columns [gx*W/GX, (gx+1)*W/GX) and rows [gy*H/GY, (gy+1)*H/GY).
 * The bounds are computed with integer arithmetic so that neighbouring work
 * items always agree on their shared boundary: the former float computation
 * (id*bs versus id*bs + bs) could round the two sides differently and skip a
 * row or column when the image size is not a multiple of the global size.
 *
 * imgin       image handed to doGray4NR.
 * ofs         passed through to doGray4NR.
 * kernSize    only used here to offset the input position by
 *             (kernSize/4 + 1, kernSize + 1) relative to the output position
 *             (presumably imgin carries a border of that size; confirm
 *             against the host code).
 * lumaSqtThr  passed through to doGray4NR.
 * pt2Thr      passed through to doGray4NR.
 * imgout      destination image; results are saturated to unsigned before
 *             being written.
 */
__kernel void Gray4NR(__read_only image2d_t imgin,
	const int ofs,
	const int kernSize,
	const float lumaSqtThr,
	const float pt2Thr,
	__write_only image2d_t imgout)
{
	const int2 tile_id	 = { (int)get_global_id(0),		(int)get_global_id(1) };
	const int2 work_size = { (int)get_global_size(0),	(int)get_global_size(1) };
	const int2 out_dim	 = { get_image_width(imgout),	get_image_height(imgout) };

	// 64-bit intermediates keep id * dimension from overflowing.
	const int x_begin = (int)(((long)tile_id.x		 * out_dim.x) / work_size.x);
	const int x_end	  = (int)(((long)(tile_id.x + 1) * out_dim.x) / work_size.x);
	const int y_begin = (int)(((long)tile_id.y		 * out_dim.y) / work_size.y);
	const int y_end	  = (int)(((long)(tile_id.y + 1) * out_dim.y) / work_size.y);

	for (int y = y_begin; y < y_end; ++y)
	{
		for (int x = x_begin; x < x_end; ++x)
		{
			int2 ipos = { x + kernSize/4 + 1, y + kernSize + 1};
			int4 opix = doGray4NR(imgin, ipos, ofs, lumaSqtThr, pt2Thr);
			// Negative results clamp to 0 here.
			uint4 opixu = convert_uint4_sat(opix);
			int2 opos = { x,y };
			write_imageui(imgout, opos, opixu);
		}
	}
}


/* This version uses only every second pixel 125ms for 1/4 SonyIMX211
 *
 * Each work item filters one rectangular tile of imgout with doGray4CrossNR.
 * The output is split into get_global_size(0) x get_global_size(1) tiles;
 * work item (gx, gy) handles columns [gx*W/GX, (gx+1)*W/GX) and rows
 * [gy*H/GY, (gy+1)*H/GY). The bounds are computed with integer arithmetic so
 * that neighbouring work items always agree on their shared boundary: the
 * former float computation (id*bs versus id*bs + bs) could round the two
 * sides differently and skip a row or column when the image size is not a
 * multiple of the global size.
 *
 * imgin       image handed to doGray4CrossNR.
 * ofs         passed through to doGray4CrossNR.
 * kernSize    only used here to offset the input position by
 *             (kernSize/4 + 1, kernSize + 1) relative to the output position;
 *             doGray4CrossNR itself scans a fixed +/-8 row, +/-2 texel window.
 * lumaSqtThr  passed through to doGray4CrossNR.
 * pt2Thr      passed through to doGray4CrossNR.
 * imgout      destination image; results are saturated to unsigned before
 *             being written.
 */
__kernel void Gray4CrossNR(__read_only image2d_t imgin,
	const int ofs,
	const int kernSize,
	const float lumaSqtThr,
	const float pt2Thr,
	__write_only image2d_t imgout)
{
	const int2 tile_id	 = { (int)get_global_id(0),		(int)get_global_id(1) };
	const int2 work_size = { (int)get_global_size(0),	(int)get_global_size(1) };
	const int2 out_dim	 = { get_image_width(imgout),	get_image_height(imgout) };

	// 64-bit intermediates keep id * dimension from overflowing.
	const int x_begin = (int)(((long)tile_id.x		 * out_dim.x) / work_size.x);
	const int x_end	  = (int)(((long)(tile_id.x + 1) * out_dim.x) / work_size.x);
	const int y_begin = (int)(((long)tile_id.y		 * out_dim.y) / work_size.y);
	const int y_end	  = (int)(((long)(tile_id.y + 1) * out_dim.y) / work_size.y);

	for (int y = y_begin; y < y_end; ++y)
	{
		for (int x = x_begin; x < x_end; ++x)
		{
			int2 ipos = { x + kernSize/4 + 1, y + kernSize + 1};
			int4 opix = doGray4CrossNR(imgin, ipos, ofs, lumaSqtThr, pt2Thr);
			// Negative results clamp to 0 here.
			uint4 opixu = convert_uint4_sat(opix);
			int2 opos = { x,y };
			write_imageui(imgout, opos, opixu);
		}
	}
}


/********************************************* C C D ***********************************************/
/*
 * Pattern-matching averaging filter for one texel of a packed grayscale image,
 * variant with fixed (luma-independent) tolerances.
 *
 * The four lanes of a texel are treated as four horizontally consecutive
 * pixels: the horizontal neighbours of lane x are left.w and center.y, the
 * neighbours of lane w are center.z and right.x.
 *
 * For every lane of the centre texel the sums of its horizontal, vertical and
 * both diagonal neighbour pairs are computed. A fixed window of +/-8 rows and
 * +/-2 texels around pos is then scanned; every lane of every search texel is
 * compared against every centre lane and is added to that lane's average when
 *   - its value differs from the centre value by at most channelThr, and
 *   - each of its four neighbour sums differs from the corresponding centre
 *     sum by at most pattern2Thr.
 *
 * imgin        image to read; reads go through readImageus4 (bnrSampler clamps
 *              coordinates to the image edge).
 * pos          texel coordinate of the centre.
 * ofs          value subtracted from every lane of the result.
 * channelThr   maximum accepted absolute difference of the pixel values.
 * pattern2Thr  maximum accepted absolute difference of each neighbour sum.
 *
 * Returns the per-lane mean of the accepted pixels, converted to int, minus ofs.
 *
 * With non-negative tolerances count is never zero: at i == 0, j == 0 the
 * search texel is the centre texel and each lane matches itself.
 */

/* Adds lane `src` of the search texel to output lane `dst` when the value and
 * all four neighbour sums of that search pixel match centre lane `dst`. */
#define GRAY_CCD4_NR_ACCUM(src, dst) \
	do { \
		if ((fabs(searchF.src - centerF.dst)       <= channelThr) && \
			(fabs(horizsumS.src - horizsumC.dst)   <= pattern2Thr) && \
			(fabs(verticsumS.src - verticsumC.dst) <= pattern2Thr) && \
			(fabs(diagNsumS.src - diagNsumC.dst)   <= pattern2Thr) && \
			(fabs(diagZsumS.src - diagZsumC.dst)   <= pattern2Thr)) \
		{ \
			sum.dst += search.src; \
			++count.dst; \
		} \
	} while (0)

int4 doGrayCcd4NR(__read_only image2d_t imgin, int2 pos, const int ofs,
				const float channelThr, const float pattern2Thr)
{
	// 3x3 texel neighbourhood of the centre.
	int2 pLeft =  { pos.x - 1, pos.y };
	int2 pRight = { pos.x + 1, pos.y };
	int2 pTop =	  { pos.x,	   pos.y - 1 };
	int2 pBot =	  { pos.x,	   pos.y + 1 };
	int2 pLT =	  { pos.x - 1, pos.y - 1 };
	int2 pRT =	  { pos.x + 1, pos.y - 1 };
	int2 pLBot =  { pos.x - 1, pos.y + 1 };
	int2 pRBot =  { pos.x + 1, pos.y + 1 };
	ushort4 center= readImageus4(imgin, pos);
	ushort4 left  = readImageus4(imgin, pLeft);
	ushort4 right = readImageus4(imgin, pRight);
	ushort4 top	 = readImageus4(imgin, pTop);
	ushort4 bot	 = readImageus4(imgin, pBot);
	ushort4 lTop = readImageus4(imgin, pLT);
	ushort4 rTop = readImageus4(imgin, pRT);
	ushort4 lBot = readImageus4(imgin, pLBot);
	ushort4 rBot = readImageus4(imgin, pRBot);

	// Per-lane neighbour sums of the centre texel.
	float4 horizsumC = {left.w + center.y, center.x + center.z, center.y + center.w, center.z + right.x};
	float4 verticsumC= convert_float4(top) + convert_float4(bot);
	// "N" diagonal (\): top-left + bottom-right pixel.
	float4 diagNsumC = {lTop.w + bot.y,	   top.x + bot.z,		top.y + bot.w,		top.z + rBot.x};
	// "Z" diagonal (/): bottom-left + top-right pixel. For lane x the
	// bottom-left pixel lives in the lower-left texel (lBot.w), for lane w the
	// top-right pixel lives in the upper-right texel (rTop.x).
	float4 diagZsumC = {lBot.w + top.y,	   bot.x + top.z,		bot.y + top.w,		bot.z + rTop.x};
	float4 centerF = convert_float4(center);

	// Fixed search window: +/-8 rows, +/-2 texels (= +/-8 pixels) horizontally.
	const int skipY = 1;
	const int skipX = 1;
	const int cKernSizeY = 8;
	const int cKernSizeX = 8/4;
	float4 sum = {0,0,0,0};
	int4 count = {0,0,0,0};
	for (int j = -cKernSizeY; j <= cKernSizeY; j += skipY)
	{
		for (int i = -cKernSizeX; i <= cKernSizeX; i += skipX)
		{
			// 3x3 texel neighbourhood of the search position.
			int2 spos =		{ pos.x + i,  pos.y + j };
			int2 pSLeft =	{ spos.x - 1, spos.y };
			int2 pSRight =	{ spos.x + 1, spos.y };
			int2 pSTop =	{ spos.x,	  spos.y - 1 };
			int2 pSBot =	{ spos.x,	  spos.y + 1 };
			int2 pSLT =		{ spos.x - 1, spos.y - 1 };
			int2 pSRT =		{ spos.x + 1, spos.y - 1 };
			int2 pSLBot =	{ spos.x - 1, spos.y + 1 };
			int2 pSRBot =	{ spos.x + 1, spos.y + 1 };

			ushort4 search=	readImageus4(imgin, spos);
			ushort4 sLeft =	readImageus4(imgin, pSLeft);
			ushort4 sRight = readImageus4(imgin, pSRight);
			ushort4 sTop =	readImageus4(imgin, pSTop);
			ushort4 sBot =	readImageus4(imgin, pSBot);
			ushort4 sLT =	readImageus4(imgin, pSLT);
			ushort4 sRT =	readImageus4(imgin, pSRT);
			ushort4 sLBot =	readImageus4(imgin, pSLBot);
			ushort4 sRBot =	readImageus4(imgin, pSRBot);

			// Same neighbour sums as for the centre, for the search texel.
			float4 horizsumS = {sLeft.w + search.y,	search.x + search.z,search.y + search.w, search.z + sRight.x};
			float4 verticsumS= convert_float4(sTop) + convert_float4(sBot);
			float4 diagNsumS = {sLT.w + sBot.y,		sTop.x + sBot.z,	sTop.y + sBot.w,	sTop.z + sRBot.x};
			float4 diagZsumS = {sLBot.w + sTop.y,	sBot.x + sTop.z,	sBot.y + sTop.w,	sBot.z + sRT.x};
			float4 searchF = convert_float4(search);

			// Every search lane is a candidate for every output lane.
			GRAY_CCD4_NR_ACCUM(x, x);
			GRAY_CCD4_NR_ACCUM(y, x);
			GRAY_CCD4_NR_ACCUM(z, x);
			GRAY_CCD4_NR_ACCUM(w, x);

			GRAY_CCD4_NR_ACCUM(x, y);
			GRAY_CCD4_NR_ACCUM(y, y);
			GRAY_CCD4_NR_ACCUM(z, y);
			GRAY_CCD4_NR_ACCUM(w, y);

			GRAY_CCD4_NR_ACCUM(x, z);
			GRAY_CCD4_NR_ACCUM(y, z);
			GRAY_CCD4_NR_ACCUM(z, z);
			GRAY_CCD4_NR_ACCUM(w, z);

			GRAY_CCD4_NR_ACCUM(x, w);
			GRAY_CCD4_NR_ACCUM(y, w);
			GRAY_CCD4_NR_ACCUM(z, w);
			GRAY_CCD4_NR_ACCUM(w, w);
		}
	}
	float4 opix = sum / convert_float4(count);
	return convert_int4(opix) - (int4)ofs;
}
#undef GRAY_CCD4_NR_ACCUM


/* This version computes 4 pixels in sequence in one go! 157ms for 1/4 SonyIMX211
 *
 * Each work item filters one rectangular tile of imgout with doGrayCcd4NR.
 * The output is split into get_global_size(0) x get_global_size(1) tiles;
 * work item (gx, gy) handles columns [gx*W/GX, (gx+1)*W/GX) and rows
 * [gy*H/GY, (gy+1)*H/GY). The bounds are computed with integer arithmetic so
 * that neighbouring work items always agree on their shared boundary: the
 * former float computation (id*bs versus id*bs + bs) could round the two
 * sides differently and skip a row or column when the image size is not a
 * multiple of the global size.
 *
 * imgin       image handed to doGrayCcd4NR.
 * ofs         passed through to doGrayCcd4NR.
 * kernSize    only used here to offset the input position by
 *             (kernSize/4 + 1, kernSize + 1) relative to the output position;
 *             doGrayCcd4NR itself scans a fixed +/-8 row, +/-2 texel window.
 * chennelThr  passed to doGrayCcd4NR as its channelThr argument.
 * pt2Thr      passed to doGrayCcd4NR as its pattern2Thr argument.
 * imgout      destination image; results are saturated to unsigned before
 *             being written.
 */
__kernel void GrayCcd4NR(__read_only image2d_t imgin,
	const int ofs,
	const int kernSize,
	const float chennelThr,
	const float pt2Thr,
	__write_only image2d_t imgout)
{
	const int2 tile_id	 = { (int)get_global_id(0),		(int)get_global_id(1) };
	const int2 work_size = { (int)get_global_size(0),	(int)get_global_size(1) };
	const int2 out_dim	 = { get_image_width(imgout),	get_image_height(imgout) };

	// 64-bit intermediates keep id * dimension from overflowing.
	const int x_begin = (int)(((long)tile_id.x		 * out_dim.x) / work_size.x);
	const int x_end	  = (int)(((long)(tile_id.x + 1) * out_dim.x) / work_size.x);
	const int y_begin = (int)(((long)tile_id.y		 * out_dim.y) / work_size.y);
	const int y_end	  = (int)(((long)(tile_id.y + 1) * out_dim.y) / work_size.y);

	for (int y = y_begin; y < y_end; ++y)
	{
		for (int x = x_begin; x < x_end; ++x)
		{
			int2 ipos = { x + kernSize/4 + 1, y + kernSize + 1};
			int4 opix = doGrayCcd4NR(imgin, ipos, ofs, chennelThr, pt2Thr);
			// Negative results clamp to 0 here.
			uint4 opixu = convert_uint4_sat(opix);
			int2 opos = { x,y };
			write_imageui(imgout, opos, opixu);
		}
	}
}
