124 8. Implementing a Fast DDOF Solver
// Output of the reduction pixel shader: two render targets written at once.
struct ReduceO
{
    // Render target 0: the reduced coefficient triple (a, b, c) of the
    // tridiagonal system; InitialReduceHorz4 stores 0 in w.
    float4 abc : SV_TARGET0;

    // Render target 1: the reduced right-hand side x (xyz);
    // InitialReduceHorz4 stores 0 in w.
    float4 x   : SV_TARGET1;
};
// Horizontal four-to-one reduction pass (Listing 8.1).
//
// For every output pixel this shader rebuilds the tridiagonal coefficients
// (a, b, c) and loads the right-hand side x for the seven input texels
// surrounding input column 4 * floor(input.Pos.x) + 3, performs three
// two-to-one reductions on them (phase 2), and then one final two-to-one
// reduction on those results (phase 3).
//
// input  : pixel shader input; only input.Pos.xy is read.
// returns: ReduceO with the reduced (a, b, c) in abc.xyz and the reduced
//          right-hand side in x.xyz; both w components are set to 0.
//
// Reads the external names computeCoC, txX and g_vImageSize.
ReduceO InitialReduceHorz4(PS_INPUT input)
{
    ReduceO output;

    // Each output column maps to four input columns; the reduction is
    // centered on the last of them (index 4 * i + 3).
    float fPosX = floor(input.Pos.x) * 4.0 + 3.0;
    int3  i3LP  = int3(fPosX, input.Pos.y, 0);

    // Phase 1: Gather and compute all data necessary
    // for the four-to-one reduction.

    // Compute the CoC values in the support needed for
    // the four-to-one reduction.
    float fCoC_4 = computeCoC(i3LP, int2(-4, 0));
    float fCoC_3 = computeCoC(i3LP, int2(-3, 0));
    float fCoC_2 = computeCoC(i3LP, int2(-2, 0));
    float fCoC_1 = computeCoC(i3LP, int2(-1, 0));
    float fCoC0  = computeCoC(i3LP, int2(0, 0));
    float fCoC1  = computeCoC(i3LP, int2(1, 0));
    float fCoC2  = computeCoC(i3LP, int2(2, 0));
    float fCoC3  = computeCoC(i3LP, int2(3, 0));
    float fCoC4  = computeCoC(i3LP, int2(4, 0));

    // Ensure insulation at the image borders by setting
    // the CoC to 0 outside the image.
    fCoC_4 = (fPosX - 4.0 >= 0.0) ? fCoC_4 : 0.0;
    fCoC_3 = (fPosX - 3.0 >= 0.0) ? fCoC_3 : 0.0;
    fCoC4  = (fPosX + 4.0) < g_vImageSize.x ? fCoC4 : 0.0;
    fCoC3  = (fPosX + 3.0) < g_vImageSize.x ? fCoC3 : 0.0;
    fCoC2  = (fPosX + 2.0) < g_vImageSize.x ? fCoC2 : 0.0;
    fCoC1  = (fPosX + 1.0) < g_vImageSize.x ? fCoC1 : 0.0;

    // Use the minimum CoC as the real CoC as described in Kass et al.
    // Each value couples a texel with its right-hand neighbor.
    float fRealCoC_4 = min(fCoC_4, fCoC_3);
    float fRealCoC_3 = min(fCoC_3, fCoC_2);
    float fRealCoC_2 = min(fCoC_2, fCoC_1);
    float fRealCoC_1 = min(fCoC_1, fCoC0);
    float fRealCoC0  = min(fCoC0, fCoC1);
    float fRealCoC1  = min(fCoC1, fCoC2);
    float fRealCoC2  = min(fCoC2, fCoC3);
    float fRealCoC3  = min(fCoC3, fCoC4);

    // Compute beta values interpreting the CoCs as the diameter.
    float bt_4 = fRealCoC_4 * fRealCoC_4;
    float bt_3 = fRealCoC_3 * fRealCoC_3;
    float bt_2 = fRealCoC_2 * fRealCoC_2;
    float bt_1 = fRealCoC_1 * fRealCoC_1;
    float bt0  = fRealCoC0 * fRealCoC0;
    float bt1  = fRealCoC1 * fRealCoC1;
    float bt2  = fRealCoC2 * fRealCoC2;
    float bt3  = fRealCoC3 * fRealCoC3;

    // Now compute the a, b, c and load the x in the support
    // region of the four-to-one reduction.
    // Row i is (-beta[i-1], 1 + beta[i] + beta[i-1], -beta[i]); the
    // diagonal is therefore always >= 1, so the divisions below are safe.
    float3 abc_3 = float3(-bt_4, 1.0 + bt_3 + bt_4, -bt_3);
    float3 x_3   = txX.Load(i3LP, int2(-3, 0)).xyz;
    float3 abc_2 = float3(-bt_3, 1.0 + bt_2 + bt_3, -bt_2);
    float3 x_2   = txX.Load(i3LP, int2(-2, 0)).xyz;
    float3 abc_1 = float3(-bt_2, 1.0 + bt_1 + bt_2, -bt_1);
    float3 x_1   = txX.Load(i3LP, int2(-1, 0)).xyz;
    float3 abc0  = float3(-bt_1, 1.0 + bt0 + bt_1, -bt0);
    float3 x0    = txX.Load(i3LP, int2(0, 0)).xyz;
    float3 abc1  = float3(-bt0, 1.0 + bt1 + bt0, -bt1);
    float3 x1    = txX.Load(i3LP, int2(1, 0)).xyz;
    float3 abc2  = float3(-bt1, 1.0 + bt2 + bt1, -bt2);
    float3 x2    = txX.Load(i3LP, int2(2, 0)).xyz;
    float3 abc3  = float3(-bt2, 1.0 + bt3 + bt2, -bt3);
    float3 x3    = txX.Load(i3LP, int2(3, 0)).xyz;

    // Phase 2: Reduce all the data by doing all two-to-one
    // reductions to get to the next reduction level.
    // a* / g* are the elimination factors for the left / right neighbor
    // row of the rows at offsets -2, 0 and +2.
    float a_1 = -abc_2.x / abc_3.y;
    float g_1 = -abc_2.z / abc_1.y;
    float a0  = -abc0.x / abc_1.y;
    float g0  = -abc0.z / abc1.y;
    float a1  = -abc2.x / abc1.y;
    float g1  = -abc2.z / abc3.y;

    // Reduced row at offset -2 ("previous").
    float3 abc_p = float3(a_1 * abc_3.x,
                          abc_2.y + a_1 * abc_3.z + g_1 * abc_1.x,
                          g_1 * abc_1.z);
    float3 x_p   = float3(x_2 + a_1 * x_3 + g_1 * x_1);

    // Reduced row at offset 0 ("center").
    float3 abc_c = float3(a0 * abc_1.x,
                          abc0.y + a0 * abc_1.z + g0 * abc1.x,
                          g0 * abc1.z);
    float3 x_c   = float3(x0 + a0 * x_1 + g0 * x1);

    // Reduced row at offset +2 ("next").
    float3 abc_n = float3(a1 * abc1.x,
                          abc2.y + a1 * abc1.z + g1 * abc3.x,
                          g1 * abc3.z);
    float3 x_n   = float3(x2 + a1 * x1 + g1 * x3);

    // Phase 3: Do the final two-to-one reduction to complete
    // the four-to-one reduction.
    float a = -abc_c.x / abc_p.y;
    float g = -abc_c.z / abc_n.y;

    float3 res0 = float3(a * abc_p.x,
                         abc_c.y + a * abc_p.z + g * abc_n.x,
                         g * abc_n.z);
    float3 res1 = float3(x_c + a * x_p + g * x_n);

    output.abc = float4(res0, 0.0);
    output.x   = float4(res1, 0.0);

    return (output);
}
Listing 8.1. Horizontal four-to-one reduction.
3. Perform a final one-to-four solving pass to deal with the initial four-to-one
reduction pass. Again, a very hands-on approach for solving the problem at
hand is used, and it also has three phases. Since an initial four-to-one reduction
shader was used, we don’t have all the data available to perform the
8.2ModifyingtheBasicCRSolver 127
needed one-to-four solving pass. Phase 1 of the shader therefore starts to reconstruct
the missing data from the unchanged and full-resolution input data
in the same fashion that was used in Listing 8.1. Phase 2 uses this data to perform
several one-to-two solving steps to produce the missing $y_i$ values of the
intermediate pass that we skip. Phase 3 finally uses all that data to produce
the final result. Listing 8.2 shows a shader model 4 code fragment implementing
the corresponding algorithm for that final solver stage. Again, only
the code for the horizontal version of the algorithm is shown.
float4 FinalSolveHorz4(PS_INPUT input) : SV_TARGET
{
// First reconstruct the level 1 x, abc.
float fPosX = floor(input.Pos.x * 0.25) * 4.0 + 3.0;
int3 i3LoadPos = int3(fPosX, input.Pos.y, 0);
// Phase 1: Gather data to reconstruct intermediate data
// lost when skipping the first two-to-one reduction step
// of the original solver.
float fCoC_5 = computeCoC(i3LoadPos, int2(-5, 0));
float fCoC_4 = computeCoC(i3LoadPos, int2(-4, 0));
float fCoC_3 = computeCoC(i3LoadPos, int2(-3, 0));
float fCoC_2 = computeCoC(i3LoadPos, int2(-2, 0));
float fCoC_1 = computeCoC(i3LoadPos, int2(-1, 0));
float fCoC0 = computeCoC(i3LoadPos, int2(0, 0));
float fCoC1 = computeCoC(i3LoadPos, int2(1, 0));
float fCoC2 = computeCoC(i3LoadPos, int2(2, 0));
float fCoC3 = computeCoC(i3LoadPos, int2(3, 0));
float fCoC4 = computeCoC(i3LoadPos, int2(4, 0));
fCoC_5 = (fPosX - 5.0 >= 0.0) ? fCoC_5 : 0.0;
fCoC_4 = (fPosX - 4.0 >= 0.0) ? fCoC_4 : 0.0;
fCoC_3 = (fPosX - 3.0 >= 0.0) ? fCoC_3 : 0.0;
fCoC4 = (fPosX + 4.0 < g_vImageSize.x) ? fCoC4: 0.0;
fCoC3 = (fPosX + 3.0 < g_vImageSize.x) ? fCoC3 : 0.0;
fCoC2 = (fPosX + 2.0 < g_vImageSize.x) ? fCoC2 : 0.0;
fCoC1 = (fPosX + 1.0 < g_vImageSize.x) ? fCoC1 : 0.0;
float fRealCoC_5 = min(fCoC_5, fCoC_4);
float fRealCoC_4 = min(fCoC_4, fCoC_3);
float fRealCoC_3 = min(fCoC_3, fCoC_2);
128 8.ImplementingaFastDDOFSolver
float fRealCoC_2 = min(fCoC_2, fCoC_1);
float fRealCoC_1 = min(fCoC_1, fCoC0);
float fRealCoC0 = min(fCoC0, fCoC1);
float fRealCoC1 = min(fCoC1, fCoC2);
float fRealCoC2 = min(fCoC2, fCoC3);
float fRealCoC3 = min(fCoC3, fCoC4);
float b_5 = fRealCoC_5 * fRealCoC_5;
float b_4 = fRealCoC_4 * fRealCoC_4;
float b_3 = fRealCoC_3 * fRealCoC_3;
float b_2 = fRealCoC_2 * fRealCoC_2;
float b_1 = fRealCoC_1 * fRealCoC_1;
float b0 = fRealCoC0 * fRealCoC0;
float b1 = fRealCoC1 * fRealCoC1;
float b2 = fRealCoC2 * fRealCoC2;
float b3 = fRealCoC3 * fRealCoC3;
float3 abc_4 = float3(-b_5, 1.0 + b_4 + b_5, -b_4);
float3 x_4 = txX.Load(i3LoadPos, int2(-4, 0)).xyz;
float3 abc_3 = float3(-b_4, 1.0 + b_3 + b_4, -b_3);
float3 x_3 = txX.Load(i3LoadPos, int2(-3, 0)).xyz;
float3 abc_2 = float3(-b_3, 1.0 + b_2 + b_3, -b_2);
float3 x_2 = txX.Load(i3LoadPos, int2(-2, 0)).xyz;
float3 abc_1 = float3(-b_2, 1.0 + b_1 + b_2, -b_1);
float3 x_1 = xX.Load(i3LP, int2(-1, 0)).xyz;
float3 abc0 = float3(-b_1, 1.0 + b0 + b_1, -b0);
float3 x0 = txX.Load(i3LP, int2(0, 0)).xyz;
float3 abc1 = float3(-b0, 1.0 + b1 + b0, -b1);
float3 x1 = txX.Load(i3LP, int2(1, 0)).xyz;
float3 abc2 = float3(-b1, 1.0 + b2 + b1, -b2);
float3 x2 = txX.Load(i3LP, int2(2, 0)).xyz;
float3 abc3 = float3(-b2, 1.0 + b3 + b2, -b3);
float3 x3 = txX.Load(i3LP, int2(3, 0)).xyz;
..................Content has been hidden....................

You can't read the all page of ebook, please click here login for view all page.
Reset