00001 #ifndef __F4ThomasLS_H__ 00002 #define __F4ThomasLS_H__ 00003 00004 00005 00039 #include "AbsF4TriDiagonalLS.hpp" 00040 00041 #include <math.h> 00042 #include "sseUtil.h" 00043 00044 00045 class F4ThomasLS : public AbsF4TriDiagonalLS 00046 { 00047 private: 00048 00049 Float4Vector* uOut; 00050 Float4Vector* dOut; 00051 Float4Vector* lOut; 00052 00053 Float4Vector* y; 00054 00055 protected: 00056 00057 // for parallel executions 00059 inline void computeLRdecomposition() 00060 { 00061 /* 00062 for(int line=0; line<4; line++) 00063 { 00064 dOut->set0(line, 0, A->getD0(line, 0) ); 00065 00066 for(int j=1; j<N; j++) 00067 { 00068 lOut->set(line, j, A->getL(line, j)/dOut->get(line, j) ); 00069 00070 dOut->set0(line, j, A->getD0(line, j) - lOut->get(line, j)*A->getU(line,j) ); 00071 } 00072 } 00073 */ 00074 float *dout, *ad0, *al, *au, *lout; 00075 // int j4=4; 00076 00077 00078 dout= dOut->get0(); 00079 ad0= A->getD0(); 00080 al= A->getL(); 00081 au= A->getU(); 00082 lout= lOut->get(); 00083 00084 00085 /* prefetcht0(al[j4]); // use one time 00086 prefetcht0(ad0[j4]); // use one time 00087 prefetcht0(au[j4]); // use one time 00088 */ 00089 movaps_m2r(ad0[0], xmm0); // dout0 00090 movaps_r2m(xmm0, dout[0]); 00091 00092 for(int j=4; j<N4; j+=4) // , j4+=4) 00093 { 00094 /* prefetcht0(al[j4]); // use one time 00095 prefetcht0(ad0[j4]); // use one time 00096 prefetcht0(au[j4]); // use one time 00097 */ 00098 movaps_m2r(al[j], xmm1); // al 00099 // divps_r2r(xmm0, xmm1); // al/dout 00100 rcpps_r2r(xmm0, xmm3); // dout -> 1/dout division optimisation 00101 mulps_r2r(xmm3, xmm1); // al*(1/dout) 00102 movaps_r2m(xmm1, lout[j]); // lout 00103 00104 movaps_m2r(ad0[j], xmm0); // ad0 00105 movaps_m2r(au[j], xmm2); // au 00106 mulps_r2r(xmm2, xmm1); // lout*au 00107 subps_r2r(xmm1, xmm0); // ad0 - lout*au 00108 movaps_r2m(xmm0, dout[j]); 00109 } 00110 } 00111 00113 inline void forwardSubstitution() 00114 { 00115 /* 00116 for(int line=0; line<4; line++) 00117 { 00118 y->set0(line, 0, b->get0(line, 0) ); 00119 00120 for(int j=1; j<N; j++) 00121 y->set0(line, j, b->get0(line, j) - lOut->get(line, j)*y->get(line, j) ); 00122 } 00123 */ 00124 float *yv, *bv, *lout; 00125 // int j4=4; 00126 00127 yv= y->get0(); 00128 bv= b->get0(); 00129 lout= lOut->get(); 00130 00131 /* prefetcht0(lout[j4]); // use one time 00132 prefetcht0(bv[j4]); // use one time 00133 */ 00134 movaps_m2r(bv[0], xmm0); // b0 00135 movaps_r2m(xmm0, yv[0]); // y0 00136 00137 for(int j=4; j<N4; j+=4) // , j4+=4) 00138 { 00139 /* prefetcht0(lout[j4]); // use one time 00140 prefetcht0(bv[j4]); // use one time 00141 */ 00142 movaps_m2r(bv[j], xmm1); // bj 00143 movaps_m2r(lout[j], xmm2); // lout 00144 mulps_r2r(xmm2, xmm0); // lout*y 00145 subps_r2r(xmm0, xmm1); // bj - lout*y 00146 movaps_r2r(xmm1, xmm0); 00147 movaps_r2m(xmm0, yv[j]); // yj 00148 } 00149 } 00150 00152 inline void backwardSubstitution() 00153 { 00154 /* for(int line=0; line<4; line++) 00155 { 00156 x->set(line, N, y->get(line, N)/dOut->get(line, N) ); 00157 00158 for(int j=N-1; j>0; j--) 00159 x->set(line, j, (y->get(line,j) - A->getU(line,j)*x->get0(line,j))/dOut->get(line,j) ); 00160 } 00161 */ 00162 float *xv, *yv, *dout, *au; 00163 // int j4=N4-4; 00164 00165 xv= x->get(); 00166 yv= y->get(); 00167 dout= dOut->get(); 00168 au= A->getU(); 00169 00170 prefetcht1(au[N4-4]); // use one time 00171 /* prefetcht0(au[j4]); // use one time 00172 prefetcht0(dout[j4]); // use one time 00173 prefetcht0(yv[j4]); // use one time 00174 */ 00175 movaps_m2r(yv[N4], xmm0); // y 00176 movaps_m2r(dout[N4], xmm1); // dout 00177 // divps_r2r(xmm1, xmm0); // y/dout 00178 rcpps_r2r(xmm1, xmm4); // dout -> 1/dout division optimisation 00179 mulps_r2r(xmm4, xmm0); // y*(1/dout) 00180 00181 movaps_r2m(xmm0, xv[N4]); // x 00182 00183 for(int j=N4-4; j>0; j -=4) // , j4-=4) 00184 { 00185 prefetcht1(au[j-4]); // use one time 00186 /* prefetcht0(au[j4]); // use one time 00187 prefetcht0(dout[j4]); // use one time 00188 prefetcht0(yv[j4]); // use one time 00189 */ 00190 movaps_m2r(yv[j], xmm1); // y 00191 movaps_m2r(au[j], xmm2); // au 00192 movaps_m2r(dout[j], xmm3); // dout 00193 00194 mulps_r2r(xmm0, xmm2); // x*au 00195 subps_r2r(xmm2, xmm1); // y - x*au 00196 // divps_r2r(xmm3, xmm1); // (y - x*au)/dout 00197 rcpps_r2r(xmm3, xmm4); // dout -> 1/dout division optimisation 00198 mulps_r2r(xmm4, xmm1); // (y - x*au)*(1/dout) 00199 00200 movaps_r2r(xmm1, xmm0); 00201 movaps_r2m(xmm0, xv[j]); // x 00202 } 00203 } 00204 00205 // combined LR/forward reducing data flow 00206 // for parallel executions 00208 inline void computeLR_forward() 00209 { 00210 /* 00211 for(int line=0; line<4; line++) 00212 { 00213 dOut->set0(line, 0, A->getD0(line, 0) ); 00214 y->set0(line, 0, b->get0(line, 0) ); 00215 00216 for(int j=1; j<N; j++) 00217 { 00218 lOut->set(line, j, A->getL(line, j)/dOut->get(line, j) ); 00219 dOut->set0(line, j, A->getD0(line, j) - lOut->get(line, j)*A->getU(line,j) ); 00220 00221 y->set0(line, j, b->get0(line, j) - lOut->get(line, j)*y->get(line, j) ); 00222 } 00223 } 00224 00225 */ 00226 float *dout, *ad0, *al, *au, *yv, *bv; // lout is not needed 00227 // int j4=4; 00228 00229 dout= dOut->get0(); 00230 ad0= A->getD0(); 00231 al= A->getL(); 00232 au= A->getU(); 00233 // lout= lOut->get(); // not needed 00234 yv= y->get0(); 00235 bv= b->get0(); 00236 00237 prefetcht1(bv[4]); // use one time 00238 prefetcht1(ad0[4]); // use one time 00239 /* prefetcht0(al[j4]); // use one time 00240 prefetcht0(ad0[j4]); // use one time 00241 prefetcht0(au[j4]); // use one time 00242 prefetcht0(bv[j4]); // use one time 00243 */ 00244 movaps_m2r(ad0[0], xmm0); // dout0 00245 movaps_r2m(xmm0, dout[0]); 00246 00247 movaps_m2r(bv[0], xmm4); // b0 00248 movaps_r2m(xmm4, yv[0]); // y0 00249 00250 for(int j=4; j<N4; j+=4) // , j4+=4) 00251 { 00252 prefetcht1(bv[j+4]); // use one time 00253 prefetcht1(ad0[j+4]); // use one time 00254 // load next data in Lx 00255 /* prefetcht0(al[j4]); // use one time 00256 prefetcht0(ad0[j4]); // use one time 00257 prefetcht0(au[j4]); // use one time 00258 prefetcht0(bv[j4]); // use one time 00259 */ 00260 movaps_m2r(bv[j], xmm5); // bj 00261 00262 movaps_m2r(al[j], xmm1); // al 00263 // divps_r2r(xmm0, xmm1); // al/dout 00264 rcpps_r2r(xmm0, xmm3); // dout -> 1/dout division optimisation 00265 mulps_r2r(xmm3, xmm1); // al*(1/dout) 00266 // movaps_r2m(xmm1, lout[j]); // lout NOT NEEDED anymore in results 00267 00268 movaps_m2r(ad0[j], xmm0); // ad0 00269 movaps_m2r(au[j], xmm2); // au 00270 mulps_r2r(xmm1, xmm2); // lout*au 00271 subps_r2r(xmm2, xmm0); // ad0 - lout*au 00272 movaps_r2m(xmm0, dout[j]); 00273 00274 mulps_r2r(xmm1, xmm4); // lout*y 00275 subps_r2r(xmm4, xmm5); // bj - lout*y 00276 movaps_r2r(xmm5, xmm4); 00277 movaps_r2m(xmm4, yv[j]); // yj 00278 } 00279 } 00280 00281 public: 00282 00283 F4ThomasLS(int asize):AbsF4TriDiagonalLS(asize) 00284 { 00285 uOut= new Float4Vector(N-1); 00286 dOut= new Float4Vector(N); 00287 lOut= new Float4Vector(N-1); 00288 00289 y= new Float4Vector(N); 00290 } 00291 00292 F4ThomasLS(int asize, int lines):AbsF4TriDiagonalLS(asize, lines) 00293 { 00294 uOut= new Float4Vector(N-1); 00295 dOut= new Float4Vector(N); 00296 lOut= new Float4Vector(N-1); 00297 00298 y= new Float4Vector(N); 00299 } 00300 00301 virtual ~F4ThomasLS() 00302 { 00303 delete uOut; 00304 delete dOut; 00305 delete lOut; 00306 delete y; 00307 } 00308 00309 virtual void loadA(F4TriDiagonalMatrix* m); 00310 virtual void loadMatrix(Float4Vector* u1, Float4Vector* d1, Float4Vector* l1); 00311 virtual void loadX(Float4Vector* data); 00312 virtual void loadB(Float4Vector* data); 00313 00314 00315 inline void solve() 00316 { 00317 // computeLRdecomposition(); 00318 // forwardSubstitution(); 00319 00320 computeLR_forward(); // combined optimisation 00321 backwardSubstitution(); 00322 } 00323 00324 virtual void output() 00325 { 00326 printf("F4Thomas "); 00327 AbsF4TriDiagonalLS::output(); 00328 } 00329 00330 virtual void output(FILE* file) 00331 { 00332 fprintf(file, "F4Thomas "); 00333 AbsF4TriDiagonalLS::output(file); 00334 } 00335 00336 }; 00337 00338 #endif 00339 00340