00001 #ifndef __F4ThomasLS_H__
00002 #define __F4ThomasLS_H__
00003
00004
00005
00039 #include "AbsF4TriDiagonalLS.hpp"
00040
00041 #include <math.h>
00042 #include "sseUtil.h"
00043
00044
00045 class F4ThomasLS : public AbsF4TriDiagonalLS
00046 {
00047 private:
00048
00049 Float4Vector* uOut;
00050 Float4Vector* dOut;
00051 Float4Vector* lOut;
00052
00053 Float4Vector* y;
00054
00055 protected:
00056
00057
00059
00060 {
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074 float *dout, *ad0, *al, *au, *lout;
00075
00076
00077
00078 dout= dOut->get0();
00079 ad0= A->getD0();
00080 al= A->getL();
00081 au= A->getU();
00082 lout= lOut->get();
00083
00084
00085
00086
00087
00088
00089 movaps_m2r(ad0[0], xmm0);
00090 movaps_r2m(xmm0, dout[0]);
00091
00092 for(int j=4; j<N4; j+=4)
00093 {
00094
00095
00096
00097
00098 movaps_m2r(al[j], xmm1);
00099
00100 rcpps_r2r(xmm0, xmm3);
00101 mulps_r2r(xmm3, xmm1);
00102 movaps_r2m(xmm1, lout[j]);
00103
00104 movaps_m2r(ad0[j], xmm0);
00105 movaps_m2r(au[j], xmm2);
00106 mulps_r2r(xmm2, xmm1);
00107 subps_r2r(xmm1, xmm0);
00108 movaps_r2m(xmm0, dout[j]);
00109 }
00110 }
00111
00113 inline void forwardSubstitution()
00114 {
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124 float *yv, *bv, *lout;
00125
00126
00127 yv= y->get0();
00128 bv= b->get0();
00129 lout= lOut->get();
00130
00131
00132
00133
00134 movaps_m2r(bv[0], xmm0);
00135 movaps_r2m(xmm0, yv[0]);
00136
00137 for(int j=4; j<N4; j+=4)
00138 {
00139
00140
00141
00142 movaps_m2r(bv[j], xmm1);
00143 movaps_m2r(lout[j], xmm2);
00144 mulps_r2r(xmm2, xmm0);
00145 subps_r2r(xmm0, xmm1);
00146 movaps_r2r(xmm1, xmm0);
00147 movaps_r2m(xmm0, yv[j]);
00148 }
00149 }
00150
00152 inline void backwardSubstitution()
00153 {
00154
00155
00156
00157
00158
00159
00160
00161
00162 float *xv, *yv, *dout, *au;
00163
00164
00165 xv= x->get();
00166 yv= y->get();
00167 dout= dOut->get();
00168 au= A->getU();
00169
00170 prefetcht1(au[N4-4]);
00171
00172
00173
00174
00175 movaps_m2r(yv[N4], xmm0);
00176 movaps_m2r(dout[N4], xmm1);
00177
00178 rcpps_r2r(xmm1, xmm4);
00179 mulps_r2r(xmm4, xmm0);
00180
00181 movaps_r2m(xmm0, xv[N4]);
00182
00183 for(int j=N4-4; j>0; j -=4)
00184 {
00185 prefetcht1(au[j-4]);
00186
00187
00188
00189
00190 movaps_m2r(yv[j], xmm1);
00191 movaps_m2r(au[j], xmm2);
00192 movaps_m2r(dout[j], xmm3);
00193
00194 mulps_r2r(xmm0, xmm2);
00195 subps_r2r(xmm2, xmm1);
00196
00197 rcpps_r2r(xmm3, xmm4);
00198 mulps_r2r(xmm4, xmm1);
00199
00200 movaps_r2r(xmm1, xmm0);
00201 movaps_r2m(xmm0, xv[j]);
00202 }
00203 }
00204
00205
00206
00208
00209 {
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226 float *dout, *ad0, *al, *au, *yv, *bv;
00227
00228
00229 dout= dOut->get0();
00230 ad0= A->getD0();
00231 al= A->getL();
00232 au= A->getU();
00233
00234 yv= y->get0();
00235 bv= b->get0();
00236
00237 prefetcht1(bv[4]);
00238 prefetcht1(ad0[4]);
00239
00240
00241
00242
00243
00244 movaps_m2r(ad0[0], xmm0);
00245 movaps_r2m(xmm0, dout[0]);
00246
00247 movaps_m2r(bv[0], xmm4);
00248 movaps_r2m(xmm4, yv[0]);
00249
00250 for(int j=4; j<N4; j+=4)
00251 {
00252 prefetcht1(bv[j+4]);
00253 prefetcht1(ad0[j+4]);
00254
00255
00256
00257
00258
00259
00260 movaps_m2r(bv[j], xmm5);
00261
00262 movaps_m2r(al[j], xmm1);
00263
00264 rcpps_r2r(xmm0, xmm3);
00265 mulps_r2r(xmm3, xmm1);
00266
00267
00268 movaps_m2r(ad0[j], xmm0);
00269 movaps_m2r(au[j], xmm2);
00270 mulps_r2r(xmm1, xmm2);
00271 subps_r2r(xmm2, xmm0);
00272 movaps_r2m(xmm0, dout[j]);
00273
00274 mulps_r2r(xmm1, xmm4);
00275 subps_r2r(xmm4, xmm5);
00276 movaps_r2r(xmm5, xmm4);
00277 movaps_r2m(xmm4, yv[j]);
00278 }
00279 }
00280
00281 public:
00282
00283 F4ThomasLS(int asize):AbsF4TriDiagonalLS(asize)
00284 {
00285 uOut= new Float4Vector(N-1);
00286 dOut= new Float4Vector(N);
00287 lOut= new Float4Vector(N-1);
00288
00289 y= new Float4Vector(N);
00290 }
00291
00292 F4ThomasLS(int asize, int lines):AbsF4TriDiagonalLS(asize, lines)
00293 {
00294 uOut= new Float4Vector(N-1);
00295 dOut= new Float4Vector(N);
00296 lOut= new Float4Vector(N-1);
00297
00298 y= new Float4Vector(N);
00299 }
00300
00301 virtual ~F4ThomasLS()
00302 {
00303 delete uOut;
00304 delete dOut;
00305 delete lOut;
00306 delete y;
00307 }
00308
00309 virtual void loadA(F4TriDiagonalMatrix* m);
00310 virtual void loadMatrix(Float4Vector* u1, Float4Vector* d1, Float4Vector* l1);
00311 virtual void loadX(Float4Vector* data);
00312 virtual void loadB(Float4Vector* data);
00313
00314
00315 inline void solve()
00316 {
00317
00318
00319
00320 computeLR_forward();
00321 backwardSubstitution();
00322 }
00323
00324 virtual void output()
00325 {
00326 printf("F4Thomas ");
00327 AbsF4TriDiagonalLS::output();
00328 }
00329
00330 virtual void output(FILE* file)
00331 {
00332 fprintf(file, "F4Thomas ");
00333 AbsF4TriDiagonalLS::output(file);
00334 }
00335
00336 };
00337
00338 #endif
00339
00340