44 #ifndef KLU2_SOLVE_HPP 
   45 #define KLU2_SOLVE_HPP 
   47 #include "klu2_internal.h" 
   50 template <
typename Entry, 
typename Int>
 
   54     KLU_symbolic<Entry, Int> *Symbolic,
 
   55     KLU_numeric<Entry, Int> *Numeric,
 
   64     KLU_common<Entry, Int> *Common
 
   67     Entry x [4], offik, s ;
 
   69     Entry *Offx, *X, *Bz, *Udiag ;
 
   70     Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
 
   72     Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
 
   82     if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
 
   85         Common->status = KLU_INVALID ;
 
   88     Common->status = KLU_OK ;
 
   96     nblocks = Symbolic->nblocks ;
 
  104     ASSERT (nblocks == Numeric->nblocks) ;
 
  105     Pnum = Numeric->Pnum ;
 
  106     Offp = Numeric->Offp ;
 
  107     Offi = Numeric->Offi ;
 
  108     Offx = (Entry *) Numeric->Offx ;
 
  111     Llen = Numeric->Llen ;
 
  113     Ulen = Numeric->Ulen ;
 
  114     LUbx = (Unit **) Numeric->LUbx ;
 
  115     Udiag = (Entry *) Numeric->Udiag ;
 
  118     X = (Entry *) Numeric->Xwork ;
 
  120     ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
 
  126     for (chunk = 0 ; chunk < nrhs ; chunk += 4)
 
  133         nr = MIN (nrhs - chunk, 4) ;
 
  148                     for (k = 0 ; k < n ; k++)
 
  150                         X [k] = Bz [Pnum [k]] ;
 
  156                     for (k = 0 ; k < n ; k++)
 
  160                         X [2*k + 1] = Bz  [i + d  ] ;
 
  166                     for (k = 0 ; k < n ; k++)
 
  170                         X [3*k + 1] = Bz [i + d  ] ;
 
  171                         X [3*k + 2] = Bz [i + d*2] ;
 
  177                     for (k = 0 ; k < n ; k++)
 
  181                         X [4*k + 1] = Bz [i + d  ] ;
 
  182                         X [4*k + 2] = Bz [i + d*2] ;
 
  183                         X [4*k + 3] = Bz [i + d*3] ;
 
  197                     for (k = 0 ; k < n ; k++)
 
  199                         SCALE_DIV_ASSIGN (X [k], Bz  [Pnum [k]], Rs [k]) ;
 
  205                     for (k = 0 ; k < n ; k++)
 
  209                         SCALE_DIV_ASSIGN (X [2*k], Bz [i], rs) ;
 
  210                         SCALE_DIV_ASSIGN (X [2*k + 1], Bz [i + d], rs) ;
 
  216                     for (k = 0 ; k < n ; k++)
 
  220                         SCALE_DIV_ASSIGN (X [3*k], Bz [i], rs) ;
 
  221                         SCALE_DIV_ASSIGN (X [3*k + 1], Bz [i + d], rs) ;
 
  222                         SCALE_DIV_ASSIGN (X [3*k + 2], Bz [i + d*2], rs) ;
 
  228                     for (k = 0 ; k < n ; k++)
 
  232                         SCALE_DIV_ASSIGN (X [4*k], Bz [i], rs) ;
 
  233                         SCALE_DIV_ASSIGN (X [4*k + 1], Bz [i + d], rs) ;
 
  234                         SCALE_DIV_ASSIGN (X [4*k + 2], Bz [i + d*2], rs) ;
 
  235                         SCALE_DIV_ASSIGN (X [4*k + 3], Bz [i + d*3], rs) ;
 
  245         for (block = nblocks-1 ; block >= 0 ; block--)
 
  255             PRINTF ((
"solve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
 
  265                         DIV (X [k1], X [k1], s) ;
 
  269                         DIV (X [2*k1], X [2*k1], s) ;
 
  270                         DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
 
  274                         DIV (X [3*k1], X [3*k1], s) ;
 
  275                         DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
 
  276                         DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
 
  280                         DIV (X [4*k1], X [4*k1], s) ;
 
  281                         DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
 
  282                         DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
 
  283                         DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
 
  290                 KLU_lsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
 
  292                 KLU_usolve (nk, Uip + k1, Ulen + k1, LUbx [block],
 
  293                         Udiag + k1, nr, X + nr*k1) ;
 
  307                         for (k = k1 ; k < k2 ; k++)
 
  311                             for (p = Offp [k] ; p < pend ; p++)
 
  313                                 MULT_SUB (X [Offi [p]], Offx [p], x [0]) ;
 
  320                         for (k = k1 ; k < k2 ; k++)
 
  324                             x [1] = X [2*k + 1] ;
 
  325                             for (p = Offp [k] ; p < pend ; p++)
 
  329                                 MULT_SUB (X [2*i], offik, x [0]) ;
 
  330                                 MULT_SUB (X [2*i + 1], offik, x [1]) ;
 
  337                         for (k = k1 ; k < k2 ; k++)
 
  341                             x [1] = X [3*k + 1] ;
 
  342                             x [2] = X [3*k + 2] ;
 
  343                             for (p = Offp [k] ; p < pend ; p++)
 
  347                                 MULT_SUB (X [3*i], offik, x [0]) ;
 
  348                                 MULT_SUB (X [3*i + 1], offik, x [1]) ;
 
  349                                 MULT_SUB (X [3*i + 2], offik, x [2]) ;
 
  356                         for (k = k1 ; k < k2 ; k++)
 
  360                             x [1] = X [4*k + 1] ;
 
  361                             x [2] = X [4*k + 2] ;
 
  362                             x [3] = X [4*k + 3] ;
 
  363                             for (p = Offp [k] ; p < pend ; p++)
 
  367                                 MULT_SUB (X [4*i], offik, x [0]) ;
 
  368                                 MULT_SUB (X [4*i + 1], offik, x [1]) ;
 
  369                                 MULT_SUB (X [4*i + 2], offik, x [2]) ;
 
  370                                 MULT_SUB (X [4*i + 3], offik, x [3]) ;
 
  387                 for (k = 0 ; k < n ; k++)
 
  395                 for (k = 0 ; k < n ; k++)
 
  399                     Bz  [i + d  ] = X [2*k + 1] ;
 
  405                 for (k = 0 ; k < n ; k++)
 
  409                     Bz  [i + d  ] = X [3*k + 1] ;
 
  410                     Bz  [i + d*2] = X [3*k + 2] ;
 
  416                 for (k = 0 ; k < n ; k++)
 
  420                     Bz  [i + d  ] = X [4*k + 1] ;
 
  421                     Bz  [i + d*2] = X [4*k + 2] ;
 
  422                     Bz  [i + d*3] = X [4*k + 3] ;
 
  438 template <
typename Entry, 
typename Int>
 
  442     KLU_symbolic<Entry, Int> *Symbolic,
 
  443     KLU_numeric<Entry, Int> *Numeric,
 
  454     KLU_common<Entry, Int> *Common
 
  457     Entry x [4], offik, s ;
 
  459     Entry *Offx, *X, *Bz, *Udiag ;
 
  460     Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
 
  462     Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
 
  472     if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
 
  473         B == NULL || Xout == NULL)
 
  475         Common->status = KLU_INVALID ;
 
  478     Common->status = KLU_OK ;
 
  486     nblocks = Symbolic->nblocks ;
 
  494     ASSERT (nblocks == Numeric->nblocks) ;
 
  495     Pnum = Numeric->Pnum ;
 
  496     Offp = Numeric->Offp ;
 
  497     Offi = Numeric->Offi ;
 
  498     Offx = (Entry *) Numeric->Offx ;
 
  501     Llen = Numeric->Llen ;
 
  503     Ulen = Numeric->Ulen ;
 
  504     LUbx = (Unit **) Numeric->LUbx ;
 
  505     Udiag = (Entry *) Numeric->Udiag ;
 
  508     X = (Entry *) Numeric->Xwork ;
 
  510     ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
 
  516     for (chunk = 0 ; chunk < nrhs ; chunk += 4)
 
  523         nr = MIN (nrhs - chunk, 4) ;
 
  538                     for (k = 0 ; k < n ; k++)
 
  540                         X [k] = Bz [Pnum [k]] ;
 
  546                     for (k = 0 ; k < n ; k++)
 
  550                         X [2*k + 1] = Bz  [i + d  ] ;
 
  556                     for (k = 0 ; k < n ; k++)
 
  560                         X [3*k + 1] = Bz [i + d  ] ;
 
  561                         X [3*k + 2] = Bz [i + d*2] ;
 
  567                     for (k = 0 ; k < n ; k++)
 
  571                         X [4*k + 1] = Bz [i + d  ] ;
 
  572                         X [4*k + 2] = Bz [i + d*2] ;
 
  573                         X [4*k + 3] = Bz [i + d*3] ;
 
  587                     for (k = 0 ; k < n ; k++)
 
  589                         SCALE_DIV_ASSIGN (X [k], Bz  [Pnum [k]], Rs [k]) ;
 
  595                     for (k = 0 ; k < n ; k++)
 
  599                         SCALE_DIV_ASSIGN (X [2*k], Bz [i], rs) ;
 
  600                         SCALE_DIV_ASSIGN (X [2*k + 1], Bz [i + d], rs) ;
 
  606                     for (k = 0 ; k < n ; k++)
 
  610                         SCALE_DIV_ASSIGN (X [3*k], Bz [i], rs) ;
 
  611                         SCALE_DIV_ASSIGN (X [3*k + 1], Bz [i + d], rs) ;
 
  612                         SCALE_DIV_ASSIGN (X [3*k + 2], Bz [i + d*2], rs) ;
 
  618                     for (k = 0 ; k < n ; k++)
 
  622                         SCALE_DIV_ASSIGN (X [4*k], Bz [i], rs) ;
 
  623                         SCALE_DIV_ASSIGN (X [4*k + 1], Bz [i + d], rs) ;
 
  624                         SCALE_DIV_ASSIGN (X [4*k + 2], Bz [i + d*2], rs) ;
 
  625                         SCALE_DIV_ASSIGN (X [4*k + 3], Bz [i + d*3], rs) ;
 
  635         for (block = nblocks-1 ; block >= 0 ; block--)
 
  645             PRINTF ((
"solve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
 
  655                         DIV (X [k1], X [k1], s) ;
 
  659                         DIV (X [2*k1], X [2*k1], s) ;
 
  660                         DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
 
  664                         DIV (X [3*k1], X [3*k1], s) ;
 
  665                         DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
 
  666                         DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
 
  670                         DIV (X [4*k1], X [4*k1], s) ;
 
  671                         DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
 
  672                         DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
 
  673                         DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
 
  680                 KLU_lsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
 
  682                 KLU_usolve (nk, Uip + k1, Ulen + k1, LUbx [block],
 
  683                         Udiag + k1, nr, X + nr*k1) ;
 
  697                         for (k = k1 ; k < k2 ; k++)
 
  701                             for (p = Offp [k] ; p < pend ; p++)
 
  703                                 MULT_SUB (X [Offi [p]], Offx [p], x [0]) ;
 
  710                         for (k = k1 ; k < k2 ; k++)
 
  714                             x [1] = X [2*k + 1] ;
 
  715                             for (p = Offp [k] ; p < pend ; p++)
 
  719                                 MULT_SUB (X [2*i], offik, x [0]) ;
 
  720                                 MULT_SUB (X [2*i + 1], offik, x [1]) ;
 
  727                         for (k = k1 ; k < k2 ; k++)
 
  731                             x [1] = X [3*k + 1] ;
 
  732                             x [2] = X [3*k + 2] ;
 
  733                             for (p = Offp [k] ; p < pend ; p++)
 
  737                                 MULT_SUB (X [3*i], offik, x [0]) ;
 
  738                                 MULT_SUB (X [3*i + 1], offik, x [1]) ;
 
  739                                 MULT_SUB (X [3*i + 2], offik, x [2]) ;
 
  746                         for (k = k1 ; k < k2 ; k++)
 
  750                             x [1] = X [4*k + 1] ;
 
  751                             x [2] = X [4*k + 2] ;
 
  752                             x [3] = X [4*k + 3] ;
 
  753                             for (p = Offp [k] ; p < pend ; p++)
 
  757                                 MULT_SUB (X [4*i], offik, x [0]) ;
 
  758                                 MULT_SUB (X [4*i + 1], offik, x [1]) ;
 
  759                                 MULT_SUB (X [4*i + 2], offik, x [2]) ;
 
  760                                 MULT_SUB (X [4*i + 3], offik, x [3]) ;
 
  778                 for (k = 0 ; k < n ; k++)
 
  780                     Xout  [Q [k]] = X [k] ;
 
  786                 for (k = 0 ; k < n ; k++)
 
  789                     Xout  [i      ] = X [2*k    ] ;
 
  790                     Xout  [i + d  ] = X [2*k + 1] ;
 
  796                 for (k = 0 ; k < n ; k++)
 
  799                     Xout  [i      ] = X [3*k    ] ;
 
  800                     Xout  [i + d  ] = X [3*k + 1] ;
 
  801                     Xout  [i + d*2] = X [3*k + 2] ;
 
  807                 for (k = 0 ; k < n ; k++)
 
  810                     Xout  [i      ] = X [4*k    ] ;
 
  811                     Xout  [i + d  ] = X [4*k + 1] ;
 
  812                     Xout  [i + d*2] = X [4*k + 2] ;
 
  813                     Xout  [i + d*3] = X [4*k + 3] ;