#ifndef DISTNTF_DISTAUNTF_HPP_
#define DISTNTF_DISTAUNTF_HPP_

#define TENSOR_LOCAL_DIM (m_input_tensor.dimensions())
#define TENSOR_LOCAL_NUMEL (m_input_tensor.numel())

MAT *ncp_local_mttkrp_t;
virtual MAT update(int current_mode) = 0;

const Tensor &m_input_tensor;
MAT factor_local_grams;
MAT *factor_global_grams;
const unsigned int m_low_rank_k;
const unsigned int m_modes;
const UVEC m_global_dims;
const UVEC m_factor_local_dims;
unsigned int m_num_it;
unsigned int current_mode;
bool m_enable_dim_tree;
unsigned int m_current_it;
std::vector<bool> m_stale_mttkrp;
double m_global_sqnorm_A;
MAT hadamard_all_grams;
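// Computes the global Gram matrix of the current mode's factor: the local
// k x k product H^T * H is summed across processes with an all-reduce, and
// the mode's L2/L1 regularizers are then folded in via applyReg().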
void update_global_gram(const int current_mode) {
  MAT H = m_local_ncp_factors.factor(current_mode);
  factor_local_grams = H.t() * H;
  factor_global_grams[current_mode].zeros();
  MPI_Allreduce(factor_local_grams.memptr(),
                factor_global_grams[current_mode].memptr(),
                this->m_low_rank_k * this->m_low_rank_k, MPI_DOUBLE, MPI_SUM,
                /* communicator elided */);
  applyReg(this->m_regularizers(current_mode * 2),
           this->m_regularizers(current_mode * 2 + 1),
           &(factor_global_grams[current_mode]));
}
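// Applies L1 and L2 regularization for a mode to the k x k Gram matrix:
// adds 2 * lambda_l2 * I and 2 * lambda_l1 * (all-ones matrix) to AtA.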
void applyReg(float lambda_l2, float lambda_l1, MAT *AtA) {
  MAT identity = arma::eye<MAT>(this->m_low_rank_k, this->m_low_rank_k);
  (*AtA) = (*AtA) + 2 * lambda_l2 * identity;
  MAT onematrix = arma::ones<MAT>(this->m_low_rank_k, this->m_low_rank_k);
  (*AtA) = (*AtA) + 2 * lambda_l1 * onematrix;
}
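// Accumulates into global_gram the element-wise (Hadamard) product of the
// global Gram matrices of every mode except the one currently being updated.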
void gram_hadamard(unsigned int current_mode) {
  for (unsigned int i = 0; i < m_modes; i++) {
    if (i != current_mode) {
      global_gram %= factor_global_grams[i];
    }
  }
}
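// Gathers the distributed rows of the current mode's (transposed) factor so
// that every process in the mode's slice holds the full factor; counts and
// displacements are derived from itersplit()/startidx(). The gathered
// transpose is then stored back untransposed in m_gathered_ncp_factors.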
void gather_ncp_factor(const int current_mode) {
  m_gathered_ncp_factors_t.factor(current_mode).zeros();
  // ...
  MPI_Comm current_slice_comm = this->m_mpicomm.slice(current_mode);
  int slice_size;  // declaration elided in the listing
  MPI_Comm_size(current_slice_comm, &slice_size);
  int sendcnt = m_nls_sizes[current_mode] * m_low_rank_k;
  std::vector<int> recvgathercnt(slice_size, 0);
  std::vector<int> recvgatherdispl(slice_size, 0);
  int dimsize = m_factor_local_dims[current_mode];
  for (int i = 0; i < slice_size; i++) {
    recvgathercnt[i] = itersplit(dimsize, slice_size, i) * m_low_rank_k;
    recvgatherdispl[i] = startidx(dimsize, slice_size, i) * m_low_rank_k;
  }
#ifdef DISTNTF_VERBOSE
  MPI_Comm current_fiber_comm = this->m_mpicomm.fiber(current_mode);
  int fiber_size;  // declaration elided in the listing
  MPI_Comm_size(current_fiber_comm, &fiber_size);
  MPI_Comm_size(current_slice_comm, &slice_size);
  // ... (opening of the following log statement elided)
      << current_mode << "::fiber comm size::" << fiber_size
      << "::my_global_rank::" << MPI_RANK
      << "::my_slice_rank::" /* ... */ << "::my_fiber_rank::" /* ... */
      << "::sendcnt::" << sendcnt << "::gathered factor size::"
      << m_gathered_ncp_factors_t.factor(current_mode).n_elem);
#endif  // DISTNTF_VERBOSE
  MPI_Allgatherv(m_local_ncp_factors_t.factor(current_mode).memptr(), sendcnt,
                 /* send type elided */
                 m_gathered_ncp_factors_t.factor(current_mode).memptr(),
                 &recvgathercnt[0], &recvgatherdispl[0], MPI_DOUBLE,
                 /* communicator elided */);
#ifdef DISTNTF_VERBOSE
  // ... (opening of the following log statement elided)
      << m_local_ncp_factors_t.factor(current_mode) << std::endl
      << " gathered factor::" << std::endl
      << m_gathered_ncp_factors_t.factor(current_mode));
#endif  // DISTNTF_VERBOSE
  m_gathered_ncp_factors.set(
      current_mode, m_gathered_ncp_factors_t.factor(current_mode).t());
}
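// Distributed MTTKRP for one mode: the Khatri-Rao product (or a dimension
// tree, when enabled) produces the local MTTKRP, which is then
// reduce-scattered across the mode's slice so each process keeps only the
// rows it owns. Marks the mode's MTTKRP as fresh on completion.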
void distmttkrp(const int &current_mode) {
  if (!this->m_enable_dim_tree) {
    // ... (KRP computation elided; its output lands in the workspace below)
        &ncp_krp[current_mode]);
  }
  if (this->m_enable_dim_tree) {
    double multittv_time = 0;
    double mttkrp_time = 0;
    // ... (dimension-tree MTTKRP call partially elided)
        ncp_mttkrp_t[current_mode].memptr(), false,
        multittv_time, mttkrp_time);
  }
  // ...
  m_input_tensor.mttkrp(current_mode, ncp_krp[current_mode],
                        &ncp_mttkrp_t[current_mode]);
  // ...
  MPI_Comm current_slice_comm = this->m_mpicomm.slice(current_mode);
  int slice_size, slice_rank;  // declarations elided in the listing
  MPI_Comm_size(current_slice_comm, &slice_size);
  slice_rank = this->m_mpicomm.slice_rank(current_mode);
  std::vector<int> recvmttkrpsize(slice_size);
  int dimsize = m_factor_local_dims[current_mode];
  for (int i = 0; i < slice_size; i++) {
    recvmttkrpsize[i] = itersplit(dimsize, slice_size, i) * m_low_rank_k;
  }
#ifdef DISTNTF_VERBOSE
  MPI_Comm current_fiber_comm = this->m_mpicomm.fiber(current_mode);
  int fiber_size;  // declaration elided in the listing
  MPI_Comm_size(current_fiber_comm, &fiber_size);
  // ... (opening of the following log statement elided)
      << current_mode << "::slice comm size::" << slice_size
      << "::fiber comm size::" << fiber_size
      << "::my_global_rank::" << MPI_RANK
      << "::my_slice_rank::" /* ... */ << "::my_fiber_rank::" /* ... */
      << "::mttkrp_size::" << ncp_mttkrp_t[current_mode].n_elem
      << "::local_mttkrp_size::" << ncp_local_mttkrp_t[current_mode].n_elem);
#endif  // DISTNTF_VERBOSE
  ncp_local_mttkrp_t[current_mode].zeros();
  MPI_Reduce_scatter(ncp_mttkrp_t[current_mode].memptr(),
                     ncp_local_mttkrp_t[current_mode].memptr(),
                     &recvmttkrpsize[0], MPI_DOUBLE, MPI_SUM,
                     /* communicator elided */);
#ifdef DISTNTF_VERBOSE
  // ...
#endif  // DISTNTF_VERBOSE
  this->m_stale_mttkrp[current_mode] = false;
}
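// Allocates the per-mode workspaces: KRP buffers (only when the dimension
// tree is disabled), local and distributed MTTKRP outputs, and the per-mode
// global Gram matrices.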
void allocateMatrices() {
  // ...
  if (!m_enable_dim_tree) {
    ncp_krp = new MAT[m_modes];
  }
  ncp_mttkrp_t = new MAT[m_modes];
  ncp_local_mttkrp_t = new MAT[m_modes];
  factor_global_grams = new MAT[m_modes];
  factor_local_grams.zeros(this->m_low_rank_k, this->m_low_rank_k);
  global_gram.ones(this->m_low_rank_k, this->m_low_rank_k);
  UWORD current_size = 0;
  for (unsigned int i = 0; i < m_modes; i++) {
    // ... (per-mode current_size computation elided)
    if (!m_enable_dim_tree) {
      ncp_krp[i] = arma::zeros(current_size, this->m_low_rank_k);
    }
    // ... (allocation of ncp_mttkrp_t[i] elided)
    ncp_local_mttkrp_t[i] = arma::zeros(m_local_ncp_factors.factor(i).n_cols,
                                        m_local_ncp_factors.factor(i).n_rows);
    factor_global_grams[i] =
        arma::zeros(this->m_low_rank_k, this->m_low_rank_k);
  }
}
void freeMatrices() {
  for (unsigned int i = 0; i < m_modes; i++) {
    if (!m_enable_dim_tree) {
      ncp_krp[i].clear();  // assumed from the matching allocation above
    }
    ncp_mttkrp_t[i].clear();
    ncp_local_mttkrp_t[i].clear();
    factor_global_grams[i].clear();
  }
  if (!m_enable_dim_tree) {
    delete[] ncp_krp;  // assumed from the matching allocation above
  }
  delete[] ncp_mttkrp_t;
  delete[] ncp_local_mttkrp_t;
  delete[] factor_global_grams;
}
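// Reduces a locally measured time (labelled by reportstring) across all
// ranks and prints the root's value along with the min, average, and max.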
void reportTime(const double temp, const std::string &reportstring) {
  double mintemp, maxtemp, sumtemp;
  MPI_Allreduce(&temp, &maxtemp, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  MPI_Allreduce(&temp, &mintemp, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
  MPI_Allreduce(&temp, &sumtemp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  // ... (opening of the following log statement elided)
      << "::k::" << this->m_low_rank_k << "::SIZE::" << MPI_SIZE
      << "::algo::" << this->m_updalgo << "::root::" << temp
      << "::min::" << mintemp << "::avg::" << (sumtemp) / (MPI_SIZE)
      << "::max::" << maxtemp);
}
void update_factor_mode(const unsigned int current_mode, const MAT &factor) {
  m_local_ncp_factors.set(current_mode, factor);
  m_local_ncp_factors.distributed_normalize(current_mode);
  MAT factor_t = m_local_ncp_factors.factor(current_mode).t();
  m_local_ncp_factors_t.set(current_mode, factor_t);
  // ...
  update_global_gram(current_mode);
  gather_ncp_factor(current_mode);
  if (this->m_enable_dim_tree) {
    // ... (dimension-tree bookkeeping elided)
  }
  for (unsigned int mode = 0; mode < this->m_modes; mode++) {
    if (mode != current_mode) this->m_stale_mttkrp[mode] = true;
  }
}
virtual void accelerate() {}
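// Aggregates and prints the per-phase timers (gram, KRP, NNLS, reduce-scatter,
// and, when error computation is enabled, the error timers) via reportTime().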
void generateReport() {
  MPI_Barrier(MPI_COMM_WORLD);
  this->reportTime(this->time_stats.duration(), "total_d");
  // ... (a few timer reports elided; the accessors on the partially elided
  //      calls below are inferred from the surviving labels)
  this->reportTime(this->time_stats.reducescatter_duration(),
                   "total_reducescatter");
  this->reportTime(this->time_stats.gram_duration(), "total_gram");
  this->reportTime(this->time_stats.krp_duration(), "total_krp");
  // ...
  this->reportTime(this->time_stats.nnls_duration(), "total_nnls");
  if (this->m_compute_error) {
    this->reportTime(this->time_stats.err_compute_duration(),
                     "total_err_compute");
    this->reportTime(this->time_stats.err_communication_duration(),
                     "total_err_communication");
  }
}
DistAUNTF(const Tensor &i_tensor, const int i_k, algotype i_algo,
          const UVEC &i_global_dims, const UVEC &i_local_dims,
          const UVEC &i_nls_sizes, const UVEC &i_nls_idxs,
          const NTFMPICommunicator &i_mpicomm)
    : m_mpicomm(i_mpicomm),
      m_nls_sizes(i_nls_sizes),
      m_nls_idxs(i_nls_idxs),
      m_local_ncp_factors(i_nls_sizes, i_k, false),
      m_local_ncp_factors_t(i_nls_sizes, i_k, true),
      m_input_tensor(i_tensor),
      m_gathered_ncp_factors(i_tensor.dimensions(), i_k, false),
      m_gathered_ncp_factors_t(i_tensor.dimensions(), i_k, true),
      // ... (a few initializers elided)
      m_modes(m_input_tensor.modes()),
      // ...
      m_global_dims(i_global_dims),
      m_factor_local_dims(i_local_dims),
      time_stats(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) {
  this->m_compute_error = false;
  this->m_enable_dim_tree = false;
  this->m_accelerated = false;
  this->m_rel_error = 1.0;
  // ...
  m_local_ncp_factors.randu(149 * i_mpicomm.rank() + 103);
  m_local_ncp_factors.distributed_normalize();
  for (unsigned int i = 0; i < this->m_modes; i++) {
    MAT current_factor = arma::trans(m_local_ncp_factors.factor(i));
    m_local_ncp_factors_t.set(i, current_factor);
    this->m_stale_mttkrp.push_back(true);
  }
  m_gathered_ncp_factors.trans(m_gathered_ncp_factors_t);
  // ...
  double normA = i_tensor.norm();
  MPI_Allreduce(&normA, &this->m_global_sqnorm_A, 1, MPI_DOUBLE, MPI_SUM,
                /* communicator elided */);
  // ... (opening of the following log statement elided)
      << m_nls_sizes << "::NLS start indices::" << m_nls_idxs);
  if (this->m_enable_dim_tree) {
    // ...
  }
  // ... (end of the constructor and intervening members elided)
size_t modes() const { return this->m_modes; }
size_t rank() const { return this->m_low_rank_k; }
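// Sets whether to compute the relative error after each outer iteration.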
void compute_error(bool i_error) {
  this->m_compute_error = i_error;
  // ... (target of the following k x k allocation elided)
      arma::ones<MAT>(this->m_low_rank_k, this->m_low_rank_k);
  // ...
}
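// MTTKRP can be computed with or without dimension trees.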
void dim_tree(bool i_dim_tree) {
  this->m_enable_dim_tree = i_dim_tree;
  if (this->m_enable_dim_tree) {
    if (this->ncp_krp != NULL) {
      for (unsigned int i = 0; i < m_modes; i++) {
        // ...
      }
      // ...
    }
    // ...
  }
}
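// Enables the acceleration step; note that this also turns on error
// computation.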
void accelerated(const bool &set_acceleration) {
  this->m_accelerated = set_acceleration;
  this->m_compute_error = true;
}
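// Whether the cached MTTKRP for the given mode is out of date.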
bool is_stale_mttkrp(const int &current_mode) const {
  return this->m_stale_mttkrp[current_mode];
}
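// Fragments of reset(const NCPFactors &new_factors, bool trans = false),
// which completely resets all the factors and the state of the AUNTF; the
// branch on `trans` that selects between the two loops below is elided here.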
  for (unsigned int i = 0; i < m_modes; i++) {
    update_factor_mode(i, new_factors.factor(i));
  }
  // ...
  for (unsigned int i = 0; i < m_modes; i++) {
    update_factor_mode(i, new_factors.factor(i).t());
  }
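// Returns the factor matrix of a mode by collecting it across all the
// processors into factor_matrix (gathered along the mode's fiber, root 0).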
void factor(int mode, double *factor_matrix) {
  gather_ncp_factor(mode);
  int sendcnt = m_gathered_ncp_factors_t.factor(mode).n_elem;
  int fiber_size = this->m_mpicomm.proc_grids()[mode];
  int global_size = this->m_global_dims[mode];
  std::vector<int> recvcnts(fiber_size, 0);
  std::vector<int> displs(fiber_size, 0);
  DISTPRINTINFO("Collecting mode::" << mode << "::sendcnt::" << sendcnt
                << "::fiber_size::" << fiber_size
                << "::global_size::" << global_size);
  // ...
  for (int i = 0; i < fiber_size; i++) {
    recvcnts[i] = itersplit(global_size, fiber_size, i) * m_low_rank_k;
    displs[i] = startidx(global_size, fiber_size, i) * m_low_rank_k;
  }
  // ...
  MPI_Gatherv(m_gathered_ncp_factors_t.factor(mode).memptr(), sendcnt,
              MPI_DOUBLE, factor_matrix, &recvcnts[0], &displs[0], MPI_DOUBLE,
              0, this->m_mpicomm.fiber(mode));
}
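// The main computeNTF() loop (reassembled from the listing): grams and
// gathered factors are primed for modes 1..m_modes-1, the dimension-tree
// split mode is chosen when trees are enabled, and then each outer iteration
// cycles over the modes, rebuilding the Hadamard of Grams, solving the mode's
// update, and installing the new factor.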
void computeNTF() {
  // ...
  for (unsigned int i = 1; i < m_modes; i++) {
    update_global_gram(i);
    gather_ncp_factor(i);
  }
  if (this->m_enable_dim_tree) {
    // ... (declarations of split_mode and temp_cum_prod elided)
    size_t split_criteria = arma::prod(m_input_tensor.dimensions());
    split_criteria = std::round(std::sqrt(split_criteria));
    while (temp_cum_prod(split_mode) < split_criteria) {
      // ...
    }
    PRINTROOT("KDT Split Mode::" << split_mode
              << "::split criteria::" << split_criteria
              << "::cum prod::" << std::endl
              << temp_cum_prod << std::endl);
    // ...
    if (split_mode > 0) {
      size_t current_left = temp_cum_prod(split_mode);
      size_t good_criteria = temp_cum_prod(temp_cum_prod.n_rows - 1) /
                             temp_cum_prod(split_mode - 1);
      if (current_left > good_criteria) split_mode--;
      // ... (opening of the following log statement elided)
          << split_mode << "::split criteria::" << split_criteria
          << "::numerator::" << temp_cum_prod(temp_cum_prod.n_rows - 1)
          << "::good_criteria::" << good_criteria
          << "::current_left::" << current_left << std::endl
          << "::cum prod::" << std::endl
          << temp_cum_prod << std::endl);
    }
    // ... (dimension-tree construction elided)
  }
#ifdef DISTNTF_VERBOSE
  // ... (labels for the following dumps elided)
  this->m_local_ncp_factors.print();
  this->m_local_ncp_factors_t.print();
  this->m_gathered_ncp_factors.print();
#endif  // DISTNTF_VERBOSE
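  // Outer loop over iterations; inner loop over modes. The unnormalized
  // factor of the last mode in an iteration is kept so computeError() can
  // reuse that mode's local MTTKRP.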
  for (this->m_current_it = 0; this->m_current_it < m_num_it;
       this->m_current_it++) {
    // ...
    for (unsigned int current_mode = 0; current_mode < m_modes;
         current_mode++) {
      // ...
      gram_hadamard(current_mode);
      // ...
#ifdef DISTNTF_VERBOSE
      // ... (opening of the following log statement elided)
          << this->m_local_ncp_factors.factor(current_mode));
      this->m_gathered_ncp_factors.print();
      PRINTROOT("global_grams::" << std::endl << this->global_gram);
      this->ncp_local_mttkrp_t[current_mode].print();
#endif  // DISTNTF_VERBOSE
      // ... (the update for this mode, producing `factor`, is elided)
#ifdef DISTNTF_VERBOSE
      DISTPRINTINFO("it::" << this->m_current_it << "::mode::" << current_mode
                    /* ... */);
#endif  // DISTNTF_VERBOSE
      if (m_compute_error && current_mode == this->m_modes - 1) {
        // ... (unnorm_factor captured here; elided)
      }
      update_factor_mode(current_mode, factor.t());
    }
    if (m_compute_error) {
      double temp_err = computeError(unnorm_factor, this->m_modes - 1);
      this->m_rel_error = temp_err;
      // ...
      PRINTROOT("Iter::" << this->m_current_it << "::k::"
                << this->m_low_rank_k << "::SIZE::" << MPI_SIZE
                << "::algo::" << this->m_updalgo << "::time::" << iter_time
                << "::relative_error::" << temp_err);
    }
    if (this->m_accelerated) {
      // ...
    }
    PRINTROOT("completed it::" << this->m_current_it);
  }
}
double computeError(const MAT &unnorm_factor, int mode) {
  // ...
  hadamard_all_grams = global_gram % factor_global_grams[mode];
  VEC local_lambda = m_local_ncp_factors.lambda();
  ROWVEC temp_vec = local_lambda.t() * hadamard_all_grams;
  double sq_norm_model = arma::dot(temp_vec, local_lambda);
  // ...
  double inner_product = arma::dot(ncp_local_mttkrp_t[mode], unnorm_factor);
  // ...
  double all_inner_product;
  MPI_Allreduce(&inner_product, &all_inner_product, 1, MPI_DOUBLE, MPI_SUM,
                /* communicator elided */);
  // ...
#ifdef DISTNTF_VERBOSE
  DISTPRINTINFO("local_inner_product::" << inner_product << std::endl);
  // ... (opening of the following log statement elided)
      << this->m_global_sqnorm_A << "::model_norm_sq::" << sq_norm_model
      << "::global_inner_product::" << all_inner_product << std::endl);
#endif  // DISTNTF_VERBOSE
  double squared_err =
      this->m_global_sqnorm_A + sq_norm_model - 2 * all_inner_product;
  if (squared_err < 0) {
    PRINTROOT("computed error is negative due to round off");
    // ... (opening of the following log statement elided)
        << this->m_global_sqnorm_A
        << "::model_norm_sq::" << sq_norm_model
        << "::global_inner_product::" << all_inner_product
        << "::squared_err::" << squared_err << std::endl);
  }
  return std::sqrt(std::abs(squared_err) / this->m_global_sqnorm_A);
}
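// Overload used when the error is requested for an externally supplied set
// of (transposed) factors: the solver state is reset to those factors first
// and the unnormalized factor is rebuilt from the column norms (lambda).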
double computeError(const NCPFactors &new_factors_t, const int mode) {
  // ...
  reset(new_factors_t, true);
  // ...
  hadamard_all_grams = global_gram % factor_global_grams[mode];
  VEC local_lambda = m_local_ncp_factors.lambda();
  MAT unnorm_factor =  // lhs inferred; partially elided in the listing
      arma::diagmat(local_lambda) * new_factors_t.factor(mode);
  ROWVEC temp_vec = local_lambda.t() * hadamard_all_grams;
  double sq_norm_model = arma::dot(temp_vec, local_lambda);
  // ...
  double inner_product = arma::dot(ncp_local_mttkrp_t[mode], unnorm_factor);
  double all_inner_product;
  MPI_Allreduce(&inner_product, &all_inner_product, 1, MPI_DOUBLE, MPI_SUM,
                /* communicator elided */);
  double squared_err =
      this->m_global_sqnorm_A + sq_norm_model - 2 * all_inner_product;
  if (squared_err < 0) {
    PRINTROOT("computed error is negative due to round off");
    // ... (opening of the following log statement elided)
        << this->m_global_sqnorm_A
        << "::model_norm_sq::" << sq_norm_model
        << "::global_inner_product::" << all_inner_product
        << "::squared_err::" << squared_err << std::endl);
  }
  return std::sqrt(std::abs(squared_err) / this->m_global_sqnorm_A);
}
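// A minimal usage sketch (not part of this header): a driver would construct
// a concrete DistAUNTF-derived solver and run the main loop roughly as below.
// The `DistAUNTFWorker` name and the argument variables are hypothetical;
// only the member functions shown are taken from this class.
//
//   DistAUNTFWorker ntf(tensor, k, algo, global_dims, local_dims,
//                       nls_sizes, nls_idxs, mpicomm);
//   ntf.num_iterations(30);
//   ntf.regularizers(regs);   // two entries per mode (L2, then L1)
//   ntf.compute_error(true);
//   ntf.dim_tree(true);
//   ntf.computeNTF();
//   ntf.generateReport();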
#endif  // DISTNTF_DISTAUNTF_HPP_