planc
Parallel Lowrank Approximation with Non-negativity Constraints
distnmf.cpp
1 /* Copyright 2016 Ramakrishnan Kannan */
2 
3 #include <string>
4 #include "common/distutils.hpp"
5 #include "common/parsecommandline.hpp"
6 #include "common/utils.hpp"
7 #include "distnmf/distals.hpp"
8 #include "distnmf/distanlsbpp.hpp"
9 #include "distnmf/distaoadmm.hpp"
10 #include "distnmf/disthals.hpp"
11 #include "distnmf/distio.hpp"
12 #include "distnmf/distmu.hpp"
13 #include "distnmf/mpicomm.hpp"
14 #include "distnmf/naiveanlsbpp.hpp"
15 #ifdef BUILD_CUDA
16 #include <cuda.h>
17 #include <cuda_runtime.h>
18 #endif
19 
20 namespace planc {
21 
23  private:
24  int m_argc;
25  char **m_argv;
26  int m_k;
27  UWORD m_globalm, m_globaln;
28  std::string m_Afile_name;
29  std::string m_outputfile_name;
30  int m_num_it;
31  int m_pr;
32  int m_pc;
33  FVEC m_regW;
34  FVEC m_regH;
35  algotype m_nmfalgo;
36  double m_sparsity;
37  iodistributions m_distio;
38  uint m_compute_error;
39  int m_num_k_blocks;
40  static const int kprimeoffset = 17;  // offset added to the MPI rank to derive a per-rank RNG seed (see callDistNMF1D)
41  normtype m_input_normalization;
42 
43 #ifdef BUILD_CUDA
44  void printDevProp(cudaDeviceProp devProp) {
45  printf("Major revision number: %d\n", devProp.major);
46  printf("Minor revision number: %d\n", devProp.minor);
47  printf("Name: %s\n", devProp.name);
48  printf("Total global memory: %zu\n", devProp.totalGlobalMem);
49  printf("Total shared memory per block: %zu\n", devProp.sharedMemPerBlock);
50  printf("Total registers per block: %d\n", devProp.regsPerBlock);
51  printf("Warp size: %d\n", devProp.warpSize);
52  printf("Maximum memory pitch: %zu\n", devProp.memPitch);
53  printf("Maximum threads per block: %d\n", devProp.maxThreadsPerBlock);
54  for (int i = 0; i < 3; ++i)
55  printf("Maximum dimension %d of block: %d\n", i,
56  devProp.maxThreadsDim[i]);
57  for (int i = 0; i < 3; ++i)
58  printf("Maximum dimension %d of grid: %d\n", i, devProp.maxGridSize[i]);
59  printf("Clock rate: %d\n", devProp.clockRate);
60  printf("Total constant memory: %zu\n", devProp.totalConstMem);
61  printf("Texture alignment: %zu\n", devProp.textureAlignment);
62  printf("Concurrent copy and execution: %s\n",
63  (devProp.deviceOverlap ? "Yes" : "No"));
64  printf("Number of multiprocessors: %d\n", devProp.multiProcessorCount);
65  printf("Kernel execution timeout: %s\n",
66  (devProp.kernelExecTimeoutEnabled ? "Yes" : "No"));
67  return;
68  }
69  void gpuQuery() {
70  int devCount;
71  cudaGetDeviceCount(&devCount);
72  printf("CUDA Device Query...\n");
73  printf("There are %d CUDA devices.\n", devCount);
74  // Iterate through devices
75  for (int i = 0; i < devCount; ++i) {
76  // Get device properties
77  printf("\nCUDA Device #%d\n", i);
78  cudaDeviceProp devProp;
79  cudaGetDeviceProperties(&devProp, i);
80  printDevProp(devProp);
81  }
82  }
83 #endif
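  // Note: gpuQuery() above is only referenced from the commented-out block at
  // the top of callDistNMF2D() below.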
84 
85  void printConfig() {
86  cout << "a::" << this->m_nmfalgo << "::i::" << this->m_Afile_name
87  << "::k::" << this->m_k << "::m::" << this->m_globalm
88  << "::n::" << this->m_globaln << "::t::" << this->m_num_it
89  << "::pr::" << this->m_pr << "::pc::" << this->m_pc
90  << "::error::" << this->m_compute_error
91  << "::distio::" << this->m_distio << "::regW::"
92  << "l2::" << this->m_regW(0) << "::l1::" << this->m_regW(1)
93  << "::regH::"
94  << "l2::" << this->m_regH(0) << "::l1::" << this->m_regH(1)
95  << "::num_k_blocks::" << this->m_num_k_blocks
96  << "::normtype::" << this->m_input_normalization << std::endl;
97  }
98 
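  // Runs NMFTYPE on a 1D distribution of A: reads or generates the row-wise and
  // column-wise pieces (Arows/Acols), initializes W and H with per-rank seeds,
  // and writes the factors if an output file name was given. Used only for the
  // naive ANLS-BPP variant (see parseCommandLine()).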
99  template <class NMFTYPE>
100  void callDistNMF1D() {
101  std::string rand_prefix("rand_");
102  MPICommunicator mpicomm(this->m_argc, this->m_argv);
103 #ifdef BUILD_SPARSE
104  DistIO<SP_MAT> dio(mpicomm, m_distio);
105 #else // ifdef BUILD_SPARSE
106  DistIO<MAT> dio(mpicomm, m_distio);
107 #endif // ifdef BUILD_SPARSE
108 
109  if (m_Afile_name.compare(0, rand_prefix.size(), rand_prefix) == 0) {
110  dio.readInput(m_Afile_name, this->m_globalm, this->m_globaln, this->m_k,
111  this->m_sparsity, this->m_pr, this->m_pc,
112  this->m_input_normalization);
113  } else {
114  dio.readInput(m_Afile_name);
115  }
116 #ifdef BUILD_SPARSE
117  SP_MAT Arows(dio.Arows());
118  SP_MAT Acols(dio.Acols());
119 #else // ifdef BUILD_SPARSE
120  MAT Arows(dio.Arows());
121  MAT Acols(dio.Acols());
122 #endif // ifdef BUILD_SPARSE
123 
124  if (m_Afile_name.compare(0, rand_prefix.size(), rand_prefix) != 0) {
125  this->m_globaln = Arows.n_cols;
126  this->m_globalm = Acols.n_rows;
127  }
128  INFO << mpicomm.rank()
129  << "::Completed generating 1D rand Arows=" << PRINTMATINFO(Arows)
130  << "::Acols=" << PRINTMATINFO(Acols) << std::endl;
131 #ifdef WRITE_RAND_INPUT
132  dio.writeRandInput();
133 #endif // ifdef WRITE_RAND_INPUT
134  arma::arma_rng::set_seed(random_sieve(mpicomm.rank() + kprimeoffset));
135  MAT W = arma::randu<MAT>(this->m_globalm / mpicomm.size(), this->m_k);
136  MAT H = arma::randu<MAT>(this->m_globaln / mpicomm.size(), this->m_k);
137  sleep(10);
138  MPI_Barrier(MPI_COMM_WORLD);
139  memusage(mpicomm.rank(), "b4 constructor ");
140  NMFTYPE nmfAlgorithm(Arows, Acols, W, H, mpicomm);
141  sleep(10);
142  memusage(mpicomm.rank(), "after constructor ");
143  nmfAlgorithm.num_iterations(this->m_num_it);
144  nmfAlgorithm.compute_error(this->m_compute_error);
145  nmfAlgorithm.algorithm(this->m_nmfalgo);
146  MPI_Barrier(MPI_COMM_WORLD);
147  try {
148  nmfAlgorithm.computeNMF();
149  } catch (std::exception &e) {
150  printf("Failed rank %d: %s\n", mpicomm.rank(), e.what());
151  MPI_Abort(MPI_COMM_WORLD, 1);
152  }
153 
154  if (!m_outputfile_name.empty()) {
155  dio.writeOutput(nmfAlgorithm.getLeftLowRankFactor(),
156  nmfAlgorithm.getRightLowRankFactor(), m_outputfile_name);
157  }
158  }
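  // Example of the "rand_" naming convention used above and in callDistNMF2D():
  // an input name that begins with "rand_" (e.g. a hypothetical "rand_lowrank")
  // triggers synthetic generation with the requested m, n, k and sparsity, while
  // any other name (e.g. "/data/A.mtx") is treated as a path and read from disk.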
159 
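  // Runs NMFTYPE on the 2D pr x pc grid distribution of A: reads or generates the
  // local block (or the Pacoss-partitioned piece when USE_PACOSS is defined),
  // initializes W and H, optionally warm-starts ANLS-BPP with a few HALS
  // iterations, and times computeNMF().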
160  template <class NMFTYPE>
161  void callDistNMF2D() {
162  std::string rand_prefix("rand_");
163  MPICommunicator mpicomm(this->m_argc, this->m_argv, this->m_pr, this->m_pc);
164 // #ifdef BUILD_CUDA
165 // if (mpicomm.rank()==0){
166 // gpuQuery();
167 // }
168 // #endif
169 #ifdef USE_PACOSS
170  std::string dim_part_file_name = this->m_Afile_name;
171  dim_part_file_name += ".dpart.part" + std::to_string(mpicomm.rank());
172  this->m_Afile_name += ".part" + std::to_string(mpicomm.rank());
173  INFO << mpicomm.rank() << ":: part_file_name::" << dim_part_file_name
174  << "::m_Afile_name::" << this->m_Afile_name << std::endl;
175  Pacoss_SparseStruct<double> ss;
176  ss.load(m_Afile_name.c_str());
177  std::vector<std::vector<Pacoss_IntPair> > dim_part;
178  Pacoss_Communicator<double>::loadDistributedDimPart(
179  dim_part_file_name.c_str(), dim_part);
180  Pacoss_Communicator<double> *rowcomm = new Pacoss_Communicator<double>(
181  MPI_COMM_WORLD, ss._idx[0], dim_part[0]);
182  Pacoss_Communicator<double> *colcomm = new Pacoss_Communicator<double>(
183  MPI_COMM_WORLD, ss._idx[1], dim_part[1]);
184  this->m_globalm = ss._dimSize[0];
185  this->m_globaln = ss._dimSize[1];
186  arma::umat locations(2, ss._idx[0].size());
187 
188  for (Pacoss_Int i = 0; i < ss._idx[0].size(); i++) {
189  locations(0, i) = ss._idx[0][i];
190  locations(1, i) = ss._idx[1][i];
191  }
192  arma::vec values(ss._idx[0].size());
193 
194  for (Pacoss_Int i = 0; i < values.size(); i++) values[i] = ss._val[i];
195  SP_MAT A(locations, values);
196  A.resize(rowcomm->localRowCount(), colcomm->localRowCount());
197 #else // ifdef USE_PACOSS
198 
199  if ((this->m_pr > 0) && (this->m_pc > 0) &&
200  (this->m_pr * this->m_pc != mpicomm.size())) {
201  ERR << "pr*pc is not MPI_SIZE" << std::endl;
202  MPI_Barrier(MPI_COMM_WORLD);
203  MPI_Abort(MPI_COMM_WORLD, 1);
204  }
205 #ifdef BUILD_SPARSE
206  DistIO<SP_MAT> dio(mpicomm, m_distio);
207 
208  if (mpicomm.rank() == 0) {
209  INFO << "sparse case" << std::endl;
210  }
211 #else // ifdef BUILD_SPARSE
212  DistIO<MAT> dio(mpicomm, m_distio);
213 #endif // ifdef BUILD_SPARSE. One outstanding PACOSS
214 
215  if (m_Afile_name.compare(0, rand_prefix.size(), rand_prefix) == 0) {
216  dio.readInput(m_Afile_name, this->m_globalm, this->m_globaln, this->m_k,
217  this->m_sparsity, this->m_pr, this->m_pc,
218  this->m_input_normalization);
219  } else {
220  dio.readInput(m_Afile_name);
221  }
222 #ifdef BUILD_SPARSE
223  // SP_MAT A(dio.A().row_indices, dio.A().col_ptrs, dio.A().values,
224  // dio.A().n_rows, dio.A().n_cols);
225  SP_MAT A(dio.A());
226 #else // ifdef BUILD_SPARSE
227  MAT A(dio.A());
228 #endif // ifdef BUILD_SPARSE. One outstanding PACOSS
229 
230  if (m_Afile_name.compare(0, rand_prefix.size(), rand_prefix) != 0) {
231  UWORD localm = A.n_rows;
232  UWORD localn = A.n_cols;
233 
234  /*MPI_Allreduce(&localm, &(this->m_globalm), 1, MPI_INT,
235  * MPI_SUM, mpicomm.commSubs()[0]);
236  * MPI_Allreduce(&localn, &(this->m_globaln), 1, MPI_INT,
237  * MPI_SUM, mpicomm.commSubs()[1]);*/
238  this->m_globalm = localm * m_pr;
239  this->m_globaln = localn * m_pc;
240  }
241 #ifdef WRITE_RAND_INPUT
242  dio.writeRandInput();
243 #endif // ifdef WRITE_RAND_INPUT
244 #endif // ifdef USE_PACOSS. Everything over. No more outstanding ifdef's.
245 
246  // don't worry about initializing with the
247  // same matrix as only one of them will be used.
248  arma::arma_rng::set_seed(mpicomm.rank());
249 #ifdef USE_PACOSS
250  MAT W = arma::randu<MAT>(rowcomm->localOwnedRowCount(), this->m_k);
251  MAT H = arma::randu<MAT>(colcomm->localOwnedRowCount(), this->m_k);
252 #else // ifdef USE_PACOSS
253  MAT W = arma::randu<MAT>(this->m_globalm / mpicomm.size(), this->m_k);
254  MAT H = arma::randu<MAT>(this->m_globaln / mpicomm.size(), this->m_k);
255 #endif // ifdef USE_PACOSS
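  // In the non-Pacoss path above, with a pr x pc grid and p = pr * pc ranks, each
  // rank holds an (m/pr) x (n/pc) block of A while W and H are created as row
  // blocks of size (m/p) x k and (n/p) x k. For example, m = 40000, n = 20000,
  // pr = 4, pc = 2, k = 16 gives a 10000 x 10000 local A, a 5000 x 16 local W,
  // and a 2500 x 16 local H on every rank.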
256  // For very large matrices, starting from a purely random initialization
257  // can hurt ANLS-BPP running time, so as a better initializer we first
258  // run a couple of iterations of HALS.
259 #ifndef USE_PACOSS
260 #ifdef BUILD_SPARSE
261  if (m_nmfalgo == ANLSBPP) {
262  DistHALS<SP_MAT> lrinitializer(A, W, H, mpicomm, this->m_num_k_blocks);
263  lrinitializer.num_iterations(4);
264  lrinitializer.algorithm(HALS);
265  lrinitializer.computeNMF();
266  W = lrinitializer.getLeftLowRankFactor();
267  H = lrinitializer.getRightLowRankFactor();
268  }
269 #endif // ifdef BUILD_SPARSE
270 #endif // ifndef USE_PACOSS
271 
272 #ifdef MPI_VERBOSE
273  INFO << mpicomm.rank() << "::" << __PRETTY_FUNCTION__
274  << "::" << PRINTMATINFO(W) << std::endl;
275  INFO << mpicomm.rank() << "::" << __PRETTY_FUNCTION__
276  << "::" << PRINTMATINFO(H) << std::endl;
277 #endif // ifdef MPI_VERBOSE
278  MPI_Barrier(MPI_COMM_WORLD);
279  memusage(mpicomm.rank(), "b4 constructor ");
280  // TODO(ramkikannan): I was here. Need to modify the reallocations by using
281  // localOwnedRowCount instead of m_globalm.
282  NMFTYPE nmfAlgorithm(A, W, H, mpicomm, this->m_num_k_blocks);
283 #ifdef USE_PACOSS
284  nmfAlgorithm.set_rowcomm(rowcomm);
285  nmfAlgorithm.set_colcomm(colcomm);
286 #endif // ifdef USE_PACOSS
287  memusage(mpicomm.rank(), "after constructor ");
288  nmfAlgorithm.num_iterations(this->m_num_it);
289  nmfAlgorithm.compute_error(this->m_compute_error);
290  nmfAlgorithm.algorithm(this->m_nmfalgo);
291  nmfAlgorithm.regW(this->m_regW);
292  nmfAlgorithm.regH(this->m_regH);
293  MPI_Barrier(MPI_COMM_WORLD);
294  try {
295  mpitic();
296  nmfAlgorithm.computeNMF();
297  double temp = mpitoc();
298 
299  if (mpicomm.rank() == 0) printf("NMF took %.3lf secs.\n", temp);
300  } catch (std::exception &e) {
301  printf("Failed rank %d: %s\n", mpicomm.rank(), e.what());
302  MPI_Abort(MPI_COMM_WORLD, 1);
303  }
304 #ifndef USE_PACOSS
305  if (!m_outputfile_name.empty()) {
306  dio.writeOutput(nmfAlgorithm.getLeftLowRankFactor(),
307  nmfAlgorithm.getRightLowRankFactor(), m_outputfile_name);
308  }
309 #endif // ifndef USE_PACOSS
310  }
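  // Parses the command-line options into the driver state and dispatches, based
  // on the chosen algorithm and on whether BUILD_SPARSE is defined, to
  // callDistNMF2D() (MU, HALS, ANLS-BPP, AO-ADMM, ALS) or to callDistNMF1D()
  // (naive ANLS-BPP).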
311  void parseCommandLine() {
312  ParseCommandLine pc(this->m_argc, this->m_argv);
313  pc.parseplancopts();
314  this->m_nmfalgo = pc.lucalgo();
315  this->m_k = pc.lowrankk();
316  this->m_Afile_name = pc.input_file_name();
317  this->m_pr = pc.pr();
318  this->m_pc = pc.pc();
319  this->m_sparsity = pc.sparsity();
320  this->m_num_it = pc.iterations();
321  this->m_distio = TWOD;
322  this->m_regW = pc.regW();
323  this->m_regH = pc.regH();
324  this->m_num_k_blocks = 1;
325  this->m_globalm = pc.globalm();
326  this->m_globaln = pc.globaln();
327  this->m_compute_error = pc.compute_error();
328  if (this->m_nmfalgo == NAIVEANLSBPP) {
329  this->m_distio = ONED_DOUBLE;
330  } else {
331  this->m_distio = TWOD;
332  }
333  this->m_input_normalization = pc.input_normalization();
334  pc.printConfig();
335  switch (this->m_nmfalgo) {
336  case MU:
337 #ifdef BUILD_SPARSE
338  callDistNMF2D<DistMU<SP_MAT> >();
339 #else // ifdef BUILD_SPARSE
340  callDistNMF2D<DistMU<MAT> >();
341 #endif // ifdef BUILD_SPARSE
342  break;
343  case HALS:
344 #ifdef BUILD_SPARSE
345  callDistNMF2D<DistHALS<SP_MAT> >();
346 #else // ifdef BUILD_SPARSE
347  callDistNMF2D<DistHALS<MAT> >();
348 #endif // ifdef BUILD_SPARSE
349  break;
350  case ANLSBPP:
351 #ifdef BUILD_SPARSE
352  callDistNMF2D<DistANLSBPP<SP_MAT> >();
353 #else // ifdef BUILD_SPARSE
354  callDistNMF2D<DistANLSBPP<MAT> >();
355 #endif // ifdef BUILD_SPARSE
356  break;
357  case NAIVEANLSBPP:
358 #ifdef BUILD_SPARSE
359  callDistNMF1D<DistNaiveANLSBPP<SP_MAT> >();
360 #else // ifdef BUILD_SPARSE
361  callDistNMF1D<DistNaiveANLSBPP<MAT> >();
362 #endif // ifdef BUILD_SPARSE
363  break;
364  case AOADMM:
365 #ifdef BUILD_SPARSE
366  callDistNMF2D<DistAOADMM<SP_MAT> >();
367 #else // ifdef BUILD_SPARSE
368  callDistNMF2D<DistAOADMM<MAT> >();
369 #endif // ifdef BUILD_SPARSE
  break;
370  case CPALS:
371 #ifdef BUILD_SPARSE
372  callDistNMF2D<DistALS<SP_MAT> >();
373 #else // ifdef BUILD_SPARSE
374  callDistNMF2D<DistALS<MAT> >();
375 #endif // ifdef BUILD_SPARSE
  break;
376  default:
377  ERR << "Unsupported algorithm: " << this->m_nmfalgo << std::endl;
378  }
379  }
380 
381  public:
382  DistNMFDriver(int argc, char *argv[]) {
383  this->m_argc = argc;
384  this->m_argv = argv;
385  this->parseCommandLine();
386  }
387 };
388 
389 } // namespace planc
390 
391 int main(int argc, char *argv[]) {
392  try {
393  planc::DistNMFDriver dnd(argc, argv);
394  fflush(stdout);
395  } catch (const std::exception &e) {
396  INFO << "Exception with stack trace " << std::endl;
397  INFO << e.what();
398  }
399 }