#include <cuda_runtime.h>

// ...
  UWORD m_globalm, m_globaln;
  std::string m_Afile_name;
  std::string m_outputfile_name;
  // ...
  static const int kprimeoffset = 17;
// Pretty-print the salient fields of a cudaDeviceProp.
void printDevProp(cudaDeviceProp devProp) {
  printf("Major revision number: %d\n", devProp.major);
  printf("Minor revision number: %d\n", devProp.minor);
  printf("Name: %s\n", devProp.name);
  // The memory-size fields are size_t, so they are printed with %zu.
  printf("Total global memory: %zu\n", devProp.totalGlobalMem);
  printf("Total shared memory per block: %zu\n", devProp.sharedMemPerBlock);
  printf("Total registers per block: %d\n", devProp.regsPerBlock);
  printf("Warp size: %d\n", devProp.warpSize);
  printf("Maximum memory pitch: %zu\n", devProp.memPitch);
  printf("Maximum threads per block: %d\n", devProp.maxThreadsPerBlock);
  for (int i = 0; i < 3; ++i)
    printf("Maximum dimension %d of block: %d\n", i, devProp.maxThreadsDim[i]);
  for (int i = 0; i < 3; ++i)
    printf("Maximum dimension %d of grid: %d\n", i, devProp.maxGridSize[i]);
  printf("Clock rate: %d\n", devProp.clockRate);
  printf("Total constant memory: %zu\n", devProp.totalConstMem);
  printf("Texture alignment: %zu\n", devProp.textureAlignment);
  printf("Concurrent copy and execution: %s\n",
         (devProp.deviceOverlap ? "Yes" : "No"));
  printf("Number of multiprocessors: %d\n", devProp.multiProcessorCount);
  printf("Kernel execution timeout: %s\n",
         (devProp.kernelExecTimeoutEnabled ? "Yes" : "No"));
}
// ... (the enclosing device-query function's signature is elided)
  int devCount;
  cudaGetDeviceCount(&devCount);
  printf("CUDA Device Query...\n");
  printf("There are %d CUDA devices.\n", devCount);
  // Iterate over the devices and print the properties of each one.
  for (int i = 0; i < devCount; ++i) {
    printf("\nCUDA Device #%d\n", i);
    cudaDeviceProp devProp;
    cudaGetDeviceProperties(&devProp, i);
    printDevProp(devProp);
  }
// Print the configuration received on the command line.
void printConfig() {
  cout << "a::" << this->m_nmfalgo << "::i::" << this->m_Afile_name
       << "::k::" << this->m_k << "::m::" << this->m_globalm
       << "::n::" << this->m_globaln << "::t::" << this->m_num_it
       << "::pr::" << this->m_pr << "::pc::" << this->m_pc
       << "::error::" << this->m_compute_error
       << "::distio::" << this->m_distio << "::regW::"
       << "l2::" << this->m_regW(0) << "::l1::" << this->m_regW(1)
       << "::regH::"
       << "l2::" << this->m_regH(0) << "::l1::" << this->m_regH(1)
       << "::num_k_blocks::" << this->m_num_k_blocks
       << "::normtype::" << this->m_input_normalization << std::endl;
}
template <class NMFTYPE>
void callDistNMF1D() {
  std::string rand_prefix("rand_");
  // ...
#ifdef BUILD_SPARSE
  // ...
#else   // ifdef BUILD_SPARSE
  // ...
#endif  // ifdef BUILD_SPARSE
  // Synthesize the input when the name carries the "rand_" prefix.
  if (m_Afile_name.compare(0, rand_prefix.size(), rand_prefix) == 0) {
    dio.readInput(m_Afile_name, this->m_globalm, this->m_globaln, this->m_k,
                  this->m_sparsity, this->m_pr, this->m_pc,
                  this->m_input_normalization);
  }
  // ...
#ifdef BUILD_SPARSE
  // ...
#else   // ifdef BUILD_SPARSE
  // ...
#endif  // ifdef BUILD_SPARSE
  // For real (non-"rand_") inputs the global sizes come from the read chunks.
  if (m_Afile_name.compare(0, rand_prefix.size(), rand_prefix) != 0) {
    this->m_globaln = Arows.n_cols;
    this->m_globalm = Acols.n_rows;
  }
  // ...
  INFO << mpicomm.rank()
       << "::Completed generating 1D rand Arows=" << PRINTMATINFO(Arows)
       // ...
       << std::endl;
#ifdef WRITE_RAND_INPUT
  // ...
#endif  // ifdef WRITE_RAND_INPUT
  // Random initialization of the local pieces of the factors W and H.
  MAT W = arma::randu<MAT>(this->m_globalm / mpicomm.size(), this->m_k);
  MAT H = arma::randu<MAT>(this->m_globaln / mpicomm.size(), this->m_k);
  MPI_Barrier(MPI_COMM_WORLD);
  NMFTYPE nmfAlgorithm(Arows, Acols, W, H, mpicomm);
  nmfAlgorithm.num_iterations(this->m_num_it);
  nmfAlgorithm.compute_error(this->m_compute_error);
  nmfAlgorithm.algorithm(this->m_nmfalgo);
  MPI_Barrier(MPI_COMM_WORLD);
  try {
    nmfAlgorithm.computeNMF();
  } catch (std::exception &e) {
    printf("Failed rank %d: %s\n", mpicomm.rank(), e.what());
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  if (!m_outputfile_name.empty()) {
    dio.writeOutput(nmfAlgorithm.getLeftLowRankFactor(),
                    nmfAlgorithm.getRightLowRankFactor(), m_outputfile_name);
  }
}
template <class NMFTYPE>
void callDistNMF2D() {
  std::string rand_prefix("rand_");
  MPICommunicator mpicomm(this->m_argc, this->m_argv, this->m_pr, this->m_pc);
#ifdef USE_PACOSS
  // ...
  // Every rank loads its own partition files.
  std::string dim_part_file_name = this->m_Afile_name;
  dim_part_file_name += ".dpart.part" + std::to_string(mpicomm.rank());
  this->m_Afile_name += ".part" + std::to_string(mpicomm.rank());
  INFO << mpicomm.rank() << ":: part_file_name::" << dim_part_file_name
       << "::m_Afile_name::" << this->m_Afile_name << std::endl;
  Pacoss_SparseStruct<double> ss;
  ss.load(m_Afile_name.c_str());
  std::vector<std::vector<Pacoss_IntPair> > dim_part;
  Pacoss_Communicator<double>::loadDistributedDimPart(
      dim_part_file_name.c_str(), dim_part);
  Pacoss_Communicator<double> *rowcomm = new Pacoss_Communicator<double>(
      MPI_COMM_WORLD, ss._idx[0], dim_part[0]);
  Pacoss_Communicator<double> *colcomm = new Pacoss_Communicator<double>(
      MPI_COMM_WORLD, ss._idx[1], dim_part[1]);
  this->m_globalm = ss._dimSize[0];
  this->m_globaln = ss._dimSize[1];
  // Assemble the local sparse matrix from the loaded (row, col, value) triples.
  arma::umat locations(2, ss._idx[0].size());
  for (Pacoss_Int i = 0; i < ss._idx[0].size(); i++) {
    locations(0, i) = ss._idx[0][i];
    locations(1, i) = ss._idx[1][i];
  }
  arma::vec values(ss._idx[0].size());
  for (Pacoss_Int i = 0; i < values.size(); i++) values[i] = ss._val[i];
  SP_MAT A(locations, values);
  A.resize(rowcomm->localRowCount(), colcomm->localRowCount());
#else   // ifdef USE_PACOSS
  // ...
  // The pr x pc processor grid must exactly tile the MPI world.
  if ((this->m_pr > 0) && (this->m_pc > 0) &&
      (this->m_pr * this->m_pc != mpicomm.size())) {
    ERR << "pr*pc is not MPI_SIZE" << std::endl;
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  // ...
#ifdef BUILD_SPARSE
  // ...
  if (mpicomm.rank() == 0) {
    INFO << "sparse case" << std::endl;
  }
  // ...
#else   // ifdef BUILD_SPARSE
  // ...
#endif  // ifdef BUILD_SPARSE. One outstanding PACOSS.
  // ...
  if (m_Afile_name.compare(0, rand_prefix.size(), rand_prefix) == 0) {
    dio.readInput(m_Afile_name, this->m_globalm, this->m_globaln, this->m_k,
                  this->m_sparsity, this->m_pr, this->m_pc,
                  this->m_input_normalization);
  }
  // ...
#ifdef BUILD_SPARSE
  // ...
#else   // ifdef BUILD_SPARSE
  // ...
#endif  // ifdef BUILD_SPARSE. One outstanding PACOSS.
  // For file inputs, derive the global sizes from the local chunk sizes.
  if (m_Afile_name.compare(0, rand_prefix.size(), rand_prefix) != 0) {
    UWORD localm = A.n_rows;
    UWORD localn = A.n_cols;
    // ...
    this->m_globalm = localm * m_pr;
    this->m_globaln = localn * m_pc;
  }
#ifdef WRITE_RAND_INPUT
  // ...
#endif  // ifdef WRITE_RAND_INPUT
#endif  // ifdef USE_PACOSS. Everything over. No more outstanding ifdef's.
  // ...
  // Seed per rank so every process draws a different random initialization.
  arma::arma_rng::set_seed(mpicomm.rank());
#ifdef USE_PACOSS
  MAT W = arma::randu<MAT>(rowcomm->localOwnedRowCount(), this->m_k);
  MAT H = arma::randu<MAT>(colcomm->localOwnedRowCount(), this->m_k);
#else   // ifdef USE_PACOSS
  MAT W = arma::randu<MAT>(this->m_globalm / mpicomm.size(), this->m_k);
  MAT H = arma::randu<MAT>(this->m_globaln / mpicomm.size(), this->m_k);
#endif  // ifdef USE_PACOSS
#ifndef USE_PACOSS
  // ...
#ifdef BUILD_SPARSE
  // ...
#endif  // ifdef BUILD_SPARSE
#endif  // ifndef USE_PACOSS
#ifdef MPI_VERBOSE
  INFO << mpicomm.rank() << "::" << __PRETTY_FUNCTION__
       // ...
       << std::endl;
  INFO << mpicomm.rank() << "::" << __PRETTY_FUNCTION__
       // ...
       << std::endl;
#endif  // ifdef MPI_VERBOSE
  MPI_Barrier(MPI_COMM_WORLD);
  // ...
  NMFTYPE nmfAlgorithm(A, W, H, mpicomm, this->m_num_k_blocks);
#ifdef USE_PACOSS
  nmfAlgorithm.set_rowcomm(rowcomm);
  nmfAlgorithm.set_colcomm(colcomm);
#endif  // ifdef USE_PACOSS
  // ...
  nmfAlgorithm.num_iterations(this->m_num_it);
  nmfAlgorithm.compute_error(this->m_compute_error);
  nmfAlgorithm.algorithm(this->m_nmfalgo);
  nmfAlgorithm.regW(this->m_regW);
  nmfAlgorithm.regH(this->m_regH);
  MPI_Barrier(MPI_COMM_WORLD);
  try {
    // ... (timing code elided; temp holds the measured wall time)
    nmfAlgorithm.computeNMF();
    // ...
    if (mpicomm.rank() == 0) printf("NMF took %.3lf secs.\n", temp);
  } catch (std::exception &e) {
    printf("Failed rank %d: %s\n", mpicomm.rank(), e.what());
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  // ...
#ifndef USE_PACOSS
  if (!m_outputfile_name.empty()) {
    dio.writeOutput(nmfAlgorithm.getLeftLowRankFactor(),
                    nmfAlgorithm.getRightLowRankFactor(), m_outputfile_name);
  }
#endif  // ifndef USE_PACOSS
}

void parseCommandLine() {
  // ...
  this->m_nmfalgo = pc.lucalgo();
  // ...
  this->m_pr = pc.pr();
  this->m_pc = pc.pc();
  // ...
  this->m_distio = TWOD;
  this->m_regW = pc.regW();
  this->m_regH = pc.regH();
  this->m_num_k_blocks = 1;
  this->m_globalm = pc.globalm();
  this->m_globaln = pc.globaln();
  // ...
  this->m_distio = TWOD;
  // ...
  // Dispatch on the chosen algorithm (case labels and breaks elided in this extract).
  switch (this->m_nmfalgo) {
    // ...
#ifdef BUILD_SPARSE
      callDistNMF2D<DistMU<SP_MAT> >();
#else   // ifdef BUILD_SPARSE
      callDistNMF2D<DistMU<MAT> >();
#endif  // ifdef BUILD_SPARSE
    // ...
#ifdef BUILD_SPARSE
      callDistNMF2D<DistHALS<SP_MAT> >();
#else   // ifdef BUILD_SPARSE
      callDistNMF2D<DistHALS<MAT> >();
#endif  // ifdef BUILD_SPARSE
    // ...
#ifdef BUILD_SPARSE
      callDistNMF2D<DistANLSBPP<SP_MAT> >();
#else   // ifdef BUILD_SPARSE
      callDistNMF2D<DistANLSBPP<MAT> >();
#endif  // ifdef BUILD_SPARSE
    // ...
    // The naive ANLS/BPP variant is the only one using the 1D distribution.
#ifdef BUILD_SPARSE
      callDistNMF1D<DistNaiveANLSBPP<SP_MAT> >();
#else   // ifdef BUILD_SPARSE
      callDistNMF1D<DistNaiveANLSBPP<MAT> >();
#endif  // ifdef BUILD_SPARSE
    // ...
#ifdef BUILD_SPARSE
      callDistNMF2D<DistAOADMM<SP_MAT> >();
#else   // ifdef BUILD_SPARSE
      callDistNMF2D<DistAOADMM<MAT> >();
#endif  // ifdef BUILD_SPARSE
    // ...
#ifdef BUILD_SPARSE
      callDistNMF2D<DistALS<SP_MAT> >();
#else   // ifdef BUILD_SPARSE
      callDistNMF2D<DistALS<MAT> >();
#endif  // ifdef BUILD_SPARSE
    default:
      ERR << "Unsupported algorithm " << this->m_nmfalgo << std::endl;
  }
}

// ...
  this->parseCommandLine();
  // ...
int main(int argc, char *argv[]) {
  try {
    // ...
  } catch (const std::exception &e) {
    INFO << "Exception with stack trace " << std::endl;
    // ...
  }
}
FVEC regW()
    L2 regularization as the first parameter and L1 as the second for the left low-rank factor W... (see the usage example after this list)
algotype lucalgo()
    Returns the NMF algorithm to run. Passed as parameter --algo or -a.
int pc()
    Returns the number of processor columns.
float sparsity()
    Input parameter for generating a random sparse matrix. Passed as -s or --sparsity.
DistNMFDriver(int argc, char *argv[])
void readInput(const std::string file_name, UWORD m=0, UWORD n=0, UWORD k=0, double sparsity=0, UWORD pr=0, UWORD pc=0, normtype i_normalization=NONE)
    We need m, n, pr, and pc only for rand matrices (see the call sketch after this list).
bool compute_error()
    Returns whether to compute the error or not. Passed as parameter -e or --error.
int main(int argc, char *argv[])
int iterations()
    Returns the number of iterations. Passed as -t or --iter.
int random_sieve(const int)
MAT getRightLowRankFactor()
    Returns the right low-rank factor matrix H.
FVEC regH()
    L2 regularization as the first parameter and L1 as the second for the right low-rank factor H... (see the usage example after this list)
void algorithm(algotype dat)
    Sets the NMF algorithm to run.
int pr()
    Returns the number of processor rows.
void computeNMF()
    This is the main loop function. Refer to Algorithm 1 on page 3 of the PPoPP HPC-NMF paper (a serial sketch of the loop follows this list).
const MATTYPE & A() const
void parseplancopts()
    Parses the command-line parameters.
void num_iterations(const int it)
    Sets the number of iterations for the NMF algorithms.
const MATTYPE & Acols() const
void memusage(const int myrank, std::string event)
    Captures the memory usage of every MPI process.
UWORD lowrankk()
    Returns the low rank. Passed as parameter --lowrank or -k.
const int size() const
    Returns the total number of MPI processes.
UWORD globalm()
    Returns the number of global rows. Passed as parameter -d.
std::string input_file_name()
    Returns the input file name. Passed as -i or --input.
MAT getLeftLowRankFactor()
    Returns the left low-rank factor matrix W.
const MATTYPE & Arows() const
void printConfig()
    Prints the configuration received through the command-line parameters.
const int rank() const
    Returns the global rank.
ncp_factors contains the factors of the NCP; every ith factor is of size n_i * k; the number of factors is ...
void writeOutput(const MAT &W, const MAT &H, const std::string &output_file_name)
    Writes the factor matrix as output_file_name_W_MPISIZE_MPIRANK.
normtype input_normalization()
    Whether to column-normalize the input matrix.
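Usage example for the regW()/regH() regularization vectors referenced above. The two-entry layout (L2 first, L1 second) is as documented; treating FVEC as Armadillo's fvec and reusing the nmfAlgorithm object from the driver code is an assumption:

  // Assumed: FVEC is arma::fvec. Entry 0 is the L2 weight, entry 1 the L1 weight.
  FVEC reg = {0.5f, 0.0f};   // moderate L2 smoothing, no L1 sparsity penalty
  nmfAlgorithm.regW(reg);    // regularize the left factor W
  nmfAlgorithm.regH(reg);    // regularize the right factor H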
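The "rand_" naming convention drives readInput(): the dimension, sparsity, and grid arguments matter only when the matrix is synthesized. A sketch of the two call shapes, with dio as in the driver code above; both file names here are hypothetical:

  // Synthetic input: the name starts with "rand_", so m, n, sparsity, pr, pc are used.
  dio.readInput("rand_lowrank", m_globalm, m_globaln, m_k, m_sparsity, m_pr, m_pc,
                m_input_normalization);
  // File input: dimensions come from the file itself; the defaulted parameters are ignored.
  dio.readInput("A.mtx");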
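For computeNMF(), referenced above: the distributed loop follows Algorithm 1 of the PPoPP HPC-NMF paper. As a point of reference only, here is a minimal serial sketch of the same alternating pattern using multiplicative updates, assuming W is m x k, H is n x k, and A is approximated by W * H.t(), matching getLeftLowRankFactor()/getRightLowRankFactor():

  #include <armadillo>

  // Serial illustration, not the library's distributed implementation. Each
  // iteration performs two half-steps (update H given W, then W given H); the
  // distributed algorithm interleaves the same steps with MPI collectives.
  void mu_nmf_sketch(const arma::mat &A, arma::mat &W, arma::mat &H, int num_it) {
    const double eps = 1e-16;  // keeps the element-wise divisions well defined
    for (int it = 0; it < num_it; ++it) {
      H = H % (A.t() * W) / (H * (W.t() * W) + eps);  // H <- H .* (A'W) ./ (H W'W)
      W = W % (A * H) / (W * (H.t() * H) + eps);      // W <- W .* (AH) ./ (W H'H)
    }
  }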