Fast Auxiliary Space Preconditioning 2.7.7 Aug/28/2022
Loading...
Searching...
No Matches
BlaSpmvBSR.c
Go to the documentation of this file.
1
14#include <math.h>
15
16#ifdef _OPENMP
17#include <omp.h>
18#endif
19
20#include "fasp.h"
21#include "fasp_functs.h"
22
23/*---------------------------------*/
24/*-- Public Functions --*/
25/*---------------------------------*/
26
46 const dBSRmat* A, const REAL alpha, const dBSRmat* B, const REAL beta, dBSRmat* C)
47{
48 INT i, j, k, l;
49 INT count = 0, added, countrow;
50 INT nb = A->nb, nb2 = nb * nb;
51
52 SHORT status = FASP_SUCCESS, use_openmp = FALSE;
53
54#ifdef _OPENMP
55 INT mybegin, myend, myid, nthreads;
56 if (A->nnz > OPENMP_HOLDS) {
57 use_openmp = TRUE;
58 nthreads = fasp_get_num_threads();
59 }
60#endif
61
62 if (A->ROW != B->ROW || A->COL != B->COL || A->nb != B->nb) {
63 printf("### ERROR: Matrix sizes do not match!\n");
64 status = ERROR_MAT_SIZE;
65 goto FINISHED;
66 }
67
68 if (A == NULL && B == NULL) {
69 C->ROW = 0;
70 C->COL = 0;
71 C->NNZ = 0;
72 C->nb = nb;
74 status = FASP_SUCCESS;
75 goto FINISHED;
76 }
77
78 if (A->NNZ == 0 && B->NNZ == 0) {
79 C->ROW = A->ROW;
80 C->COL = A->COL;
81 C->NNZ = A->NNZ;
82 C->nb = nb;
84 status = FASP_SUCCESS;
85 goto FINISHED;
86 }
87
88 // empty matrix A
89 if (A->NNZ == 0 || A == NULL) {
90 fasp_dbsr_alloc(B->ROW, B->COL, B->NNZ, B->nb, B->storage_manner, C);
91 memcpy(C->IA, B->IA, (B->ROW + 1) * sizeof(INT));
92 memcpy(C->JA, B->JA, (B->NNZ) * sizeof(INT));
93
94 if (use_openmp) {
95#ifdef _OPENMP
96#pragma omp parallel private(myid, mybegin, myend, i)
97 {
98 myid = omp_get_thread_num();
99 fasp_get_start_end(myid, nthreads, B->NNZ, &mybegin, &myend);
100 for (i = mybegin; i < myend; ++i)
101 fasp_blas_smat_axm1(&B->val[i * nb2], nb, beta, &C->val[i * nb2]);
102 }
103#endif
104 } else {
105 for (i = 0; i < B->NNZ; ++i) {
106 // C->val[i] = B->val[i] * beta;
107 fasp_blas_smat_axm1(&B->val[i * nb2], nb, beta, &C->val[i * nb2]);
108 }
109 }
110
111 status = FASP_SUCCESS;
112 goto FINISHED;
113 }
114
115 // empty matrix B
116 if (B->NNZ == 0 || B == NULL) {
117 fasp_dbsr_alloc(A->ROW, A->COL, A->NNZ, A->nb, A->storage_manner, C);
118 memcpy(C->IA, A->IA, (A->ROW + 1) * sizeof(INT));
119 memcpy(C->JA, A->JA, (A->NNZ) * sizeof(INT));
120
121 if (use_openmp) {
122#ifdef _OPENMP
123 INT mybegin, myend, myid;
124#pragma omp parallel private(myid, mybegin, myend, i)
125 {
126 myid = omp_get_thread_num();
127 fasp_get_start_end(myid, nthreads, A->NNZ, &mybegin, &myend);
128 for (i = mybegin; i < myend; ++i)
129 fasp_blas_smat_axm1(&A->val[i * nb2], nb, alpha, &C->val[i * nb2]);
130 }
131#endif
132 } else {
133 for (i = 0; i < A->NNZ; ++i)
134 // C->val[i] = A->val[i] * alpha;
135 fasp_blas_smat_axm1(&A->val[i * nb2], nb, alpha, &C->val[i * nb2]);
136 }
137
138 status = FASP_SUCCESS;
139 goto FINISHED;
140 }
141
142 C->ROW = A->ROW;
143 C->COL = A->COL;
144
145 C->nb = A->nb;
147
148 C->IA = (INT*)fasp_mem_calloc(C->ROW + 1, sizeof(INT));
149
150 // allocate work space for C->JA and C->val
151 C->JA = (INT*)fasp_mem_calloc(A->NNZ + B->NNZ, sizeof(INT));
152
153 C->val = (REAL*)fasp_mem_calloc((A->NNZ + B->NNZ) * nb2, sizeof(REAL));
154
155 // initial C->IA
156 memset(C->IA, 0, sizeof(INT) * (C->ROW + 1));
157 memset(C->JA, -1, sizeof(INT) * (A->NNZ + B->NNZ));
158
159 for (i = 0; i < A->ROW; ++i) {
160 countrow = 0;
161 for (j = A->IA[i]; j < A->IA[i + 1]; ++j) {
162 // C->val[count] = alpha * A->val[j];
163 fasp_blas_smat_axm1(&A->val[j * nb2], nb, alpha, &C->val[count * nb2]);
164 C->JA[count] = A->JA[j];
165 C->IA[i + 1]++;
166 count++;
167 countrow++;
168 } // end for js
169
170 for (k = B->IA[i]; k < B->IA[i + 1]; ++k) {
171 added = 0;
172
173 for (l = C->IA[i]; l < C->IA[i] + countrow + 1; l++) {
174 if (B->JA[k] == C->JA[l]) {
175 // C->val[l] = C->val[l] + beta * B->val[k];
176 fasp_blas_smat_add(&C->val[l * nb2], &B->val[k * nb2], nb, 1.0,
177 beta, &C->val[l * nb2]);
178 added = 1;
179 break;
180 }
181 } // end for l
182
183 if (added == 0) {
184 // C->val[count] = beta * B->val[k];
185 fasp_blas_smat_axm1(&B->val[k * nb2], nb, beta, &C->val[count * nb2]);
186 C->JA[count] = B->JA[k];
187 C->IA[i + 1]++;
188 count++;
189 }
190
191 } // end for k
192
193 C->IA[i + 1] += C->IA[i];
194 }
195
196 C->NNZ = count;
197 C->JA = (INT*)fasp_mem_realloc(C->JA, (count) * sizeof(INT));
198 C->val = (REAL*)fasp_mem_realloc(C->val, (nb2 * count) * sizeof(REAL));
199
200FINISHED:
201 return status;
202}
203
215void fasp_blas_dbsr_axm(dBSRmat* A, const REAL alpha)
216{
217 const INT nnz = A->NNZ;
218 const INT nb = A->nb;
219
220 // A direct calculation can be written as:
221 fasp_blas_darray_ax(nnz * nb * nb, alpha, A->val);
222}
223
244 const REAL alpha, dBSRmat* A, REAL* x, const REAL beta, REAL* y)
245{
246 /* members of A */
247 INT ROW = A->ROW;
248 INT nb = A->nb;
249 INT* IA = A->IA;
250 INT* JA = A->JA;
251 REAL* val = A->val;
252
253 /* local variables */
254 INT size = ROW * nb;
255 INT jump = nb * nb;
256 INT i, j, k, iend;
257 REAL temp;
258 REAL* pA = NULL;
259 REAL* px0 = NULL;
260 REAL* py0 = NULL;
261 REAL* py = NULL;
262
263 SHORT nthreads = 1, use_openmp = FALSE;
264
265#ifdef _OPENMP
266 if (ROW > OPENMP_HOLDS) {
267 use_openmp = TRUE;
268 nthreads = fasp_get_num_threads();
269 }
270#endif
271
272 //----------------------------------------------
273 // Treat (alpha == 0.0) computation
274 //----------------------------------------------
275
276 if (alpha == 0.0) {
277 fasp_blas_darray_ax(size, beta, y);
278 return;
279 }
280
281 //-------------------------------------------------
282 // y = (beta/alpha)*y
283 //-------------------------------------------------
284
285 temp = beta / alpha;
286 if (temp != 1.0) {
287 if (temp == 0.0) {
288 memset(y, 0X0, size * sizeof(REAL));
289 } else {
290 // for (i = size; i--; ) y[i] *= temp; // modified by Xiaozhe, 03/11/2011
291 fasp_blas_darray_ax(size, temp, y);
292 }
293 }
294
295 //-----------------------------------------------------------------
296 // y += A*x (Core Computation)
297 // each non-zero block elements are stored in row-major order
298 //-----------------------------------------------------------------
299
300 switch (nb) {
301 case 2:
302 {
303 if (use_openmp) {
304 INT myid, mybegin, myend;
305#ifdef _OPENMP
306#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py, iend)
307#endif
308 for (myid = 0; myid < nthreads; myid++) {
309 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
310 for (i = mybegin; i < myend; ++i) {
311 py0 = &y[i * 2];
312 iend = IA[i + 1];
313 for (k = IA[i]; k < iend; ++k) {
314 j = JA[k];
315 pA = val + k * 4; // &val[k*jump];
316 px0 = x + j * 2; // &x[j*nb];
317 py = py0;
318 fasp_blas_smat_ypAx_nc2(pA, px0, py);
319 }
320 }
321 }
322 } else {
323 for (i = 0; i < ROW; ++i) {
324 py0 = &y[i * 2];
325 iend = IA[i + 1];
326 for (k = IA[i]; k < iend; ++k) {
327 j = JA[k];
328 pA = val + k * 4; // &val[k*jump];
329 px0 = x + j * 2; // &x[j*nb];
330 py = py0;
331 fasp_blas_smat_ypAx_nc2(pA, px0, py);
332 }
333 }
334 }
335 }
336 break;
337
338 case 3:
339 {
340 if (use_openmp) {
341 INT myid, mybegin, myend;
342#ifdef _OPENMP
343#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py, iend)
344#endif
345 for (myid = 0; myid < nthreads; myid++) {
346 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
347 for (i = mybegin; i < myend; ++i) {
348 py0 = &y[i * 3];
349 iend = IA[i + 1];
350 for (k = IA[i]; k < iend; ++k) {
351 j = JA[k];
352 pA = val + k * 9; // &val[k*jump];
353 px0 = x + j * 3; // &x[j*nb];
354 py = py0;
355 fasp_blas_smat_ypAx_nc3(pA, px0, py);
356 }
357 }
358 }
359 } else {
360 for (i = 0; i < ROW; ++i) {
361 py0 = &y[i * 3];
362 iend = IA[i + 1];
363 for (k = IA[i]; k < iend; ++k) {
364 j = JA[k];
365 pA = val + k * 9; // &val[k*jump];
366 px0 = x + j * 3; // &x[j*nb];
367 py = py0;
368 fasp_blas_smat_ypAx_nc3(pA, px0, py);
369 }
370 }
371 }
372 }
373 break;
374
375 case 5:
376 {
377 if (use_openmp) {
378 INT myid, mybegin, myend;
379#ifdef _OPENMP
380#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py, iend)
381#endif
382 for (myid = 0; myid < nthreads; myid++) {
383 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
384 for (i = mybegin; i < myend; ++i) {
385 py0 = &y[i * 5];
386 iend = IA[i + 1];
387 for (k = IA[i]; k < iend; ++k) {
388 j = JA[k];
389 pA = val + k * 25; // &val[k*jump];
390 px0 = x + j * 5; // &x[j*nb];
391 py = py0;
392 fasp_blas_smat_ypAx_nc5(pA, px0, py);
393 }
394 }
395 }
396 } else {
397 for (i = 0; i < ROW; ++i) {
398 py0 = &y[i * 5];
399 iend = IA[i + 1];
400 for (k = IA[i]; k < iend; ++k) {
401 j = JA[k];
402 pA = val + k * 25; // &val[k*jump];
403 px0 = x + j * 5; // &x[j*nb];
404 py = py0;
405 fasp_blas_smat_ypAx_nc5(pA, px0, py);
406 }
407 }
408 }
409 }
410 break;
411
412 case 7:
413 {
414 if (use_openmp) {
415 INT myid, mybegin, myend;
416#ifdef _OPENMP
417#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py, iend)
418#endif
419 for (myid = 0; myid < nthreads; myid++) {
420 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
421 for (i = mybegin; i < myend; ++i) {
422 py0 = &y[i * 7];
423 iend = IA[i + 1];
424 for (k = IA[i]; k < iend; ++k) {
425 j = JA[k];
426 pA = val + k * 49; // &val[k*jump];
427 px0 = x + j * 7; // &x[j*nb]��
428 py = py0;
429 fasp_blas_smat_ypAx_nc7(pA, px0, py);
430 }
431 }
432 }
433 } else {
434 for (i = 0; i < ROW; ++i) {
435 py0 = &y[i * 7];
436 iend = IA[i + 1];
437 for (k = IA[i]; k < iend; ++k) {
438 j = JA[k];
439 pA = val + k * 49; // &val[k*jump];
440 px0 = x + j * 7; // &x[j*nb];
441 py = py0;
442 fasp_blas_smat_ypAx_nc7(pA, px0, py);
443 }
444 }
445 }
446 }
447 break;
448
449 default:
450 {
451 if (use_openmp) {
452 INT myid, mybegin, myend;
453#ifdef _OPENMP
454#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py, iend)
455#endif
456 for (myid = 0; myid < nthreads; myid++) {
457 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
458 for (i = mybegin; i < myend; ++i) {
459 py0 = &y[i * nb];
460 iend = IA[i + 1];
461 for (k = IA[i]; k < iend; ++k) {
462 j = JA[k];
463 pA = val + k * jump; // &val[k*jump];
464 px0 = x + j * nb; // &x[j*nb];
465 py = py0;
466 fasp_blas_smat_ypAx(pA, px0, py, nb);
467 }
468 }
469 }
470 } else {
471 for (i = 0; i < ROW; ++i) {
472 py0 = &y[i * nb];
473 iend = IA[i + 1];
474 for (k = IA[i]; k < iend; ++k) {
475 j = JA[k];
476 pA = val + k * jump; // &val[k*jump];
477 px0 = x + j * nb; // &x[j*nb];
478 py = py0;
479 fasp_blas_smat_ypAx(pA, px0, py, nb);
480 }
481 }
482 }
483 }
484 break;
485 }
486
487 //------------------------------------------
488 // y = alpha*y
489 //------------------------------------------
490
491 if (alpha != 1.0) {
492 fasp_blas_darray_ax(size, alpha, y);
493 }
494}
495
514void fasp_blas_dbsr_aAxpy(const REAL alpha, const dBSRmat* A, const REAL* x, REAL* y)
515{
516 /* members of A */
517 const INT ROW = A->ROW;
518 const INT nb = A->nb;
519 const INT* IA = A->IA;
520 const INT* JA = A->JA;
521 const REAL* val = A->val;
522
523 /* local variables */
524 const REAL* pA = NULL;
525 const REAL* px0 = NULL;
526 REAL* py0 = NULL;
527 REAL* py = NULL;
528
529 REAL temp = 0.0;
530 INT size = ROW * nb;
531 INT jump = nb * nb;
532 INT i, j, k, iend;
533
534 SHORT nthreads = 1, use_openmp = FALSE;
535
536#ifdef _OPENMP
537 if (ROW > OPENMP_HOLDS) {
538 use_openmp = TRUE;
539 nthreads = fasp_get_num_threads();
540 }
541#endif
542
543 //----------------------------------------------
544 // Treat (alpha == 0.0) computation
545 //----------------------------------------------
546
547 if (alpha == 0.0) {
548 return; // Nothing to compute
549 }
550
551 //-------------------------------------------------
552 // y = (1.0/alpha)*y
553 //-------------------------------------------------
554
555 if (alpha != 1.0) {
556 temp = 1.0 / alpha;
557 fasp_blas_darray_ax(size, temp, y);
558 }
559
560 //-----------------------------------------------------------------
561 // y += A*x (Core Computation)
562 // each non-zero block elements are stored in row-major order
563 //-----------------------------------------------------------------
564
565 switch (nb) {
566 case 2:
567 {
568 if (use_openmp) {
569 INT myid, mybegin, myend;
570#ifdef _OPENMP
571#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py, iend)
572#endif
573 for (myid = 0; myid < nthreads; myid++) {
574 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
575 for (i = mybegin; i < myend; ++i) {
576 py0 = &y[i * 2];
577 iend = IA[i + 1];
578 for (k = IA[i]; k < iend; ++k) {
579 j = JA[k];
580 pA = val + k * 4; // &val[k*jump];
581 px0 = x + j * 2; // &x[j*nb];
582 py = py0;
583 fasp_blas_smat_ypAx_nc2(pA, px0, py);
584 }
585 }
586 }
587 } else {
588 for (i = 0; i < ROW; ++i) {
589 py0 = &y[i * 2];
590 iend = IA[i + 1];
591 for (k = IA[i]; k < iend; ++k) {
592 j = JA[k];
593 pA = val + k * 4; // &val[k*jump];
594 px0 = x + j * 2; // &x[j*nb];
595 py = py0;
596 fasp_blas_smat_ypAx_nc2(pA, px0, py);
597 }
598 }
599 }
600 }
601 break;
602
603 case 3:
604 {
605 if (use_openmp) {
606 INT myid, mybegin, myend;
607#ifdef _OPENMP
608#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py, iend)
609#endif
610 for (myid = 0; myid < nthreads; myid++) {
611 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
612 for (i = mybegin; i < myend; ++i) {
613 py0 = &y[i * 3];
614 iend = IA[i + 1];
615 for (k = IA[i]; k < iend; ++k) {
616 j = JA[k];
617 pA = val + k * 9; // &val[k*jump];
618 px0 = x + j * 3; // &x[j*nb];
619 py = py0;
620 fasp_blas_smat_ypAx_nc3(pA, px0, py);
621 }
622 }
623 }
624 } else {
625 for (i = 0; i < ROW; ++i) {
626 py0 = &y[i * 3];
627 iend = IA[i + 1];
628 for (k = IA[i]; k < iend; ++k) {
629 j = JA[k];
630 pA = val + k * 9; // &val[k*jump];
631 px0 = x + j * 3; // &x[j*nb];
632 py = py0;
633 fasp_blas_smat_ypAx_nc3(pA, px0, py);
634 }
635 }
636 }
637 }
638 break;
639
640 case 5:
641 {
642 if (use_openmp) {
643 INT myid, mybegin, myend;
644#ifdef _OPENMP
645#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py, iend)
646#endif
647 for (myid = 0; myid < nthreads; myid++) {
648 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
649 for (i = mybegin; i < myend; ++i) {
650 py0 = &y[i * 5];
651 iend = IA[i + 1];
652 for (k = IA[i]; k < iend; ++k) {
653 j = JA[k];
654 pA = val + k * 25; // &val[k*jump];
655 px0 = x + j * 5; // &x[j*nb];
656 py = py0;
657 fasp_blas_smat_ypAx_nc5(pA, px0, py);
658 }
659 }
660 }
661 } else {
662 for (i = 0; i < ROW; ++i) {
663 py0 = &y[i * 5];
664 iend = IA[i + 1];
665 for (k = IA[i]; k < iend; ++k) {
666 j = JA[k];
667 pA = val + k * 25; // &val[k*jump];
668 px0 = x + j * 5; // &x[j*nb];
669 py = py0;
670 fasp_blas_smat_ypAx_nc5(pA, px0, py);
671 }
672 }
673 }
674 }
675 break;
676
677 case 7:
678 {
679 if (use_openmp) {
680 INT myid, mybegin, myend;
681#ifdef _OPENMP
682#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py, iend)
683#endif
684 for (myid = 0; myid < nthreads; myid++) {
685 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
686 for (i = mybegin; i < myend; ++i) {
687 py0 = &y[i * 7];
688 iend = IA[i + 1];
689 for (k = IA[i]; k < iend; ++k) {
690 j = JA[k];
691 pA = val + k * 49; // &val[k*jump];
692 px0 = x + j * 7; // &x[j*nb];
693 py = py0;
694 fasp_blas_smat_ypAx_nc7(pA, px0, py);
695 }
696 }
697 }
698 } else {
699 for (i = 0; i < ROW; ++i) {
700 py0 = &y[i * 7];
701 iend = IA[i + 1];
702 for (k = IA[i]; k < iend; ++k) {
703 j = JA[k];
704 pA = val + k * 49; // &val[k*jump];
705 px0 = x + j * 7; // &x[j*nb];
706 py = py0;
707 fasp_blas_smat_ypAx_nc7(pA, px0, py);
708 }
709 }
710 }
711 }
712 break;
713
714 default:
715 {
716 if (use_openmp) {
717 INT myid, mybegin, myend;
718#ifdef _OPENMP
719#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py, iend)
720#endif
721 for (myid = 0; myid < nthreads; myid++) {
722 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
723 for (i = mybegin; i < myend; ++i) {
724 py0 = &y[i * nb];
725 iend = IA[i + 1];
726 for (k = IA[i]; k < iend; ++k) {
727 j = JA[k];
728 pA = val + k * jump; // &val[k*jump];
729 px0 = x + j * nb; // &x[j*nb];
730 py = py0;
731 fasp_blas_smat_ypAx(pA, px0, py, nb);
732 }
733 }
734 }
735 } else {
736 for (i = 0; i < ROW; ++i) {
737 py0 = &y[i * nb];
738 iend = IA[i + 1];
739 for (k = IA[i]; k < iend; ++k) {
740 j = JA[k];
741 pA = val + k * jump; // &val[k*jump];
742 px0 = x + j * nb; // &x[j*nb];
743 py = py0;
744 fasp_blas_smat_ypAx(pA, px0, py, nb);
745 }
746 }
747 }
748 }
749 break;
750 }
751
752 //------------------------------------------
753 // y = alpha*y
754 //------------------------------------------
755
756 if (alpha != 1.0) {
757 fasp_blas_darray_ax(size, alpha, y);
758 }
759 return;
760}
761
779 const dBSRmat* A,
780 const REAL* x,
781 REAL* y)
782{
783 /* members of A */
784 const INT ROW = A->ROW;
785 const INT nb = A->nb;
786 const INT* IA = A->IA;
787 const INT* JA = A->JA;
788
789 /* local variables */
790 const REAL* px0 = NULL;
791 REAL * py0 = NULL, *py = NULL;
792 SHORT nthreads = 1, use_openmp = FALSE;
793
794 INT size = ROW * nb;
795 INT i, j, k, iend;
796 REAL temp = 0.0;
797
798#ifdef _OPENMP
799 if (ROW > OPENMP_HOLDS) {
800 use_openmp = TRUE;
801 nthreads = fasp_get_num_threads();
802 }
803#endif
804
805 //----------------------------------------------
806 // Treat (alpha == 0.0) computation
807 //----------------------------------------------
808
809 if (alpha == 0.0) {
810 return; // Nothing to compute
811 }
812
813 //-------------------------------------------------
814 // y = (1.0/alpha)*y
815 //-------------------------------------------------
816
817 if (alpha != 1.0) {
818 temp = 1.0 / alpha;
819 fasp_blas_darray_ax(size, temp, y);
820 }
821
822 //-----------------------------------------------------------------
823 // y += A*x (Core Computation)
824 // each non-zero block elements are stored in row-major order
825 //-----------------------------------------------------------------
826
827 switch (nb) {
828 case 2:
829 {
830 if (use_openmp) {
831 INT myid, mybegin, myend;
832#ifdef _OPENMP
833#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, px0, py, iend)
834#endif
835 for (myid = 0; myid < nthreads; myid++) {
836 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
837 for (i = mybegin; i < myend; ++i) {
838 py0 = &y[i * 2];
839 iend = IA[i + 1];
840 for (k = IA[i]; k < iend; ++k) {
841 j = JA[k];
842 px0 = x + j * 2; // &x[j*nb];
843 py = py0;
844 py[0] += px0[0];
845 py[1] += px0[1];
846 }
847 }
848 }
849 } else {
850 for (i = 0; i < ROW; ++i) {
851 py0 = &y[i * 2];
852 iend = IA[i + 1];
853 for (k = IA[i]; k < iend; ++k) {
854 j = JA[k];
855 px0 = x + j * 2; // &x[j*nb];
856 py = py0;
857 py[0] += px0[0];
858 py[1] += px0[1];
859 }
860 }
861 }
862 }
863 break;
864
865 case 3:
866 {
867 if (use_openmp) {
868 INT myid, mybegin, myend;
869#ifdef _OPENMP
870#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, px0, py, iend)
871#endif
872 for (myid = 0; myid < nthreads; myid++) {
873 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
874 for (i = mybegin; i < myend; ++i) {
875 py0 = &y[i * 3];
876 iend = IA[i + 1];
877 for (k = IA[i]; k < iend; ++k) {
878 j = JA[k];
879 px0 = x + j * 3; // &x[j*nb];
880 py = py0;
881 py[0] += px0[0];
882 py[1] += px0[1];
883 py[2] += px0[2];
884 }
885 }
886 }
887 } else {
888 for (i = 0; i < ROW; ++i) {
889 py0 = &y[i * 3];
890 iend = IA[i + 1];
891 for (k = IA[i]; k < iend; ++k) {
892 j = JA[k];
893 px0 = x + j * 3; // &x[j*nb];
894 py = py0;
895 py[0] += px0[0];
896 py[1] += px0[1];
897 py[2] += px0[2];
898 }
899 }
900 }
901 }
902 break;
903
904 case 5:
905 {
906 if (use_openmp) {
907 INT myid, mybegin, myend;
908#ifdef _OPENMP
909#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, px0, py, iend)
910#endif
911 for (myid = 0; myid < nthreads; myid++) {
912 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
913 for (i = mybegin; i < myend; ++i) {
914 py0 = &y[i * 5];
915 iend = IA[i + 1];
916 for (k = IA[i]; k < iend; ++k) {
917 j = JA[k];
918 px0 = x + j * 5; // &x[j*nb];
919 py = py0;
920 py[0] += px0[0];
921 py[1] += px0[1];
922 py[2] += px0[2];
923 py[3] += px0[3];
924 py[4] += px0[4];
925 }
926 }
927 }
928 } else {
929 for (i = 0; i < ROW; ++i) {
930 py0 = &y[i * 5];
931 iend = IA[i + 1];
932 for (k = IA[i]; k < iend; ++k) {
933 j = JA[k];
934 px0 = x + j * 5; // &x[j*nb];
935 py = py0;
936 py[0] += px0[0];
937 py[1] += px0[1];
938 py[2] += px0[2];
939 py[3] += px0[3];
940 py[4] += px0[4];
941 }
942 }
943 }
944 }
945 break;
946
947 case 7:
948 {
949 if (use_openmp) {
950 INT myid, mybegin, myend;
951#ifdef _OPENMP
952#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, px0, py, iend)
953#endif
954 for (myid = 0; myid < nthreads; myid++) {
955 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
956 for (i = mybegin; i < myend; ++i) {
957 py0 = &y[i * 7];
958 iend = IA[i + 1];
959 for (k = IA[i]; k < iend; ++k) {
960 j = JA[k];
961 px0 = x + j * 7; // &x[j*nb];
962 py = py0;
963 py[0] += px0[0];
964 py[1] += px0[1];
965 py[2] += px0[2];
966 py[3] += px0[3];
967 py[4] += px0[4];
968 py[5] += px0[5];
969 py[6] += px0[6];
970 }
971 }
972 }
973 } else {
974 for (i = 0; i < ROW; ++i) {
975 py0 = &y[i * 7];
976 iend = IA[i + 1];
977 for (k = IA[i]; k < iend; ++k) {
978 j = JA[k];
979 px0 = x + j * 7; // &x[j*nb];
980 py = py0;
981 py[0] += px0[0];
982 py[1] += px0[1];
983 py[2] += px0[2];
984 py[3] += px0[3];
985 py[4] += px0[4];
986 py[5] += px0[5];
987 py[6] += px0[6];
988 }
989 }
990 }
991 }
992 break;
993
994 default:
995 {
996 if (use_openmp) {
997 INT myid, mybegin, myend;
998#ifdef _OPENMP
999#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, px0, py, iend)
1000#endif
1001 for (myid = 0; myid < nthreads; myid++) {
1002 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
1003 for (i = mybegin; i < myend; ++i) {
1004 py0 = &y[i * nb];
1005 iend = IA[i + 1];
1006 for (k = IA[i]; k < iend; ++k) {
1007 j = JA[k];
1008 px0 = x + j * nb; // &x[j*nb];
1009 py = py0;
1010 fasp_blas_darray_axpy(nb, 1.0, px0, py);
1011 }
1012 }
1013 }
1014 } else {
1015 for (i = 0; i < ROW; ++i) {
1016 py0 = &y[i * nb];
1017 iend = IA[i + 1];
1018 for (k = IA[i]; k < iend; ++k) {
1019 j = JA[k];
1020 px0 = x + j * nb; // &x[j*nb];
1021 py = py0;
1022 fasp_blas_darray_axpy(nb, 1.0, px0, py);
1023 }
1024 }
1025 }
1026 }
1027 break;
1028 }
1029
1030 //------------------------------------------
1031 // y = alpha*y
1032 //------------------------------------------
1033
1034 if (alpha != 1.0) fasp_blas_darray_ax(size, alpha, y);
1035
1036 return;
1037}
1038
1055void fasp_blas_dbsr_mxv(const dBSRmat* A, const REAL* x, REAL* y)
1056{
1057 /* members of A */
1058 const INT ROW = A->ROW;
1059 const INT nb = A->nb;
1060 const INT* IA = A->IA;
1061 const INT* JA = A->JA;
1062 const REAL* val = A->val;
1063
1064 /* local variables */
1065 INT size = ROW * nb;
1066 INT jump = nb * nb;
1067 INT i, j, k, num_nnz_row;
1068
1069 const REAL* pA = NULL;
1070 const REAL* px0 = NULL;
1071 REAL* py0 = NULL;
1072 REAL* py = NULL;
1073
1074 SHORT use_openmp = FALSE;
1075
1076#ifdef _OPENMP
1077 INT myid, mybegin, myend, nthreads;
1078 if (ROW > OPENMP_HOLDS) {
1079 use_openmp = TRUE;
1080 nthreads = fasp_get_num_threads();
1081 }
1082#endif
1083
1084 //-----------------------------------------------------------------
1085 // zero out 'y'
1086 //-----------------------------------------------------------------
1087 fasp_darray_set(size, y, 0.0);
1088
1089 //-----------------------------------------------------------------
1090 // y = A*x (Core Computation)
1091 // each non-zero block elements are stored in row-major order
1092 //-----------------------------------------------------------------
1093
1094 switch (nb) {
1095 case 3:
1096 {
1097 if (use_openmp) {
1098#ifdef _OPENMP
1099#pragma omp parallel private(myid, mybegin, myend, i, py0, num_nnz_row, k, j, pA, px0, \
1100 py)
1101 {
1102 myid = omp_get_thread_num();
1103 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
1104 for (i = mybegin; i < myend; ++i) {
1105 py0 = &y[i * 3];
1106 num_nnz_row = IA[i + 1] - IA[i];
1107 switch (num_nnz_row) {
1108 case 3:
1109 k = IA[i];
1110 j = JA[k];
1111 pA = val + k * 9;
1112 px0 = x + j * 3;
1113 py = py0;
1114 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1115
1116 k++;
1117 j = JA[k];
1118 pA = val + k * 9;
1119 px0 = x + j * 3;
1120 py = py0;
1121 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1122
1123 k++;
1124 j = JA[k];
1125 pA = val + k * 9;
1126 px0 = x + j * 3;
1127 py = py0;
1128 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1129
1130 break;
1131
1132 case 4:
1133 k = IA[i];
1134 j = JA[k];
1135 pA = val + k * 9;
1136 px0 = x + j * 3;
1137 py = py0;
1138 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1139
1140 k++;
1141 j = JA[k];
1142 pA = val + k * 9;
1143 px0 = x + j * 3;
1144 py = py0;
1145 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1146
1147 k++;
1148 j = JA[k];
1149 pA = val + k * 9;
1150 px0 = x + j * 3;
1151 py = py0;
1152 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1153
1154 k++;
1155 j = JA[k];
1156 pA = val + k * 9;
1157 px0 = x + j * 3;
1158 py = py0;
1159 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1160
1161 break;
1162
1163 case 5:
1164 k = IA[i];
1165 j = JA[k];
1166 pA = val + k * 9;
1167 px0 = x + j * 3;
1168 py = py0;
1169 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1170
1171 k++;
1172 j = JA[k];
1173 pA = val + k * 9;
1174 px0 = x + j * 3;
1175 py = py0;
1176 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1177
1178 k++;
1179 j = JA[k];
1180 pA = val + k * 9;
1181 px0 = x + j * 3;
1182 py = py0;
1183 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1184
1185 k++;
1186 j = JA[k];
1187 pA = val + k * 9;
1188 px0 = x + j * 3;
1189 py = py0;
1190 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1191
1192 k++;
1193 j = JA[k];
1194 pA = val + k * 9;
1195 px0 = x + j * 3;
1196 py = py0;
1197 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1198
1199 break;
1200
1201 case 6:
1202 k = IA[i];
1203 j = JA[k];
1204 pA = val + k * 9;
1205 px0 = x + j * 3;
1206 py = py0;
1207 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1208
1209 k++;
1210 j = JA[k];
1211 pA = val + k * 9;
1212 px0 = x + j * 3;
1213 py = py0;
1214 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1215
1216 k++;
1217 j = JA[k];
1218 pA = val + k * 9;
1219 px0 = x + j * 3;
1220 py = py0;
1221 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1222
1223 k++;
1224 j = JA[k];
1225 pA = val + k * 9;
1226 px0 = x + j * 3;
1227 py = py0;
1228 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1229
1230 k++;
1231 j = JA[k];
1232 pA = val + k * 9;
1233 px0 = x + j * 3;
1234 py = py0;
1235 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1236
1237 k++;
1238 j = JA[k];
1239 pA = val + k * 9;
1240 px0 = x + j * 3;
1241 py = py0;
1242 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1243
1244 break;
1245
1246 case 7:
1247 k = IA[i];
1248 j = JA[k];
1249 pA = val + k * 9;
1250 px0 = x + j * 3;
1251 py = py0;
1252 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1253
1254 k++;
1255 j = JA[k];
1256 pA = val + k * 9;
1257 px0 = x + j * 3;
1258 py = py0;
1259 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1260
1261 k++;
1262 j = JA[k];
1263 pA = val + k * 9;
1264 px0 = x + j * 3;
1265 py = py0;
1266 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1267
1268 k++;
1269 j = JA[k];
1270 pA = val + k * 9;
1271 px0 = x + j * 3;
1272 py = py0;
1273 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1274
1275 k++;
1276 j = JA[k];
1277 pA = val + k * 9;
1278 px0 = x + j * 3;
1279 py = py0;
1280 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1281
1282 k++;
1283 j = JA[k];
1284 pA = val + k * 9;
1285 px0 = x + j * 3;
1286 py = py0;
1287 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1288
1289 k++;
1290 j = JA[k];
1291 pA = val + k * 9;
1292 px0 = x + j * 3;
1293 py = py0;
1294 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1295
1296 break;
1297
1298 default:
1299 for (k = IA[i]; k < IA[i + 1]; ++k) {
1300 j = JA[k];
1301 pA = val + k * 9;
1302 px0 = x + j * 3;
1303 py = py0;
1304 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1305 }
1306 break;
1307 }
1308 }
1309 }
1310#endif
1311 } else {
1312 for (i = 0; i < ROW; ++i) {
1313 py0 = &y[i * 3];
1314 num_nnz_row = IA[i + 1] - IA[i];
1315 switch (num_nnz_row) {
1316 case 3:
1317 k = IA[i];
1318 j = JA[k];
1319 pA = val + k * 9; // &val[k*jump];
1320 px0 = x + j * 3; // &x[j*nb];
1321 py = py0;
1322 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1323
1324 k++;
1325 j = JA[k];
1326 pA = val + k * 9; // &val[k*jump];
1327 px0 = x + j * 3; // &x[j*nb];
1328 py = py0;
1329 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1330
1331 k++;
1332 j = JA[k];
1333 pA = val + k * 9; // &val[k*jump];
1334 px0 = x + j * 3; // &x[j*nb];
1335 py = py0;
1336 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1337
1338 break;
1339
1340 case 4:
1341 k = IA[i];
1342 j = JA[k];
1343 pA = val + k * 9; // &val[k*jump];
1344 px0 = x + j * 3; // &x[j*nb];
1345 py = py0;
1346 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1347
1348 k++;
1349 j = JA[k];
1350 pA = val + k * 9; // &val[k*jump];
1351 px0 = x + j * 3; // &x[j*nb];
1352 py = py0;
1353 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1354
1355 k++;
1356 j = JA[k];
1357 pA = val + k * 9; // &val[k*jump];
1358 px0 = x + j * 3; // &x[j*nb];
1359 py = py0;
1360 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1361
1362 k++;
1363 j = JA[k];
1364 pA = val + k * 9; // &val[k*jump];
1365 px0 = x + j * 3; // &x[j*nb];
1366 py = py0;
1367 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1368
1369 break;
1370
1371 case 5:
1372 k = IA[i];
1373 j = JA[k];
1374 pA = val + k * 9; // &val[k*jump];
1375 px0 = x + j * 3; // &x[j*nb];
1376 py = py0;
1377 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1378
1379 k++;
1380 j = JA[k];
1381 pA = val + k * 9; // &val[k*jump];
1382 px0 = x + j * 3; // &x[j*nb];
1383 py = py0;
1384 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1385
1386 k++;
1387 j = JA[k];
1388 pA = val + k * 9; // &val[k*jump];
1389 px0 = x + j * 3; // &x[j*nb];
1390 py = py0;
1391 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1392
1393 k++;
1394 j = JA[k];
1395 pA = val + k * 9; // &val[k*jump];
1396 px0 = x + j * 3; // &x[j*nb];
1397 py = py0;
1398 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1399
1400 k++;
1401 j = JA[k];
1402 pA = val + k * 9; // &val[k*jump];
1403 px0 = x + j * 3; // &x[j*nb];
1404 py = py0;
1405 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1406
1407 break;
1408
1409 case 6:
1410 k = IA[i];
1411 j = JA[k];
1412 pA = val + k * 9; // &val[k*jump];
1413 px0 = x + j * 3; // &x[j*nb];
1414 py = py0;
1415 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1416
1417 k++;
1418 j = JA[k];
1419 pA = val + k * 9; // &val[k*jump];
1420 px0 = x + j * 3; // &x[j*nb];
1421 py = py0;
1422 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1423
1424 k++;
1425 j = JA[k];
1426 pA = val + k * 9; // &val[k*jump];
1427 px0 = x + j * 3; // &x[j*nb];
1428 py = py0;
1429 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1430
1431 k++;
1432 j = JA[k];
1433 pA = val + k * 9; // &val[k*jump];
1434 px0 = x + j * 3; // &x[j*nb];
1435 py = py0;
1436 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1437
1438 k++;
1439 j = JA[k];
1440 pA = val + k * 9; // &val[k*jump];
1441 px0 = x + j * 3; // &x[j*nb];
1442 py = py0;
1443 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1444
1445 k++;
1446 j = JA[k];
1447 pA = val + k * 9; // &val[k*jump];
1448 px0 = x + j * 3; // &x[j*nb];
1449 py = py0;
1450 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1451
1452 break;
1453
1454 case 7:
1455 k = IA[i];
1456 j = JA[k];
1457 pA = val + k * 9; // &val[k*jump];
1458 px0 = x + j * 3; // &x[j*nb];
1459 py = py0;
1460 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1461
1462 k++;
1463 j = JA[k];
1464 pA = val + k * 9; // &val[k*jump];
1465 px0 = x + j * 3; // &x[j*nb];
1466 py = py0;
1467 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1468
1469 k++;
1470 j = JA[k];
1471 pA = val + k * 9; // &val[k*jump];
1472 px0 = x + j * 3; // &x[j*nb];
1473 py = py0;
1474 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1475
1476 k++;
1477 j = JA[k];
1478 pA = val + k * 9; // &val[k*jump];
1479 px0 = x + j * 3; // &x[j*nb];
1480 py = py0;
1481 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1482
1483 k++;
1484 j = JA[k];
1485 pA = val + k * 9; // &val[k*jump];
1486 px0 = x + j * 3; // &x[j*nb];
1487 py = py0;
1488 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1489
1490 k++;
1491 j = JA[k];
1492 pA = val + k * 9; // &val[k*jump];
1493 px0 = x + j * 3; // &x[j*nb];
1494 py = py0;
1495 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1496
1497 k++;
1498 j = JA[k];
1499 pA = val + k * 9; // &val[k*jump];
1500 px0 = x + j * 3; // &x[j*nb];
1501 py = py0;
1502 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1503
1504 break;
1505
1506 default:
1507 for (k = IA[i]; k < IA[i + 1]; ++k) {
1508 j = JA[k];
1509 pA = val + k * 9; // &val[k*jump];
1510 px0 = x + j * 3; // &x[j*nb];
1511 py = py0;
1512 fasp_blas_smat_ypAx_nc3(pA, px0, py);
1513 }
1514 break;
1515 }
1516 }
1517 }
1518 }
1519 break;
1520
1521 case 5:
1522 {
1523 if (use_openmp) {
1524#ifdef _OPENMP
1525#pragma omp parallel private(myid, mybegin, myend, i, py0, num_nnz_row, k, j, pA, px0, \
1526 py)
1527 {
1528 myid = omp_get_thread_num();
1529 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
1530 for (i = mybegin; i < myend; ++i) {
1531 py0 = &y[i * 5];
1532 num_nnz_row = IA[i + 1] - IA[i];
1533 switch (num_nnz_row) {
1534 case 3:
1535 k = IA[i];
1536 j = JA[k];
1537 pA = val + k * 25; // &val[k*jump];
1538 px0 = x + j * 5; // &x[j*nb];
1539 py = py0;
1540 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1541
1542 k++;
1543 j = JA[k];
1544 pA = val + k * 25; // &val[k*jump];
1545 px0 = x + j * 5; // &x[j*nb];
1546 py = py0;
1547 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1548
1549 k++;
1550 j = JA[k];
1551 pA = val + k * 25; // &val[k*jump];
1552 px0 = x + j * 5; // &x[j*nb];
1553 py = py0;
1554 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1555
1556 break;
1557
1558 case 4:
1559 k = IA[i];
1560 j = JA[k];
1561 pA = val + k * 25; // &val[k*jump];
1562 px0 = x + j * 5; // &x[j*nb];
1563 py = py0;
1564 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1565
1566 k++;
1567 j = JA[k];
1568 pA = val + k * 25; // &val[k*jump];
1569 px0 = x + j * 5; // &x[j*nb];
1570 py = py0;
1571 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1572
1573 k++;
1574 j = JA[k];
1575 pA = val + k * 25; // &val[k*jump];
1576 px0 = x + j * 5; // &x[j*nb];
1577 py = py0;
1578 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1579
1580 k++;
1581 j = JA[k];
1582 pA = val + k * 25; // &val[k*jump];
1583 px0 = x + j * 5; // &x[j*nb];
1584 py = py0;
1585 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1586
1587 break;
1588
1589 case 5:
1590 k = IA[i];
1591 j = JA[k];
1592 pA = val + k * 25; // &val[k*jump];
1593 px0 = x + j * 5; // &x[j*nb];
1594 py = py0;
1595 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1596
1597 k++;
1598 j = JA[k];
1599 pA = val + k * 25; // &val[k*jump];
1600 px0 = x + j * 5; // &x[j*nb];
1601 py = py0;
1602 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1603
1604 k++;
1605 j = JA[k];
1606 pA = val + k * 25; // &val[k*jump];
1607 px0 = x + j * 5; // &x[j*nb];
1608 py = py0;
1609 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1610
1611 k++;
1612 j = JA[k];
1613 pA = val + k * 25; // &val[k*jump];
1614 px0 = x + j * 5; // &x[j*nb];
1615 py = py0;
1616 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1617
1618 k++;
1619 j = JA[k];
1620 pA = val + k * 25; // &val[k*jump];
1621 px0 = x + j * 5; // &x[j*nb];
1622 py = py0;
1623 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1624
1625 break;
1626
1627 case 6:
1628 k = IA[i];
1629 j = JA[k];
1630 pA = val + k * 25; // &val[k*jump];
1631 px0 = x + j * 5; // &x[j*nb];
1632 py = py0;
1633 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1634
1635 k++;
1636 j = JA[k];
1637 pA = val + k * 25; // &val[k*jump];
1638 px0 = x + j * 5; // &x[j*nb];
1639 py = py0;
1640 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1641
1642 k++;
1643 j = JA[k];
1644 pA = val + k * 25; // &val[k*jump];
1645 px0 = x + j * 5; // &x[j*nb];
1646 py = py0;
1647 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1648
1649 k++;
1650 j = JA[k];
1651 pA = val + k * 25; // &val[k*jump];
1652 px0 = x + j * 5; // &x[j*nb];
1653 py = py0;
1654 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1655
1656 k++;
1657 j = JA[k];
1658 pA = val + k * 25; // &val[k*jump];
1659 px0 = x + j * 5; // &x[j*nb];
1660 py = py0;
1661 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1662
1663 k++;
1664 j = JA[k];
1665 pA = val + k * 25; // &val[k*jump];
1666 px0 = x + j * 5; // &x[j*nb];
1667 py = py0;
1668 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1669
1670 break;
1671
1672 case 7:
1673 k = IA[i];
1674 j = JA[k];
1675 pA = val + k * 25; // &val[k*jump];
1676 px0 = x + j * 5; // &x[j*nb];
1677 py = py0;
1678 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1679
1680 k++;
1681 j = JA[k];
1682 pA = val + k * 25; // &val[k*jump];
1683 px0 = x + j * 5; // &x[j*nb];
1684 py = py0;
1685 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1686
1687 k++;
1688 j = JA[k];
1689 pA = val + k * 25; // &val[k*jump];
1690 px0 = x + j * 5; // &x[j*nb];
1691 py = py0;
1692 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1693
1694 k++;
1695 j = JA[k];
1696 pA = val + k * 25; // &val[k*jump];
1697 px0 = x + j * 5; // &x[j*nb];
1698 py = py0;
1699 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1700
1701 k++;
1702 j = JA[k];
1703 pA = val + k * 25; // &val[k*jump];
1704 px0 = x + j * 5; // &x[j*nb];
1705 py = py0;
1706 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1707
1708 k++;
1709 j = JA[k];
1710 pA = val + k * 25; // &val[k*jump];
1711 px0 = x + j * 5; // &x[j*nb];
1712 py = py0;
1713 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1714
1715 k++;
1716 j = JA[k];
1717 pA = val + k * 25; // &val[k*jump];
1718 px0 = x + j * 5; // &x[j*nb];
1719 py = py0;
1720 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1721
1722 break;
1723
1724 default:
1725 for (k = IA[i]; k < IA[i + 1]; ++k) {
1726 j = JA[k];
1727 pA = val + k * 25; // &val[k*jump];
1728 px0 = x + j * 5; // &x[j*nb];
1729 py = py0;
1730 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1731 }
1732 break;
1733 }
1734 }
1735 }
1736#endif
1737 } else {
1738 for (i = 0; i < ROW; ++i) {
1739 py0 = &y[i * 5];
1740 num_nnz_row = IA[i + 1] - IA[i];
1741 switch (num_nnz_row) {
1742 case 3:
1743 k = IA[i];
1744 j = JA[k];
1745 pA = val + k * 25; // &val[k*jump];
1746 px0 = x + j * 5; // &x[j*nb];
1747 py = py0;
1748 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1749
1750 k++;
1751 j = JA[k];
1752 pA = val + k * 25; // &val[k*jump];
1753 px0 = x + j * 5; // &x[j*nb];
1754 py = py0;
1755 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1756
1757 k++;
1758 j = JA[k];
1759 pA = val + k * 25; // &val[k*jump];
1760 px0 = x + j * 5; // &x[j*nb];
1761 py = py0;
1762 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1763
1764 break;
1765
1766 case 4:
1767 k = IA[i];
1768 j = JA[k];
1769 pA = val + k * 25; // &val[k*jump];
1770 px0 = x + j * 5; // &x[j*nb];
1771 py = py0;
1772 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1773
1774 k++;
1775 j = JA[k];
1776 pA = val + k * 25; // &val[k*jump];
1777 px0 = x + j * 5; // &x[j*nb];
1778 py = py0;
1779 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1780
1781 k++;
1782 j = JA[k];
1783 pA = val + k * 25; // &val[k*jump];
1784 px0 = x + j * 5; // &x[j*nb];
1785 py = py0;
1786 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1787
1788 k++;
1789 j = JA[k];
1790 pA = val + k * 25; // &val[k*jump];
1791 px0 = x + j * 5; // &x[j*nb];
1792 py = py0;
1793 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1794
1795 break;
1796
1797 case 5:
1798 k = IA[i];
1799 j = JA[k];
1800 pA = val + k * 25; // &val[k*jump];
1801 px0 = x + j * 5; // &x[j*nb];
1802 py = py0;
1803 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1804
1805 k++;
1806 j = JA[k];
1807 pA = val + k * 25; // &val[k*jump];
1808 px0 = x + j * 5; // &x[j*nb];
1809 py = py0;
1810 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1811
1812 k++;
1813 j = JA[k];
1814 pA = val + k * 25; // &val[k*jump];
1815 px0 = x + j * 5; // &x[j*nb];
1816 py = py0;
1817 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1818
1819 k++;
1820 j = JA[k];
1821 pA = val + k * 25; // &val[k*jump];
1822 px0 = x + j * 5; // &x[j*nb];
1823 py = py0;
1824 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1825
1826 k++;
1827 j = JA[k];
1828 pA = val + k * 25; // &val[k*jump];
1829 px0 = x + j * 5; // &x[j*nb];
1830 py = py0;
1831 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1832
1833 break;
1834
1835 case 6:
1836 k = IA[i];
1837 j = JA[k];
1838 pA = val + k * 25; // &val[k*jump];
1839 px0 = x + j * 5; // &x[j*nb];
1840 py = py0;
1841 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1842
1843 k++;
1844 j = JA[k];
1845 pA = val + k * 25; // &val[k*jump];
1846 px0 = x + j * 5; // &x[j*nb];
1847 py = py0;
1848 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1849
1850 k++;
1851 j = JA[k];
1852 pA = val + k * 25; // &val[k*jump];
1853 px0 = x + j * 5; // &x[j*nb];
1854 py = py0;
1855 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1856
1857 k++;
1858 j = JA[k];
1859 pA = val + k * 25; // &val[k*jump];
1860 px0 = x + j * 5; // &x[j*nb];
1861 py = py0;
1862 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1863
1864 k++;
1865 j = JA[k];
1866 pA = val + k * 25; // &val[k*jump];
1867 px0 = x + j * 5; // &x[j*nb];
1868 py = py0;
1869 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1870
1871 k++;
1872 j = JA[k];
1873 pA = val + k * 25; // &val[k*jump];
1874 px0 = x + j * 5; // &x[j*nb];
1875 py = py0;
1876 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1877
1878 break;
1879
1880 case 7:
1881 k = IA[i];
1882 j = JA[k];
1883 pA = val + k * 25; // &val[k*jump];
1884 px0 = x + j * 5; // &x[j*nb];
1885 py = py0;
1886 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1887
1888 k++;
1889 j = JA[k];
1890 pA = val + k * 25; // &val[k*jump];
1891 px0 = x + j * 5; // &x[j*nb];
1892 py = py0;
1893 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1894
1895 k++;
1896 j = JA[k];
1897 pA = val + k * 25; // &val[k*jump];
1898 px0 = x + j * 5; // &x[j*nb];
1899 py = py0;
1900 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1901
1902 k++;
1903 j = JA[k];
1904 pA = val + k * 25; // &val[k*jump];
1905 px0 = x + j * 5; // &x[j*nb];
1906 py = py0;
1907 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1908
1909 k++;
1910 j = JA[k];
1911 pA = val + k * 25; // &val[k*jump];
1912 px0 = x + j * 5; // &x[j*nb];
1913 py = py0;
1914 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1915
1916 k++;
1917 j = JA[k];
1918 pA = val + k * 25; // &val[k*jump];
1919 px0 = x + j * 5; // &x[j*nb];
1920 py = py0;
1921 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1922
1923 k++;
1924 j = JA[k];
1925 pA = val + k * 25; // &val[k*jump];
1926 px0 = x + j * 5; // &x[j*nb];
1927 py = py0;
1928 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1929
1930 break;
1931
1932 default:
1933 for (k = IA[i]; k < IA[i + 1]; ++k) {
1934 j = JA[k];
1935 pA = val + k * 25; // &val[k*jump];
1936 px0 = x + j * 5; // &x[j*nb];
1937 py = py0;
1938 fasp_blas_smat_ypAx_nc5(pA, px0, py);
1939 }
1940 break;
1941 }
1942 }
1943 }
1944 }
1945 break;
1946
1947 case 7:
1948 {
1949 if (use_openmp) {
1950#ifdef _OPENMP
1951#pragma omp parallel private(myid, mybegin, myend, i, py0, num_nnz_row, k, j, pA, px0, \
1952 py)
1953 {
1954 myid = omp_get_thread_num();
1955 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
1956 for (i = mybegin; i < myend; ++i) {
1957 py0 = &y[i * 7];
1958 num_nnz_row = IA[i + 1] - IA[i];
1959 switch (num_nnz_row) {
1960 case 3:
1961 k = IA[i];
1962 j = JA[k];
1963 pA = val + k * 49; // &val[k*jump];
1964 px0 = x + j * 7; // &x[j*nb];
1965 py = py0;
1966 fasp_blas_smat_ypAx_nc7(pA, px0, py);
1967
1968 k++;
1969 j = JA[k];
1970 pA = val + k * 49; // &val[k*jump];
1971 px0 = x + j * 7; // &x[j*nb];
1972 py = py0;
1973 fasp_blas_smat_ypAx_nc7(pA, px0, py);
1974
1975 k++;
1976 j = JA[k];
1977 pA = val + k * 49; // &val[k*jump];
1978 px0 = x + j * 7; // &x[j*nb];
1979 py = py0;
1980 fasp_blas_smat_ypAx_nc7(pA, px0, py);
1981
1982 break;
1983
1984 case 4:
1985 k = IA[i];
1986 j = JA[k];
1987 pA = val + k * 49; // &val[k*jump];
1988 px0 = x + j * 7; // &x[j*nb];
1989 py = py0;
1990 fasp_blas_smat_ypAx_nc7(pA, px0, py);
1991
1992 k++;
1993 j = JA[k];
1994 pA = val + k * 49; // &val[k*jump];
1995 px0 = x + j * 7; // &x[j*nb];
1996 py = py0;
1997 fasp_blas_smat_ypAx_nc7(pA, px0, py);
1998
1999 k++;
2000 j = JA[k];
2001 pA = val + k * 49; // &val[k*jump];
2002 px0 = x + j * 7; // &x[j*nb];
2003 py = py0;
2004 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2005
2006 k++;
2007 j = JA[k];
2008 pA = val + k * 49; // &val[k*jump];
2009 px0 = x + j * 7; // &x[j*nb];
2010 py = py0;
2011 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2012
2013 break;
2014
2015 case 5:
2016 k = IA[i];
2017 j = JA[k];
2018 pA = val + k * 49; // &val[k*jump];
2019 px0 = x + j * 7; // &x[j*nb];
2020 py = py0;
2021 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2022
2023 k++;
2024 j = JA[k];
2025 pA = val + k * 49; // &val[k*jump];
2026 px0 = x + j * 7; // &x[j*nb];
2027 py = py0;
2028 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2029
2030 k++;
2031 j = JA[k];
2032 pA = val + k * 49; // &val[k*jump];
2033 px0 = x + j * 7; // &x[j*nb];
2034 py = py0;
2035 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2036
2037 k++;
2038 j = JA[k];
2039 pA = val + k * 49; // &val[k*jump];
2040 px0 = x + j * 7; // &x[j*nb];
2041 py = py0;
2042 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2043
2044 k++;
2045 j = JA[k];
2046 pA = val + k * 49; // &val[k*jump];
2047 px0 = x + j * 7; // &x[j*nb];
2048 py = py0;
2049 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2050
2051 break;
2052
2053 case 6:
2054 k = IA[i];
2055 j = JA[k];
2056 pA = val + k * 49; // &val[k*jump];
2057 px0 = x + j * 7; // &x[j*nb];
2058 py = py0;
2059 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2060
2061 k++;
2062 j = JA[k];
2063 pA = val + k * 49; // &val[k*jump];
2064 px0 = x + j * 7; // &x[j*nb];
2065 py = py0;
2066 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2067
2068 k++;
2069 j = JA[k];
2070 pA = val + k * 49; // &val[k*jump];
2071 px0 = x + j * 7; // &x[j*nb];
2072 py = py0;
2073 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2074
2075 k++;
2076 j = JA[k];
2077 pA = val + k * 49; // &val[k*jump];
2078 px0 = x + j * 7; // &x[j*nb];
2079 py = py0;
2080 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2081
2082 k++;
2083 j = JA[k];
2084 pA = val + k * 49; // &val[k*jump];
2085 px0 = x + j * 7; // &x[j*nb];
2086 py = py0;
2087 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2088
2089 k++;
2090 j = JA[k];
2091 pA = val + k * 49; // &val[k*jump];
2092 px0 = x + j * 7; // &x[j*nb];
2093 py = py0;
2094 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2095
2096 break;
2097
2098 case 7:
2099 k = IA[i];
2100 j = JA[k];
2101 pA = val + k * 49; // &val[k*jump];
2102 px0 = x + j * 7; // &x[j*nb];
2103 py = py0;
2104 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2105
2106 k++;
2107 j = JA[k];
2108 pA = val + k * 49; // &val[k*jump];
2109 px0 = x + j * 7; // &x[j*nb];
2110 py = py0;
2111 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2112
2113 k++;
2114 j = JA[k];
2115 pA = val + k * 49; // &val[k*jump];
2116 px0 = x + j * 7; // &x[j*nb];
2117 py = py0;
2118 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2119
2120 k++;
2121 j = JA[k];
2122 pA = val + k * 49; // &val[k*jump];
2123 px0 = x + j * 7; // &x[j*nb];
2124 py = py0;
2125 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2126
2127 k++;
2128 j = JA[k];
2129 pA = val + k * 49; // &val[k*jump];
2130 px0 = x + j * 7; // &x[j*nb];
2131 py = py0;
2132 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2133
2134 k++;
2135 j = JA[k];
2136 pA = val + k * 49; // &val[k*jump];
2137 px0 = x + j * 7; // &x[j*nb];
2138 py = py0;
2139 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2140
2141 k++;
2142 j = JA[k];
2143 pA = val + k * 49; // &val[k*jump];
2144 px0 = x + j * 7; // &x[j*nb];
2145 py = py0;
2146 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2147
2148 break;
2149
2150 default:
2151 for (k = IA[i]; k < IA[i + 1]; ++k) {
2152 j = JA[k];
2153 pA = val + k * 49; // &val[k*jump];
2154 px0 = x + j * 7; // &x[j*nb];
2155 py = py0;
2156 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2157 }
2158 break;
2159 }
2160 }
2161 }
2162#endif
2163 } else {
2164 for (i = 0; i < ROW; ++i) {
2165 py0 = &y[i * 7];
2166 num_nnz_row = IA[i + 1] - IA[i];
2167 switch (num_nnz_row) {
2168 case 3:
2169 k = IA[i];
2170 j = JA[k];
2171 pA = val + k * 49; // &val[k*jump];
2172 px0 = x + j * 7; // &x[j*nb];
2173 py = py0;
2174 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2175
2176 k++;
2177 j = JA[k];
2178 pA = val + k * 49; // &val[k*jump];
2179 px0 = x + j * 7; // &x[j*nb];
2180 py = py0;
2181 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2182
2183 k++;
2184 j = JA[k];
2185 pA = val + k * 49; // &val[k*jump];
2186 px0 = x + j * 7; // &x[j*nb];
2187 py = py0;
2188 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2189
2190 break;
2191
2192 case 4:
2193 k = IA[i];
2194 j = JA[k];
2195 pA = val + k * 49; // &val[k*jump];
2196 px0 = x + j * 7; // &x[j*nb];
2197 py = py0;
2198 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2199
2200 k++;
2201 j = JA[k];
2202 pA = val + k * 49; // &val[k*jump];
2203 px0 = x + j * 7; // &x[j*nb];
2204 py = py0;
2205 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2206
2207 k++;
2208 j = JA[k];
2209 pA = val + k * 49; // &val[k*jump];
2210 px0 = x + j * 7; // &x[j*nb];
2211 py = py0;
2212 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2213
2214 k++;
2215 j = JA[k];
2216 pA = val + k * 49; // &val[k*jump];
2217 px0 = x + j * 7; // &x[j*nb];
2218 py = py0;
2219 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2220
2221 break;
2222
2223 case 5:
2224 k = IA[i];
2225 j = JA[k];
2226 pA = val + k * 49; // &val[k*jump];
2227 px0 = x + j * 7; // &x[j*nb];
2228 py = py0;
2229 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2230
2231 k++;
2232 j = JA[k];
2233 pA = val + k * 49; // &val[k*jump];
2234 px0 = x + j * 7; // &x[j*nb];
2235 py = py0;
2236 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2237
2238 k++;
2239 j = JA[k];
2240 pA = val + k * 49; // &val[k*jump];
2241 px0 = x + j * 7; // &x[j*nb];
2242 py = py0;
2243 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2244
2245 k++;
2246 j = JA[k];
2247 pA = val + k * 49; // &val[k*jump];
2248 px0 = x + j * 7; // &x[j*nb];
2249 py = py0;
2250 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2251
2252 k++;
2253 j = JA[k];
2254 pA = val + k * 49; // &val[k*jump];
2255 px0 = x + j * 7; // &x[j*nb];
2256 py = py0;
2257 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2258
2259 break;
2260
2261 case 6:
2262 k = IA[i];
2263 j = JA[k];
2264 pA = val + k * 49; // &val[k*jump];
2265 px0 = x + j * 7; // &x[j*nb];
2266 py = py0;
2267 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2268
2269 k++;
2270 j = JA[k];
2271 pA = val + k * 49; // &val[k*jump];
2272 px0 = x + j * 7; // &x[j*nb];
2273 py = py0;
2274 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2275
2276 k++;
2277 j = JA[k];
2278 pA = val + k * 49; // &val[k*jump];
2279 px0 = x + j * 7; // &x[j*nb];
2280 py = py0;
2281 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2282
2283 k++;
2284 j = JA[k];
2285 pA = val + k * 49; // &val[k*jump];
2286 px0 = x + j * 7; // &x[j*nb];
2287 py = py0;
2288 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2289
2290 k++;
2291 j = JA[k];
2292 pA = val + k * 49; // &val[k*jump];
2293 px0 = x + j * 7; // &x[j*nb];
2294 py = py0;
2295 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2296
2297 k++;
2298 j = JA[k];
2299 pA = val + k * 49; // &val[k*jump];
2300 px0 = x + j * 7; // &x[j*nb];
2301 py = py0;
2302 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2303
2304 break;
2305
2306 case 7:
2307 k = IA[i];
2308 j = JA[k];
2309 pA = val + k * 49; // &val[k*jump];
2310 px0 = x + j * 7; // &x[j*nb];
2311 py = py0;
2312 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2313
2314 k++;
2315 j = JA[k];
2316 pA = val + k * 49; // &val[k*jump];
2317 px0 = x + j * 7; // &x[j*nb];
2318 py = py0;
2319 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2320
2321 k++;
2322 j = JA[k];
2323 pA = val + k * 49; // &val[k*jump];
2324 px0 = x + j * 7; // &x[j*nb];
2325 py = py0;
2326 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2327
2328 k++;
2329 j = JA[k];
2330 pA = val + k * 49; // &val[k*jump];
2331 px0 = x + j * 7; // &x[j*nb];
2332 py = py0;
2333 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2334
2335 k++;
2336 j = JA[k];
2337 pA = val + k * 49; // &val[k*jump];
2338 px0 = x + j * 7; // &x[j*nb];
2339 py = py0;
2340 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2341
2342 k++;
2343 j = JA[k];
2344 pA = val + k * 49; // &val[k*jump];
2345 px0 = x + j * 7; // &x[j*nb];
2346 py = py0;
2347 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2348
2349 k++;
2350 j = JA[k];
2351 pA = val + k * 49; // &val[k*jump];
2352 px0 = x + j * 7; // &x[j*nb];
2353 py = py0;
2354 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2355
2356 break;
2357
2358 default:
2359 for (k = IA[i]; k < IA[i + 1]; ++k) {
2360 j = JA[k];
2361 pA = val + k * 49; // &val[k*jump];
2362 px0 = x + j * 7; // &x[j*nb];
2363 py = py0;
2364 fasp_blas_smat_ypAx_nc7(pA, px0, py);
2365 }
2366 break;
2367 }
2368 }
2369 }
2370 }
2371 break;
2372
2373 default:
2374 {
2375 if (use_openmp) {
2376#ifdef _OPENMP
2377#pragma omp parallel private(myid, mybegin, myend, i, py0, num_nnz_row, k, j, pA, px0, \
2378 py)
2379 {
2380 myid = omp_get_thread_num();
2381 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
2382 for (i = mybegin; i < myend; ++i) {
2383 py0 = &y[i * nb];
2384 num_nnz_row = IA[i + 1] - IA[i];
2385 switch (num_nnz_row) {
2386 case 3:
2387 k = IA[i];
2388 j = JA[k];
2389 pA = val + k * jump; // &val[k*jump];
2390 px0 = x + j * nb; // &x[j*nb];
2391 py = py0;
2392 fasp_blas_smat_ypAx(pA, px0, py, nb);
2393
2394 k++;
2395 j = JA[k];
2396 pA = val + k * jump; // &val[k*jump];
2397 px0 = x + j * nb; // &x[j*nb];
2398 py = py0;
2399 fasp_blas_smat_ypAx(pA, px0, py, nb);
2400
2401 k++;
2402 j = JA[k];
2403 pA = val + k * jump; // &val[k*jump];
2404 px0 = x + j * nb; // &x[j*nb];
2405 py = py0;
2406 fasp_blas_smat_ypAx(pA, px0, py, nb);
2407
2408 break;
2409
2410 case 4:
2411 k = IA[i];
2412 j = JA[k];
2413 pA = val + k * jump; // &val[k*jump];
2414 px0 = x + j * nb; // &x[j*nb];
2415 py = py0;
2416 fasp_blas_smat_ypAx(pA, px0, py, nb);
2417
2418 k++;
2419 j = JA[k];
2420 pA = val + k * jump; // &val[k*jump];
2421 px0 = x + j * nb; // &x[j*nb];
2422 py = py0;
2423 fasp_blas_smat_ypAx(pA, px0, py, nb);
2424
2425 k++;
2426 j = JA[k];
2427 pA = val + k * jump; // &val[k*jump];
2428 px0 = x + j * nb; // &x[j*nb];
2429 py = py0;
2430 fasp_blas_smat_ypAx(pA, px0, py, nb);
2431
2432 k++;
2433 j = JA[k];
2434 pA = val + k * jump; // &val[k*jump];
2435 px0 = x + j * nb; // &x[j*nb];
2436 py = py0;
2437 fasp_blas_smat_ypAx(pA, px0, py, nb);
2438
2439 break;
2440
2441 case 5:
2442 k = IA[i];
2443 j = JA[k];
2444 pA = val + k * jump; // &val[k*jump];
2445 px0 = x + j * nb; // &x[j*nb];
2446 py = py0;
2447 fasp_blas_smat_ypAx(pA, px0, py, nb);
2448
2449 k++;
2450 j = JA[k];
2451 pA = val + k * jump; // &val[k*jump];
2452 px0 = x + j * nb; // &x[j*nb];
2453 py = py0;
2454 fasp_blas_smat_ypAx(pA, px0, py, nb);
2455
2456 k++;
2457 j = JA[k];
2458 pA = val + k * jump; // &val[k*jump];
2459 px0 = x + j * nb; // &x[j*nb];
2460 py = py0;
2461 fasp_blas_smat_ypAx(pA, px0, py, nb);
2462
2463 k++;
2464 j = JA[k];
2465 pA = val + k * jump; // &val[k*jump];
2466 px0 = x + j * nb; // &x[j*nb];
2467 py = py0;
2468 fasp_blas_smat_ypAx(pA, px0, py, nb);
2469
2470 k++;
2471 j = JA[k];
2472 pA = val + k * jump; // &val[k*jump];
2473 px0 = x + j * nb; // &x[j*nb];
2474 py = py0;
2475 fasp_blas_smat_ypAx(pA, px0, py, nb);
2476
2477 break;
2478
2479 case 6:
2480 k = IA[i];
2481 j = JA[k];
2482 pA = val + k * jump; // &val[k*jump];
2483 px0 = x + j * nb; // &x[j*nb];
2484 py = py0;
2485 fasp_blas_smat_ypAx(pA, px0, py, nb);
2486
2487 k++;
2488 j = JA[k];
2489 pA = val + k * jump; // &val[k*jump];
2490 px0 = x + j * nb; // &x[j*nb];
2491 py = py0;
2492 fasp_blas_smat_ypAx(pA, px0, py, nb);
2493
2494 k++;
2495 j = JA[k];
2496 pA = val + k * jump; // &val[k*jump];
2497 px0 = x + j * nb; // &x[j*nb];
2498 py = py0;
2499 fasp_blas_smat_ypAx(pA, px0, py, nb);
2500
2501 k++;
2502 j = JA[k];
2503 pA = val + k * jump; // &val[k*jump];
2504 px0 = x + j * nb; // &x[j*nb];
2505 py = py0;
2506 fasp_blas_smat_ypAx(pA, px0, py, nb);
2507
2508 k++;
2509 j = JA[k];
2510 pA = val + k * jump; // &val[k*jump];
2511 px0 = x + j * nb; // &x[j*nb];
2512 py = py0;
2513 fasp_blas_smat_ypAx(pA, px0, py, nb);
2514
2515 k++;
2516 j = JA[k];
2517 pA = val + k * jump; // &val[k*jump];
2518 px0 = x + j * nb; // &x[j*nb];
2519 py = py0;
2520 fasp_blas_smat_ypAx(pA, px0, py, nb);
2521
2522 break;
2523
2524 case 7:
2525 k = IA[i];
2526 j = JA[k];
2527 pA = val + k * jump; // &val[k*jump];
2528 px0 = x + j * nb; // &x[j*nb];
2529 py = py0;
2530 fasp_blas_smat_ypAx(pA, px0, py, nb);
2531
2532 k++;
2533 j = JA[k];
2534 pA = val + k * jump; // &val[k*jump];
2535 px0 = x + j * nb; // &x[j*nb];
2536 py = py0;
2537 fasp_blas_smat_ypAx(pA, px0, py, nb);
2538
2539 k++;
2540 j = JA[k];
2541 pA = val + k * jump; // &val[k*jump];
2542 px0 = x + j * nb; // &x[j*nb];
2543 py = py0;
2544 fasp_blas_smat_ypAx(pA, px0, py, nb);
2545
2546 k++;
2547 j = JA[k];
2548 pA = val + k * jump; // &val[k*jump];
2549 px0 = x + j * nb; // &x[j*nb];
2550 py = py0;
2551 fasp_blas_smat_ypAx(pA, px0, py, nb);
2552
2553 k++;
2554 j = JA[k];
2555 pA = val + k * jump; // &val[k*jump];
2556 px0 = x + j * nb; // &x[j*nb];
2557 py = py0;
2558 fasp_blas_smat_ypAx(pA, px0, py, nb);
2559
2560 k++;
2561 j = JA[k];
2562 pA = val + k * jump; // &val[k*jump];
2563 px0 = x + j * nb; // &x[j*nb];
2564 py = py0;
2565 fasp_blas_smat_ypAx(pA, px0, py, nb);
2566
2567 k++;
2568 j = JA[k];
2569 pA = val + k * jump; // &val[k*jump];
2570 px0 = x + j * nb; // &x[j*nb];
2571 py = py0;
2572 fasp_blas_smat_ypAx(pA, px0, py, nb);
2573
2574 break;
2575
2576 default:
2577 for (k = IA[i]; k < IA[i + 1]; ++k) {
2578 j = JA[k];
2579 pA = val + k * jump; // &val[k*jump];
2580 px0 = x + j * nb; // &x[j*nb];
2581 py = py0;
2582 fasp_blas_smat_ypAx(pA, px0, py, nb);
2583 }
2584 break;
2585 }
2586 }
2587 }
2588#endif
2589 } else {
2590 for (i = 0; i < ROW; ++i) {
2591 py0 = &y[i * nb];
2592 num_nnz_row = IA[i + 1] - IA[i];
2593 switch (num_nnz_row) {
2594 case 3:
2595 k = IA[i];
2596 j = JA[k];
2597 pA = val + k * jump; // &val[k*jump];
2598 px0 = x + j * nb; // &x[j*nb];
2599 py = py0;
2600 fasp_blas_smat_ypAx(pA, px0, py, nb);
2601
2602 k++;
2603 j = JA[k];
2604 pA = val + k * jump; // &val[k*jump];
2605 px0 = x + j * nb; // &x[j*nb];
2606 py = py0;
2607 fasp_blas_smat_ypAx(pA, px0, py, nb);
2608
2609 k++;
2610 j = JA[k];
2611 pA = val + k * jump; // &val[k*jump];
2612 px0 = x + j * nb; // &x[j*nb];
2613 py = py0;
2614 fasp_blas_smat_ypAx(pA, px0, py, nb);
2615
2616 break;
2617
2618 case 4:
2619 k = IA[i];
2620 j = JA[k];
2621 pA = val + k * jump; // &val[k*jump];
2622 px0 = x + j * nb; // &x[j*nb];
2623 py = py0;
2624 fasp_blas_smat_ypAx(pA, px0, py, nb);
2625
2626 k++;
2627 j = JA[k];
2628 pA = val + k * jump; // &val[k*jump];
2629 px0 = x + j * nb; // &x[j*nb];
2630 py = py0;
2631 fasp_blas_smat_ypAx(pA, px0, py, nb);
2632
2633 k++;
2634 j = JA[k];
2635 pA = val + k * jump; // &val[k*jump];
2636 px0 = x + j * nb; // &x[j*nb];
2637 py = py0;
2638 fasp_blas_smat_ypAx(pA, px0, py, nb);
2639
2640 k++;
2641 j = JA[k];
2642 pA = val + k * jump; // &val[k*jump];
2643 px0 = x + j * nb; // &x[j*nb];
2644 py = py0;
2645 fasp_blas_smat_ypAx(pA, px0, py, nb);
2646
2647 break;
2648
2649 case 5:
2650 k = IA[i];
2651 j = JA[k];
2652 pA = val + k * jump; // &val[k*jump];
2653 px0 = x + j * nb; // &x[j*nb];
2654 py = py0;
2655 fasp_blas_smat_ypAx(pA, px0, py, nb);
2656
2657 k++;
2658 j = JA[k];
2659 pA = val + k * jump; // &val[k*jump];
2660 px0 = x + j * nb; // &x[j*nb];
2661 py = py0;
2662 fasp_blas_smat_ypAx(pA, px0, py, nb);
2663
2664 k++;
2665 j = JA[k];
2666 pA = val + k * jump; // &val[k*jump];
2667 px0 = x + j * nb; // &x[j*nb];
2668 py = py0;
2669 fasp_blas_smat_ypAx(pA, px0, py, nb);
2670
2671 k++;
2672 j = JA[k];
2673 pA = val + k * jump; // &val[k*jump];
2674 px0 = x + j * nb; // &x[j*nb];
2675 py = py0;
2676 fasp_blas_smat_ypAx(pA, px0, py, nb);
2677
2678 k++;
2679 j = JA[k];
2680 pA = val + k * jump; // &val[k*jump];
2681 px0 = x + j * nb; // &x[j*nb];
2682 py = py0;
2683 fasp_blas_smat_ypAx(pA, px0, py, nb);
2684
2685 break;
2686
2687 case 6:
2688 k = IA[i];
2689 j = JA[k];
2690 pA = val + k * jump; // &val[k*jump];
2691 px0 = x + j * nb; // &x[j*nb];
2692 py = py0;
2693 fasp_blas_smat_ypAx(pA, px0, py, nb);
2694
2695 k++;
2696 j = JA[k];
2697 pA = val + k * jump; // &val[k*jump];
2698 px0 = x + j * nb; // &x[j*nb];
2699 py = py0;
2700 fasp_blas_smat_ypAx(pA, px0, py, nb);
2701
2702 k++;
2703 j = JA[k];
2704 pA = val + k * jump; // &val[k*jump];
2705 px0 = x + j * nb; // &x[j*nb];
2706 py = py0;
2707 fasp_blas_smat_ypAx(pA, px0, py, nb);
2708
2709 k++;
2710 j = JA[k];
2711 pA = val + k * jump; // &val[k*jump];
2712 px0 = x + j * nb; // &x[j*nb];
2713 py = py0;
2714 fasp_blas_smat_ypAx(pA, px0, py, nb);
2715
2716 k++;
2717 j = JA[k];
2718 pA = val + k * jump; // &val[k*jump];
2719 px0 = x + j * nb; // &x[j*nb];
2720 py = py0;
2721 fasp_blas_smat_ypAx(pA, px0, py, nb);
2722
2723 k++;
2724 j = JA[k];
2725 pA = val + k * jump; // &val[k*jump];
2726 px0 = x + j * nb; // &x[j*nb];
2727 py = py0;
2728 fasp_blas_smat_ypAx(pA, px0, py, nb);
2729
2730 break;
2731
2732 case 7:
2733 k = IA[i];
2734 j = JA[k];
2735 pA = val + k * jump; // &val[k*jump];
2736 px0 = x + j * nb; // &x[j*nb];
2737 py = py0;
2738 fasp_blas_smat_ypAx(pA, px0, py, nb);
2739
2740 k++;
2741 j = JA[k];
2742 pA = val + k * jump; // &val[k*jump];
2743 px0 = x + j * nb; // &x[j*nb];
2744 py = py0;
2745 fasp_blas_smat_ypAx(pA, px0, py, nb);
2746
2747 k++;
2748 j = JA[k];
2749 pA = val + k * jump; // &val[k*jump];
2750 px0 = x + j * nb; // &x[j*nb];
2751 py = py0;
2752 fasp_blas_smat_ypAx(pA, px0, py, nb);
2753
2754 k++;
2755 j = JA[k];
2756 pA = val + k * jump; // &val[k*jump];
2757 px0 = x + j * nb; // &x[j*nb];
2758 py = py0;
2759 fasp_blas_smat_ypAx(pA, px0, py, nb);
2760
2761 k++;
2762 j = JA[k];
2763 pA = val + k * jump; // &val[k*jump];
2764 px0 = x + j * nb; // &x[j*nb];
2765 py = py0;
2766 fasp_blas_smat_ypAx(pA, px0, py, nb);
2767
2768 k++;
2769 j = JA[k];
2770 pA = val + k * jump; // &val[k*jump];
2771 px0 = x + j * nb; // &x[j*nb];
2772 py = py0;
2773 fasp_blas_smat_ypAx(pA, px0, py, nb);
2774
2775 k++;
2776 j = JA[k];
2777 pA = val + k * jump; // &val[k*jump];
2778 px0 = x + j * nb; // &x[j*nb];
2779 py = py0;
2780 fasp_blas_smat_ypAx(pA, px0, py, nb);
2781
2782 break;
2783
2784 default:
2785 for (k = IA[i]; k < IA[i + 1]; ++k) {
2786 j = JA[k];
2787 pA = val + k * jump; // &val[k*jump];
2788 px0 = x + j * nb; // &x[j*nb];
2789 py = py0;
2790 fasp_blas_smat_ypAx(pA, px0, py, nb);
2791 }
2792 break;
2793 }
2794 }
2795 }
2796 }
2797 break;
2798 }
2799}
2800
2815void fasp_blas_dbsr_mxv_agg(const dBSRmat* A, const REAL* x, REAL* y)
2816{
2817 /* members of A */
2818 const INT ROW = A->ROW;
2819 const INT nb = A->nb;
2820 const INT size = ROW * nb;
2821 const INT* IA = A->IA;
2822 const INT* JA = A->JA;
2823
2824 /* local variables */
2825 const REAL* px0 = NULL;
2826 REAL * py0 = NULL, *py = NULL;
2827 INT i, j, k, num_nnz_row;
2828 SHORT use_openmp = FALSE;
2829
2830#ifdef _OPENMP
2831 const REAL* val = A->val;
2832 const REAL* pA;
2833 INT myid, mybegin, myend, nthreads;
2834 if (ROW > OPENMP_HOLDS) {
2835 use_openmp = TRUE;
2836 nthreads = fasp_get_num_threads();
2837 }
2838#endif
2839
2840 //-----------------------------------------------------------------
2841 // zero out 'y'
2842 //-----------------------------------------------------------------
2843 fasp_darray_set(size, y, 0.0);
2844
2845 //-----------------------------------------------------------------
2846 // y = A*x (Core Computation)
2847 // each non-zero block elements are stored in row-major order
2848 //-----------------------------------------------------------------
2849
2850 switch (nb) {
2851 case 3:
2852 {
2853 if (use_openmp) {
2854#ifdef _OPENMP
2855#pragma omp parallel private(myid, mybegin, myend, i, py0, num_nnz_row, k, j, pA, px0, \
2856 py)
2857 {
2858 myid = omp_get_thread_num();
2859 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
2860 for (i = mybegin; i < myend; ++i) {
2861 py0 = &y[i * 3];
2862 num_nnz_row = IA[i + 1] - IA[i];
2863 switch (num_nnz_row) {
2864 case 3:
2865 k = IA[i];
2866 j = JA[k];
2867 pA = val + k * 9;
2868 px0 = x + j * 3;
2869 py = py0;
2870 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2871
2872 k++;
2873 j = JA[k];
2874 pA = val + k * 9;
2875 px0 = x + j * 3;
2876 py = py0;
2877 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2878
2879 k++;
2880 j = JA[k];
2881 pA = val + k * 9;
2882 px0 = x + j * 3;
2883 py = py0;
2884 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2885
2886 break;
2887
2888 case 4:
2889 k = IA[i];
2890 j = JA[k];
2891 pA = val + k * 9;
2892 px0 = x + j * 3;
2893 py = py0;
2894 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2895
2896 k++;
2897 j = JA[k];
2898 pA = val + k * 9;
2899 px0 = x + j * 3;
2900 py = py0;
2901 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2902
2903 k++;
2904 j = JA[k];
2905 pA = val + k * 9;
2906 px0 = x + j * 3;
2907 py = py0;
2908 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2909
2910 k++;
2911 j = JA[k];
2912 pA = val + k * 9;
2913 px0 = x + j * 3;
2914 py = py0;
2915 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2916
2917 break;
2918
2919 case 5:
2920 k = IA[i];
2921 j = JA[k];
2922 pA = val + k * 9;
2923 px0 = x + j * 3;
2924 py = py0;
2925 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2926
2927 k++;
2928 j = JA[k];
2929 pA = val + k * 9;
2930 px0 = x + j * 3;
2931 py = py0;
2932 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2933
2934 k++;
2935 j = JA[k];
2936 pA = val + k * 9;
2937 px0 = x + j * 3;
2938 py = py0;
2939 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2940
2941 k++;
2942 j = JA[k];
2943 pA = val + k * 9;
2944 px0 = x + j * 3;
2945 py = py0;
2946 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2947
2948 k++;
2949 j = JA[k];
2950 pA = val + k * 9;
2951 px0 = x + j * 3;
2952 py = py0;
2953 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2954
2955 break;
2956
2957 case 6:
2958 k = IA[i];
2959 j = JA[k];
2960 pA = val + k * 9;
2961 px0 = x + j * 3;
2962 py = py0;
2963 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2964
2965 k++;
2966 j = JA[k];
2967 pA = val + k * 9;
2968 px0 = x + j * 3;
2969 py = py0;
2970 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2971
2972 k++;
2973 j = JA[k];
2974 pA = val + k * 9;
2975 px0 = x + j * 3;
2976 py = py0;
2977 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2978
2979 k++;
2980 j = JA[k];
2981 pA = val + k * 9;
2982 px0 = x + j * 3;
2983 py = py0;
2984 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2985
2986 k++;
2987 j = JA[k];
2988 pA = val + k * 9;
2989 px0 = x + j * 3;
2990 py = py0;
2991 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2992
2993 k++;
2994 j = JA[k];
2995 pA = val + k * 9;
2996 px0 = x + j * 3;
2997 py = py0;
2998 fasp_blas_smat_ypAx_nc3(pA, px0, py);
2999
3000 break;
3001
3002 case 7:
3003 k = IA[i];
3004 j = JA[k];
3005 pA = val + k * 9;
3006 px0 = x + j * 3;
3007 py = py0;
3008 fasp_blas_smat_ypAx_nc3(pA, px0, py);
3009
3010 k++;
3011 j = JA[k];
3012 pA = val + k * 9;
3013 px0 = x + j * 3;
3014 py = py0;
3015 fasp_blas_smat_ypAx_nc3(pA, px0, py);
3016
3017 k++;
3018 j = JA[k];
3019 pA = val + k * 9;
3020 px0 = x + j * 3;
3021 py = py0;
3022 fasp_blas_smat_ypAx_nc3(pA, px0, py);
3023
3024 k++;
3025 j = JA[k];
3026 pA = val + k * 9;
3027 px0 = x + j * 3;
3028 py = py0;
3029 fasp_blas_smat_ypAx_nc3(pA, px0, py);
3030
3031 k++;
3032 j = JA[k];
3033 pA = val + k * 9;
3034 px0 = x + j * 3;
3035 py = py0;
3036 fasp_blas_smat_ypAx_nc3(pA, px0, py);
3037
3038 k++;
3039 j = JA[k];
3040 pA = val + k * 9;
3041 px0 = x + j * 3;
3042 py = py0;
3043 fasp_blas_smat_ypAx_nc3(pA, px0, py);
3044
3045 k++;
3046 j = JA[k];
3047 pA = val + k * 9;
3048 px0 = x + j * 3;
3049 py = py0;
3050 fasp_blas_smat_ypAx_nc3(pA, px0, py);
3051
3052 break;
3053
3054 default:
3055 for (k = IA[i]; k < IA[i + 1]; ++k) {
3056 j = JA[k];
3057 pA = val + k * 9;
3058 px0 = x + j * 3;
3059 py = py0;
3060 fasp_blas_smat_ypAx_nc3(pA, px0, py);
3061 }
3062 break;
3063 }
3064 }
3065 }
3066#endif
3067 } else {
3068 for (i = 0; i < ROW; ++i) {
3069 py0 = &y[i * 3];
3070 num_nnz_row = IA[i + 1] - IA[i];
3071 switch (num_nnz_row) {
3072 case 3:
3073 k = IA[i];
3074 j = JA[k];
3075 px0 = x + j * 3; // &x[j*nb];
3076 py = py0;
3077 py[0] += px0[0];
3078 py[1] += px0[1];
3079 py[2] += px0[2];
3080
3081 k++;
3082 j = JA[k];
3083 px0 = x + j * 3; // &x[j*nb];
3084 py = py0;
3085 py[0] += px0[0];
3086 py[1] += px0[1];
3087 py[2] += px0[2];
3088
3089 k++;
3090 j = JA[k];
3091 px0 = x + j * 3; // &x[j*nb];
3092 py = py0;
3093 py[0] += px0[0];
3094 py[1] += px0[1];
3095 py[2] += px0[2];
3096
3097 break;
3098
3099 case 4:
3100 k = IA[i];
3101 j = JA[k];
3102 px0 = x + j * 3; // &x[j*nb];
3103 py = py0;
3104 py[0] += px0[0];
3105 py[1] += px0[1];
3106 py[2] += px0[2];
3107
3108 k++;
3109 j = JA[k];
3110 px0 = x + j * 3; // &x[j*nb];
3111 py = py0;
3112 py[0] += px0[0];
3113 py[1] += px0[1];
3114 py[2] += px0[2];
3115
3116 k++;
3117 j = JA[k];
3118 px0 = x + j * 3; // &x[j*nb];
3119 py = py0;
3120 py[0] += px0[0];
3121 py[1] += px0[1];
3122 py[2] += px0[2];
3123
3124 k++;
3125 j = JA[k];
3126 px0 = x + j * 3; // &x[j*nb];
3127 py = py0;
3128 py[0] += px0[0];
3129 py[1] += px0[1];
3130 py[2] += px0[2];
3131
3132 break;
3133
3134 case 5:
3135 k = IA[i];
3136 j = JA[k];
3137 px0 = x + j * 3; // &x[j*nb];
3138 py = py0;
3139 py[0] += px0[0];
3140 py[1] += px0[1];
3141 py[2] += px0[2];
3142
3143 k++;
3144 j = JA[k];
3145 px0 = x + j * 3; // &x[j*nb];
3146 py = py0;
3147 py[0] += px0[0];
3148 py[1] += px0[1];
3149 py[2] += px0[2];
3150
3151 k++;
3152 j = JA[k];
3153 px0 = x + j * 3; // &x[j*nb];
3154 py = py0;
3155 py[0] += px0[0];
3156 py[1] += px0[1];
3157 py[2] += px0[2];
3158
3159 k++;
3160 j = JA[k];
3161 px0 = x + j * 3; // &x[j*nb];
3162 py = py0;
3163 py[0] += px0[0];
3164 py[1] += px0[1];
3165 py[2] += px0[2];
3166
3167 k++;
3168 j = JA[k];
3169 px0 = x + j * 3; // &x[j*nb];
3170 py = py0;
3171 py[0] += px0[0];
3172 py[1] += px0[1];
3173 py[2] += px0[2];
3174
3175 break;
3176
3177 case 6:
3178 k = IA[i];
3179 j = JA[k];
3180 px0 = x + j * 3; // &x[j*nb];
3181 py = py0;
3182 py[0] += px0[0];
3183 py[1] += px0[1];
3184 py[2] += px0[2];
3185
3186 k++;
3187 j = JA[k];
3188 px0 = x + j * 3; // &x[j*nb];
3189 py = py0;
3190 py[0] += px0[0];
3191 py[1] += px0[1];
3192 py[2] += px0[2];
3193
3194 k++;
3195 j = JA[k];
3196 px0 = x + j * 3; // &x[j*nb];
3197 py = py0;
3198 py[0] += px0[0];
3199 py[1] += px0[1];
3200 py[2] += px0[2];
3201
3202 k++;
3203 j = JA[k];
3204 px0 = x + j * 3; // &x[j*nb];
3205 py = py0;
3206 py[0] += px0[0];
3207 py[1] += px0[1];
3208 py[2] += px0[2];
3209
3210 k++;
3211 j = JA[k];
3212 px0 = x + j * 3; // &x[j*nb];
3213 py = py0;
3214 py[0] += px0[0];
3215 py[1] += px0[1];
3216 py[2] += px0[2];
3217
3218 k++;
3219 j = JA[k];
3220 px0 = x + j * 3; // &x[j*nb];
3221 py = py0;
3222 py[0] += px0[0];
3223 py[1] += px0[1];
3224 py[2] += px0[2];
3225
3226 break;
3227
3228 case 7:
3229 k = IA[i];
3230 j = JA[k];
3231 px0 = x + j * 3; // &x[j*nb];
3232 py = py0;
3233 py[0] += px0[0];
3234 py[1] += px0[1];
3235 py[2] += px0[2];
3236
3237 k++;
3238 j = JA[k];
3239 px0 = x + j * 3; // &x[j*nb];
3240 py = py0;
3241 py[0] += px0[0];
3242 py[1] += px0[1];
3243 py[2] += px0[2];
3244
3245 k++;
3246 j = JA[k];
3247 px0 = x + j * 3; // &x[j*nb];
3248 py = py0;
3249 py[0] += px0[0];
3250 py[1] += px0[1];
3251 py[2] += px0[2];
3252
3253 k++;
3254 j = JA[k];
3255 px0 = x + j * 3; // &x[j*nb];
3256 py = py0;
3257 py[0] += px0[0];
3258 py[1] += px0[1];
3259 py[2] += px0[2];
3260
3261 k++;
3262 j = JA[k];
3263 px0 = x + j * 3; // &x[j*nb];
3264 py = py0;
3265 py[0] += px0[0];
3266 py[1] += px0[1];
3267 py[2] += px0[2];
3268
3269 k++;
3270 j = JA[k];
3271 px0 = x + j * 3; // &x[j*nb];
3272 py = py0;
3273 py[0] += px0[0];
3274 py[1] += px0[1];
3275 py[2] += px0[2];
3276
3277 k++;
3278 j = JA[k];
3279 px0 = x + j * 3; // &x[j*nb];
3280 py = py0;
3281 py[0] += px0[0];
3282 py[1] += px0[1];
3283 py[2] += px0[2];
3284
3285 break;
3286
3287 default:
3288 for (k = IA[i]; k < IA[i + 1]; ++k) {
3289 j = JA[k];
3290 px0 = x + j * 3; // &x[j*nb];
3291 py = py0;
3292 py[0] += px0[0];
3293 py[1] += px0[1];
3294 py[2] += px0[2];
3295 }
3296 break;
3297 }
3298 }
3299 }
3300 }
3301 break;
3302
3303 case 5:
3304 {
3305 if (use_openmp) {
3306#ifdef _OPENMP
3307#pragma omp parallel private(myid, mybegin, myend, i, py0, num_nnz_row, k, j, pA, px0, \
3308 py)
3309 {
3310 myid = omp_get_thread_num();
3311 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
3312 for (i = mybegin; i < myend; ++i) {
3313 py0 = &y[i * 5];
3314 num_nnz_row = IA[i + 1] - IA[i];
3315 switch (num_nnz_row) {
3316
3317 case 3:
3318 k = IA[i];
3319 j = JA[k];
3320 pA = val + k * 25; // &val[k*jump];
3321 px0 = x + j * 5; // &x[j*nb];
3322 py = py0;
3323 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3324
3325 k++;
3326 j = JA[k];
3327 pA = val + k * 25; // &val[k*jump];
3328 px0 = x + j * 5; // &x[j*nb];
3329 py = py0;
3330 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3331
3332 k++;
3333 j = JA[k];
3334 pA = val + k * 25; // &val[k*jump];
3335 px0 = x + j * 5; // &x[j*nb];
3336 py = py0;
3337 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3338
3339 break;
3340
3341 case 4:
3342 k = IA[i];
3343 j = JA[k];
3344 pA = val + k * 25; // &val[k*jump];
3345 px0 = x + j * 5; // &x[j*nb];
3346 py = py0;
3347 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3348
3349 k++;
3350 j = JA[k];
3351 pA = val + k * 25; // &val[k*jump];
3352 px0 = x + j * 5; // &x[j*nb];
3353 py = py0;
3354 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3355
3356 k++;
3357 j = JA[k];
3358 pA = val + k * 25; // &val[k*jump];
3359 px0 = x + j * 5; // &x[j*nb];
3360 py = py0;
3361 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3362
3363 k++;
3364 j = JA[k];
3365 pA = val + k * 25; // &val[k*jump];
3366 px0 = x + j * 5; // &x[j*nb];
3367 py = py0;
3368 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3369
3370 break;
3371
3372 case 5:
3373 k = IA[i];
3374 j = JA[k];
3375 pA = val + k * 25; // &val[k*jump];
3376 px0 = x + j * 5; // &x[j*nb];
3377 py = py0;
3378 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3379
3380 k++;
3381 j = JA[k];
3382 pA = val + k * 25; // &val[k*jump];
3383 px0 = x + j * 5; // &x[j*nb];
3384 py = py0;
3385 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3386
3387 k++;
3388 j = JA[k];
3389 pA = val + k * 25; // &val[k*jump];
3390 px0 = x + j * 5; // &x[j*nb];
3391 py = py0;
3392 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3393
3394 k++;
3395 j = JA[k];
3396 pA = val + k * 25; // &val[k*jump];
3397 px0 = x + j * 5; // &x[j*nb];
3398 py = py0;
3399 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3400
3401 k++;
3402 j = JA[k];
3403 pA = val + k * 25; // &val[k*jump];
3404 px0 = x + j * 5; // &x[j*nb];
3405 py = py0;
3406 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3407
3408 break;
3409
3410 case 6:
3411 k = IA[i];
3412 j = JA[k];
3413 pA = val + k * 25; // &val[k*jump];
3414 px0 = x + j * 5; // &x[j*nb];
3415 py = py0;
3416 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3417
3418 k++;
3419 j = JA[k];
3420 pA = val + k * 25; // &val[k*jump];
3421 px0 = x + j * 5; // &x[j*nb];
3422 py = py0;
3423 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3424
3425 k++;
3426 j = JA[k];
3427 pA = val + k * 25; // &val[k*jump];
3428 px0 = x + j * 5; // &x[j*nb];
3429 py = py0;
3430 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3431
3432 k++;
3433 j = JA[k];
3434 pA = val + k * 25; // &val[k*jump];
3435 px0 = x + j * 5; // &x[j*nb];
3436 py = py0;
3437 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3438
3439 k++;
3440 j = JA[k];
3441 pA = val + k * 25; // &val[k*jump];
3442 px0 = x + j * 5; // &x[j*nb];
3443 py = py0;
3444 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3445
3446 k++;
3447 j = JA[k];
3448 pA = val + k * 25; // &val[k*jump];
3449 px0 = x + j * 5; // &x[j*nb];
3450 py = py0;
3451 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3452
3453 break;
3454
3455 case 7:
3456 k = IA[i];
3457 j = JA[k];
3458 pA = val + k * 25; // &val[k*jump];
3459 px0 = x + j * 5; // &x[j*nb];
3460 py = py0;
3461 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3462
3463 k++;
3464 j = JA[k];
3465 pA = val + k * 25; // &val[k*jump];
3466 px0 = x + j * 5; // &x[j*nb];
3467 py = py0;
3468 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3469
3470 k++;
3471 j = JA[k];
3472 pA = val + k * 25; // &val[k*jump];
3473 px0 = x + j * 5; // &x[j*nb];
3474 py = py0;
3475 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3476
3477 k++;
3478 j = JA[k];
3479 pA = val + k * 25; // &val[k*jump];
3480 px0 = x + j * 5; // &x[j*nb];
3481 py = py0;
3482 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3483
3484 k++;
3485 j = JA[k];
3486 pA = val + k * 25; // &val[k*jump];
3487 px0 = x + j * 5; // &x[j*nb];
3488 py = py0;
3489 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3490
3491 k++;
3492 j = JA[k];
3493 pA = val + k * 25; // &val[k*jump];
3494 px0 = x + j * 5; // &x[j*nb];
3495 py = py0;
3496 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3497
3498 k++;
3499 j = JA[k];
3500 pA = val + k * 25; // &val[k*jump];
3501 px0 = x + j * 5; // &x[j*nb];
3502 py = py0;
3503 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3504
3505 break;
3506
3507 default:
3508 for (k = IA[i]; k < IA[i + 1]; ++k) {
3509 j = JA[k];
3510 pA = val + k * 25; // &val[k*jump];
3511 px0 = x + j * 5; // &x[j*nb];
3512 py = py0;
3513 fasp_blas_smat_ypAx_nc5(pA, px0, py);
3514 }
3515 break;
3516 }
3517 }
3518 }
3519#endif
3520 } else {
3521 for (i = 0; i < ROW; ++i) {
3522 py0 = &y[i * 5];
3523 num_nnz_row = IA[i + 1] - IA[i];
3524 switch (num_nnz_row) {
3525
3526 case 3:
3527 k = IA[i];
3528 j = JA[k];
3529 px0 = x + j * 5; // &x[j*nb];
3530 py = py0;
3531 py[0] += px0[0];
3532 py[1] += px0[1];
3533 py[2] += px0[2];
3534 py[3] += px0[3];
3535 py[4] += px0[4];
3536
3537 k++;
3538 j = JA[k];
3539 px0 = x + j * 5; // &x[j*nb];
3540 py = py0;
3541 py[0] += px0[0];
3542 py[1] += px0[1];
3543 py[2] += px0[2];
3544 py[3] += px0[3];
3545 py[4] += px0[4];
3546
3547 k++;
3548 j = JA[k];
3549 px0 = x + j * 5; // &x[j*nb];
3550 py = py0;
3551 py[0] += px0[0];
3552 py[1] += px0[1];
3553 py[2] += px0[2];
3554 py[3] += px0[3];
3555 py[4] += px0[4];
3556
3557 break;
3558
3559 case 4:
3560 k = IA[i];
3561 j = JA[k];
3562 px0 = x + j * 5; // &x[j*nb];
3563 py = py0;
3564 py[0] += px0[0];
3565 py[1] += px0[1];
3566 py[2] += px0[2];
3567 py[3] += px0[3];
3568 py[4] += px0[4];
3569
3570 k++;
3571 j = JA[k];
3572 px0 = x + j * 5; // &x[j*nb];
3573 py = py0;
3574 py[0] += px0[0];
3575 py[1] += px0[1];
3576 py[2] += px0[2];
3577 py[3] += px0[3];
3578 py[4] += px0[4];
3579
3580 k++;
3581 j = JA[k];
3582 px0 = x + j * 5; // &x[j*nb];
3583 py = py0;
3584 py[0] += px0[0];
3585 py[1] += px0[1];
3586 py[2] += px0[2];
3587 py[3] += px0[3];
3588 py[4] += px0[4];
3589
3590 k++;
3591 j = JA[k];
3592 px0 = x + j * 5; // &x[j*nb];
3593 py = py0;
3594 py[0] += px0[0];
3595 py[1] += px0[1];
3596 py[2] += px0[2];
3597 py[3] += px0[3];
3598 py[4] += px0[4];
3599
3600 break;
3601
3602 case 5:
3603 k = IA[i];
3604 j = JA[k];
3605 px0 = x + j * 5; // &x[j*nb];
3606 py = py0;
3607 py[0] += px0[0];
3608 py[1] += px0[1];
3609 py[2] += px0[2];
3610 py[3] += px0[3];
3611 py[4] += px0[4];
3612
3613 k++;
3614 j = JA[k];
3615 px0 = x + j * 5; // &x[j*nb];
3616 py = py0;
3617 py[0] += px0[0];
3618 py[1] += px0[1];
3619 py[2] += px0[2];
3620 py[3] += px0[3];
3621 py[4] += px0[4];
3622
3623 k++;
3624 j = JA[k];
3625 px0 = x + j * 5; // &x[j*nb];
3626 py = py0;
3627 py[0] += px0[0];
3628 py[1] += px0[1];
3629 py[2] += px0[2];
3630 py[3] += px0[3];
3631 py[4] += px0[4];
3632
3633 k++;
3634 j = JA[k];
3635 px0 = x + j * 5; // &x[j*nb];
3636 py = py0;
3637 py[0] += px0[0];
3638 py[1] += px0[1];
3639 py[2] += px0[2];
3640 py[3] += px0[3];
3641 py[4] += px0[4];
3642
3643 k++;
3644 j = JA[k];
3645 px0 = x + j * 5; // &x[j*nb];
3646 py = py0;
3647 py[0] += px0[0];
3648 py[1] += px0[1];
3649 py[2] += px0[2];
3650 py[3] += px0[3];
3651 py[4] += px0[4];
3652
3653 break;
3654
3655 case 6:
3656 k = IA[i];
3657 j = JA[k];
3658 px0 = x + j * 5; // &x[j*nb];
3659 py = py0;
3660 py[0] += px0[0];
3661 py[1] += px0[1];
3662 py[2] += px0[2];
3663 py[3] += px0[3];
3664 py[4] += px0[4];
3665
3666 k++;
3667 j = JA[k];
3668 px0 = x + j * 5; // &x[j*nb];
3669 py = py0;
3670 py[0] += px0[0];
3671 py[1] += px0[1];
3672 py[2] += px0[2];
3673 py[3] += px0[3];
3674 py[4] += px0[4];
3675
3676 k++;
3677 j = JA[k];
3678 px0 = x + j * 5; // &x[j*nb];
3679 py = py0;
3680 py[0] += px0[0];
3681 py[1] += px0[1];
3682 py[2] += px0[2];
3683 py[3] += px0[3];
3684 py[4] += px0[4];
3685
3686 k++;
3687 j = JA[k];
3688 px0 = x + j * 5; // &x[j*nb];
3689 py = py0;
3690 py[0] += px0[0];
3691 py[1] += px0[1];
3692 py[2] += px0[2];
3693 py[3] += px0[3];
3694 py[4] += px0[4];
3695
3696 k++;
3697 j = JA[k];
3698 px0 = x + j * 5; // &x[j*nb];
3699 py = py0;
3700 py[0] += px0[0];
3701 py[1] += px0[1];
3702 py[2] += px0[2];
3703 py[3] += px0[3];
3704 py[4] += px0[4];
3705
3706 k++;
3707 j = JA[k];
3708 px0 = x + j * 5; // &x[j*nb];
3709 py = py0;
3710 py[0] += px0[0];
3711 py[1] += px0[1];
3712 py[2] += px0[2];
3713 py[3] += px0[3];
3714 py[4] += px0[4];
3715
3716 break;
3717
3718 case 7:
3719 k = IA[i];
3720 j = JA[k];
3721 px0 = x + j * 5; // &x[j*nb];
3722 py = py0;
3723 py[0] += px0[0];
3724 py[1] += px0[1];
3725 py[2] += px0[2];
3726 py[3] += px0[3];
3727 py[4] += px0[4];
3728
3729 k++;
3730 j = JA[k];
3731 px0 = x + j * 5; // &x[j*nb];
3732 py = py0;
3733 py[0] += px0[0];
3734 py[1] += px0[1];
3735 py[2] += px0[2];
3736 py[3] += px0[3];
3737 py[4] += px0[4];
3738
3739 k++;
3740 j = JA[k];
3741 px0 = x + j * 5; // &x[j*nb];
3742 py = py0;
3743 py[0] += px0[0];
3744 py[1] += px0[1];
3745 py[2] += px0[2];
3746 py[3] += px0[3];
3747 py[4] += px0[4];
3748
3749 k++;
3750 j = JA[k];
3751 px0 = x + j * 5; // &x[j*nb];
3752 py = py0;
3753 py[0] += px0[0];
3754 py[1] += px0[1];
3755 py[2] += px0[2];
3756 py[3] += px0[3];
3757 py[4] += px0[4];
3758
3759 k++;
3760 j = JA[k];
3761 px0 = x + j * 5; // &x[j*nb];
3762 py = py0;
3763 py[0] += px0[0];
3764 py[1] += px0[1];
3765 py[2] += px0[2];
3766 py[3] += px0[3];
3767 py[4] += px0[4];
3768
3769 k++;
3770 j = JA[k];
3771 px0 = x + j * 5; // &x[j*nb];
3772 py = py0;
3773 py[0] += px0[0];
3774 py[1] += px0[1];
3775 py[2] += px0[2];
3776 py[3] += px0[3];
3777 py[4] += px0[4];
3778
3779 k++;
3780 j = JA[k];
3781 px0 = x + j * 5; // &x[j*nb];
3782 py = py0;
3783 py[0] += px0[0];
3784 py[1] += px0[1];
3785 py[2] += px0[2];
3786 py[3] += px0[3];
3787 py[4] += px0[4];
3788
3789 break;
3790
3791 default:
3792 for (k = IA[i]; k < IA[i + 1]; ++k) {
3793 j = JA[k];
3794 px0 = x + j * 5; // &x[j*nb];
3795 py = py0;
3796 py[0] += px0[0];
3797 py[1] += px0[1];
3798 py[2] += px0[2];
3799 py[3] += px0[3];
3800 py[4] += px0[4];
3801 }
3802 break;
3803 }
3804 }
3805 }
3806 }
3807 break;
3808
3809 case 7:
3810 {
3811 if (use_openmp) {
3812#ifdef _OPENMP
3813#pragma omp parallel private(myid, mybegin, myend, i, py0, num_nnz_row, k, j, pA, px0, \
3814 py)
3815 {
3816 myid = omp_get_thread_num();
3817 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
3818 for (i = mybegin; i < myend; ++i) {
3819 py0 = &y[i * 7];
3820 num_nnz_row = IA[i + 1] - IA[i];
3821 switch (num_nnz_row) {
3822
3823 case 3:
3824 k = IA[i];
3825 j = JA[k];
3826 pA = val + k * 49; // &val[k*jump];
3827 px0 = x + j * 7; // &x[j*nb];
3828 py = py0;
3829 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3830
3831 k++;
3832 j = JA[k];
3833 pA = val + k * 49; // &val[k*jump];
3834 px0 = x + j * 7; // &x[j*nb];
3835 py = py0;
3836 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3837
3838 k++;
3839 j = JA[k];
3840 pA = val + k * 49; // &val[k*jump];
3841 px0 = x + j * 7; // &x[j*nb];
3842 py = py0;
3843 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3844
3845 break;
3846
3847 case 4:
3848 k = IA[i];
3849 j = JA[k];
3850 pA = val + k * 49; // &val[k*jump];
3851 px0 = x + j * 7; // &x[j*nb];
3852 py = py0;
3853 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3854
3855 k++;
3856 j = JA[k];
3857 pA = val + k * 49; // &val[k*jump];
3858 px0 = x + j * 7; // &x[j*nb];
3859 py = py0;
3860 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3861
3862 k++;
3863 j = JA[k];
3864 pA = val + k * 49; // &val[k*jump];
3865 px0 = x + j * 7; // &x[j*nb];
3866 py = py0;
3867 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3868
3869 k++;
3870 j = JA[k];
3871 pA = val + k * 49; // &val[k*jump];
3872 px0 = x + j * 7; // &x[j*nb];
3873 py = py0;
3874 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3875
3876 break;
3877
3878 case 5:
3879 k = IA[i];
3880 j = JA[k];
3881 pA = val + k * 49; // &val[k*jump];
3882 px0 = x + j * 7; // &x[j*nb];
3883 py = py0;
3884 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3885
3886 k++;
3887 j = JA[k];
3888 pA = val + k * 49; // &val[k*jump];
3889 px0 = x + j * 7; // &x[j*nb];
3890 py = py0;
3891 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3892
3893 k++;
3894 j = JA[k];
3895 pA = val + k * 49; // &val[k*jump];
3896 px0 = x + j * 7; // &x[j*nb];
3897 py = py0;
3898 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3899
3900 k++;
3901 j = JA[k];
3902 pA = val + k * 49; // &val[k*jump];
3903 px0 = x + j * 7; // &x[j*nb];
3904 py = py0;
3905 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3906
3907 k++;
3908 j = JA[k];
3909 pA = val + k * 49; // &val[k*jump];
3910 px0 = x + j * 7; // &x[j*nb];
3911 py = py0;
3912 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3913
3914 break;
3915
3916 case 6:
3917 k = IA[i];
3918 j = JA[k];
3919 pA = val + k * 49; // &val[k*jump];
3920 px0 = x + j * 7; // &x[j*nb];
3921 py = py0;
3922 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3923
3924 k++;
3925 j = JA[k];
3926 pA = val + k * 49; // &val[k*jump];
3927 px0 = x + j * 7; // &x[j*nb];
3928 py = py0;
3929 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3930
3931 k++;
3932 j = JA[k];
3933 pA = val + k * 49; // &val[k*jump];
3934 px0 = x + j * 7; // &x[j*nb];
3935 py = py0;
3936 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3937
3938 k++;
3939 j = JA[k];
3940 pA = val + k * 49; // &val[k*jump];
3941 px0 = x + j * 7; // &x[j*nb];
3942 py = py0;
3943 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3944
3945 k++;
3946 j = JA[k];
3947 pA = val + k * 49; // &val[k*jump];
3948 px0 = x + j * 7; // &x[j*nb];
3949 py = py0;
3950 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3951
3952 k++;
3953 j = JA[k];
3954 pA = val + k * 49; // &val[k*jump];
3955 px0 = x + j * 7; // &x[j*nb];
3956 py = py0;
3957 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3958
3959 break;
3960
3961 case 7:
3962 k = IA[i];
3963 j = JA[k];
3964 pA = val + k * 49; // &val[k*jump];
3965 px0 = x + j * 7; // &x[j*nb];
3966 py = py0;
3967 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3968
3969 k++;
3970 j = JA[k];
3971 pA = val + k * 49; // &val[k*jump];
3972 px0 = x + j * 7; // &x[j*nb];
3973 py = py0;
3974 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3975
3976 k++;
3977 j = JA[k];
3978 pA = val + k * 49; // &val[k*jump];
3979 px0 = x + j * 7; // &x[j*nb];
3980 py = py0;
3981 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3982
3983 k++;
3984 j = JA[k];
3985 pA = val + k * 49; // &val[k*jump];
3986 px0 = x + j * 7; // &x[j*nb];
3987 py = py0;
3988 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3989
3990 k++;
3991 j = JA[k];
3992 pA = val + k * 49; // &val[k*jump];
3993 px0 = x + j * 7; // &x[j*nb];
3994 py = py0;
3995 fasp_blas_smat_ypAx_nc7(pA, px0, py);
3996
3997 k++;
3998 j = JA[k];
3999 pA = val + k * 49; // &val[k*jump];
4000 px0 = x + j * 7; // &x[j*nb];
4001 py = py0;
4002 fasp_blas_smat_ypAx_nc7(pA, px0, py);
4003
4004 k++;
4005 j = JA[k];
4006 pA = val + k * 49; // &val[k*jump];
4007 px0 = x + j * 7; // &x[j*nb];
4008 py = py0;
4009 fasp_blas_smat_ypAx_nc7(pA, px0, py);
4010
4011 break;
4012
4013 default:
4014 for (k = IA[i]; k < IA[i + 1]; ++k) {
4015 j = JA[k];
4016 pA = val + k * 49; // &val[k*jump];
4017 px0 = x + j * 7; // &x[j*nb];
4018 py = py0;
4019 fasp_blas_smat_ypAx_nc7(pA, px0, py);
4020 }
4021 break;
4022 }
4023 }
4024 }
4025#endif
4026 } else {
4027 for (i = 0; i < ROW; ++i) {
4028 py0 = &y[i * 7];
4029 num_nnz_row = IA[i + 1] - IA[i];
4030 switch (num_nnz_row) {
4031
4032 case 3:
4033 k = IA[i];
4034 j = JA[k];
4035 px0 = x + j * 7; // &x[j*nb];
4036 py = py0;
4037 py[0] += px0[0];
4038 py[1] += px0[1];
4039 py[2] += px0[2];
4040 py[3] += px0[3];
4041 py[4] += px0[4];
4042 py[5] += px0[5];
4043 py[6] += px0[6];
4044
4045 k++;
4046 j = JA[k];
4047 px0 = x + j * 7; // &x[j*nb];
4048 py = py0;
4049 py[0] += px0[0];
4050 py[1] += px0[1];
4051 py[2] += px0[2];
4052 py[3] += px0[3];
4053 py[4] += px0[4];
4054 py[5] += px0[5];
4055 py[6] += px0[6];
4056
4057 k++;
4058 j = JA[k];
4059 px0 = x + j * 7; // &x[j*nb];
4060 py = py0;
4061 py[0] += px0[0];
4062 py[1] += px0[1];
4063 py[2] += px0[2];
4064 py[3] += px0[3];
4065 py[4] += px0[4];
4066 py[5] += px0[5];
4067 py[6] += px0[6];
4068
4069 break;
4070
4071 case 4:
4072 k = IA[i];
4073 j = JA[k];
4074 px0 = x + j * 7; // &x[j*nb];
4075 py = py0;
4076 py[0] += px0[0];
4077 py[1] += px0[1];
4078 py[2] += px0[2];
4079 py[3] += px0[3];
4080 py[4] += px0[4];
4081 py[5] += px0[5];
4082 py[6] += px0[6];
4083
4084 k++;
4085 j = JA[k];
4086 px0 = x + j * 7; // &x[j*nb];
4087 py = py0;
4088 py[0] += px0[0];
4089 py[1] += px0[1];
4090 py[2] += px0[2];
4091 py[3] += px0[3];
4092 py[4] += px0[4];
4093 py[5] += px0[5];
4094 py[6] += px0[6];
4095
4096 k++;
4097 j = JA[k];
4098 px0 = x + j * 7; // &x[j*nb];
4099 py = py0;
4100 py[0] += px0[0];
4101 py[1] += px0[1];
4102 py[2] += px0[2];
4103 py[3] += px0[3];
4104 py[4] += px0[4];
4105 py[5] += px0[5];
4106 py[6] += px0[6];
4107
4108 k++;
4109 j = JA[k];
4110 px0 = x + j * 7; // &x[j*nb];
4111 py = py0;
4112 py[0] += px0[0];
4113 py[1] += px0[1];
4114 py[2] += px0[2];
4115 py[3] += px0[3];
4116 py[4] += px0[4];
4117 py[5] += px0[5];
4118 py[6] += px0[6];
4119
4120 break;
4121
4122 case 5:
4123 k = IA[i];
4124 j = JA[k];
4125 px0 = x + j * 7; // &x[j*nb];
4126 py = py0;
4127 py[0] += px0[0];
4128 py[1] += px0[1];
4129 py[2] += px0[2];
4130 py[3] += px0[3];
4131 py[4] += px0[4];
4132 py[5] += px0[5];
4133 py[6] += px0[6];
4134
4135 k++;
4136 j = JA[k];
4137 px0 = x + j * 7; // &x[j*nb];
4138 py = py0;
4139 py[0] += px0[0];
4140 py[1] += px0[1];
4141 py[2] += px0[2];
4142 py[3] += px0[3];
4143 py[4] += px0[4];
4144 py[5] += px0[5];
4145 py[6] += px0[6];
4146
4147 k++;
4148 j = JA[k];
4149 px0 = x + j * 7; // &x[j*nb];
4150 py = py0;
4151 py[0] += px0[0];
4152 py[1] += px0[1];
4153 py[2] += px0[2];
4154 py[3] += px0[3];
4155 py[4] += px0[4];
4156 py[5] += px0[5];
4157 py[6] += px0[6];
4158
4159 k++;
4160 j = JA[k];
4161 px0 = x + j * 7; // &x[j*nb];
4162 py = py0;
4163 py[0] += px0[0];
4164 py[1] += px0[1];
4165 py[2] += px0[2];
4166 py[3] += px0[3];
4167 py[4] += px0[4];
4168 py[5] += px0[5];
4169 py[6] += px0[6];
4170
4171 k++;
4172 j = JA[k];
4173 px0 = x + j * 7; // &x[j*nb];
4174 py = py0;
4175 py[0] += px0[0];
4176 py[1] += px0[1];
4177 py[2] += px0[2];
4178 py[3] += px0[3];
4179 py[4] += px0[4];
4180 py[5] += px0[5];
4181 py[6] += px0[6];
4182
4183 break;
4184
4185 case 6:
4186 k = IA[i];
4187 j = JA[k];
4188 px0 = x + j * 7; // &x[j*nb];
4189 py = py0;
4190 py[0] += px0[0];
4191 py[1] += px0[1];
4192 py[2] += px0[2];
4193 py[3] += px0[3];
4194 py[4] += px0[4];
4195 py[5] += px0[5];
4196 py[6] += px0[6];
4197
4198 k++;
4199 j = JA[k];
4200 px0 = x + j * 7; // &x[j*nb];
4201 py = py0;
4202 py[0] += px0[0];
4203 py[1] += px0[1];
4204 py[2] += px0[2];
4205 py[3] += px0[3];
4206 py[4] += px0[4];
4207 py[5] += px0[5];
4208 py[6] += px0[6];
4209
4210 k++;
4211 j = JA[k];
4212 px0 = x + j * 7; // &x[j*nb];
4213 py = py0;
4214 py[0] += px0[0];
4215 py[1] += px0[1];
4216 py[2] += px0[2];
4217 py[3] += px0[3];
4218 py[4] += px0[4];
4219 py[5] += px0[5];
4220 py[6] += px0[6];
4221
4222 k++;
4223 j = JA[k];
4224 px0 = x + j * 7; // &x[j*nb];
4225 py = py0;
4226 py[0] += px0[0];
4227 py[1] += px0[1];
4228 py[2] += px0[2];
4229 py[3] += px0[3];
4230 py[4] += px0[4];
4231 py[5] += px0[5];
4232 py[6] += px0[6];
4233
4234 k++;
4235 j = JA[k];
4236 px0 = x + j * 7; // &x[j*nb];
4237 py = py0;
4238 py[0] += px0[0];
4239 py[1] += px0[1];
4240 py[2] += px0[2];
4241 py[3] += px0[3];
4242 py[4] += px0[4];
4243 py[5] += px0[5];
4244 py[6] += px0[6];
4245
4246 k++;
4247 j = JA[k];
4248 px0 = x + j * 7; // &x[j*nb];
4249 py = py0;
4250 py[0] += px0[0];
4251 py[1] += px0[1];
4252 py[2] += px0[2];
4253 py[3] += px0[3];
4254 py[4] += px0[4];
4255 py[5] += px0[5];
4256 py[6] += px0[6];
4257
4258 break;
4259
4260 case 7:
4261 k = IA[i];
4262 j = JA[k];
4263 px0 = x + j * 7; // &x[j*nb];
4264 py = py0;
4265 py[0] += px0[0];
4266 py[1] += px0[1];
4267 py[2] += px0[2];
4268 py[3] += px0[3];
4269 py[4] += px0[4];
4270 py[5] += px0[5];
4271 py[6] += px0[6];
4272
4273 k++;
4274 j = JA[k];
4275 px0 = x + j * 7; // &x[j*nb];
4276 py = py0;
4277 py[0] += px0[0];
4278 py[1] += px0[1];
4279 py[2] += px0[2];
4280 py[3] += px0[3];
4281 py[4] += px0[4];
4282 py[5] += px0[5];
4283 py[6] += px0[6];
4284
4285 k++;
4286 j = JA[k];
4287 px0 = x + j * 7; // &x[j*nb];
4288 py = py0;
4289 py[0] += px0[0];
4290 py[1] += px0[1];
4291 py[2] += px0[2];
4292 py[3] += px0[3];
4293 py[4] += px0[4];
4294 py[5] += px0[5];
4295 py[6] += px0[6];
4296
4297 k++;
4298 j = JA[k];
4299 px0 = x + j * 7; // &x[j*nb];
4300 py = py0;
4301 py[0] += px0[0];
4302 py[1] += px0[1];
4303 py[2] += px0[2];
4304 py[3] += px0[3];
4305 py[4] += px0[4];
4306 py[5] += px0[5];
4307 py[6] += px0[6];
4308
4309 k++;
4310 j = JA[k];
4311 px0 = x + j * 7; // &x[j*nb];
4312 py = py0;
4313 py[0] += px0[0];
4314 py[1] += px0[1];
4315 py[2] += px0[2];
4316 py[3] += px0[3];
4317 py[4] += px0[4];
4318 py[5] += px0[5];
4319 py[6] += px0[6];
4320
4321 k++;
4322 j = JA[k];
4323 px0 = x + j * 7; // &x[j*nb];
4324 py = py0;
4325 py[0] += px0[0];
4326 py[1] += px0[1];
4327 py[2] += px0[2];
4328 py[3] += px0[3];
4329 py[4] += px0[4];
4330 py[5] += px0[5];
4331 py[6] += px0[6];
4332
4333 k++;
4334 j = JA[k];
4335 px0 = x + j * 7; // &x[j*nb];
4336 py = py0;
4337 py[0] += px0[0];
4338 py[1] += px0[1];
4339 py[2] += px0[2];
4340 py[3] += px0[3];
4341 py[4] += px0[4];
4342 py[5] += px0[5];
4343 py[6] += px0[6];
4344
4345 break;
4346
4347 default:
4348 for (k = IA[i]; k < IA[i + 1]; ++k) {
4349 j = JA[k];
4350 px0 = x + j * 7; // &x[j*nb];
4351 py = py0;
4352 py[0] += px0[0];
4353 py[1] += px0[1];
4354 py[2] += px0[2];
4355 py[3] += px0[3];
4356 py[4] += px0[4];
4357 py[5] += px0[5];
4358 py[6] += px0[6];
4359 }
4360 break;
4361 }
4362 }
4363 }
4364 }
4365 break;
4366
4367 default:
4368 {
4369 if (use_openmp) {
4370#ifdef _OPENMP
4371#pragma omp parallel private(myid, mybegin, myend, i, py0, num_nnz_row, k, j, pA, px0, \
4372 py)
4373 {
4374 myid = omp_get_thread_num();
4375 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
4376 for (i = mybegin; i < myend; ++i) {
4377 py0 = &y[i * nb];
4378 num_nnz_row = IA[i + 1] - IA[i];
4379 switch (num_nnz_row) {
4380
4381 case 3:
4382 k = IA[i];
4383 j = JA[k];
4384 px0 = x + j * nb; // &x[j*nb];
4385 py = py0;
4386 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4387
4388 k++;
4389 j = JA[k];
4390 px0 = x + j * nb; // &x[j*nb];
4391 py = py0;
4392 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4393
4394 k++;
4395 j = JA[k];
4396 px0 = x + j * nb; // &x[j*nb];
4397 py = py0;
4398 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4399
4400 break;
4401
4402 case 4:
4403 k = IA[i];
4404 j = JA[k];
4405 px0 = x + j * nb; // &x[j*nb];
4406 py = py0;
4407 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4408
4409 k++;
4410 j = JA[k];
4411 px0 = x + j * nb; // &x[j*nb];
4412 py = py0;
4413 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4414
4415 k++;
4416 j = JA[k];
4417 px0 = x + j * nb; // &x[j*nb];
4418 py = py0;
4419 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4420
4421 k++;
4422 j = JA[k];
4423 px0 = x + j * nb; // &x[j*nb];
4424 py = py0;
4425 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4426
4427 break;
4428
4429 case 5:
4430 k = IA[i];
4431 j = JA[k];
4432 px0 = x + j * nb; // &x[j*nb];
4433 py = py0;
4434 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4435
4436 k++;
4437 j = JA[k];
4438 px0 = x + j * nb; // &x[j*nb];
4439 py = py0;
4440 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4441
4442 k++;
4443 j = JA[k];
4444 px0 = x + j * nb; // &x[j*nb];
4445 py = py0;
4446 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4447
4448 k++;
4449 j = JA[k];
4450 px0 = x + j * nb; // &x[j*nb];
4451 py = py0;
4452 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4453
4454 k++;
4455 j = JA[k];
4456 px0 = x + j * nb; // &x[j*nb];
4457 py = py0;
4458 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4459
4460 break;
4461
4462 case 6:
4463 k = IA[i];
4464 j = JA[k];
4465 px0 = x + j * nb; // &x[j*nb];
4466 py = py0;
4467 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4468
4469 k++;
4470 j = JA[k];
4471 px0 = x + j * nb; // &x[j*nb];
4472 py = py0;
4473 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4474
4475 k++;
4476 j = JA[k];
4477 px0 = x + j * nb; // &x[j*nb];
4478 py = py0;
4479 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4480
4481 k++;
4482 j = JA[k];
4483 px0 = x + j * nb; // &x[j*nb];
4484 py = py0;
4485 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4486
4487 k++;
4488 j = JA[k];
4489 px0 = x + j * nb; // &x[j*nb];
4490 py = py0;
4491 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4492
4493 k++;
4494 j = JA[k];
4495 px0 = x + j * nb; // &x[j*nb];
4496 py = py0;
4497 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4498
4499 break;
4500
4501 case 7:
4502 k = IA[i];
4503 j = JA[k];
4504 px0 = x + j * nb; // &x[j*nb];
4505 py = py0;
4506 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4507
4508 k++;
4509 j = JA[k];
4510 px0 = x + j * nb; // &x[j*nb];
4511 py = py0;
4512 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4513
4514 k++;
4515 j = JA[k];
4516 px0 = x + j * nb; // &x[j*nb];
4517 py = py0;
4518 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4519
4520 k++;
4521 j = JA[k];
4522 px0 = x + j * nb; // &x[j*nb];
4523 py = py0;
4524 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4525
4526 k++;
4527 j = JA[k];
4528 px0 = x + j * nb; // &x[j*nb];
4529 py = py0;
4530 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4531
4532 k++;
4533 j = JA[k];
4534 px0 = x + j * nb; // &x[j*nb];
4535 py = py0;
4536 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4537
4538 k++;
4539 j = JA[k];
4540 px0 = x + j * nb; // &x[j*nb];
4541 py = py0;
4542 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4543
4544 break;
4545
4546 default:
4547 for (k = IA[i]; k < IA[i + 1]; ++k) {
4548 j = JA[k];
4549 px0 = x + j * nb; // &x[j*nb];
4550 py = py0;
4551 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4552 }
4553 break;
4554 }
4555 }
4556 }
4557#endif
4558 } else {
4559 for (i = 0; i < ROW; ++i) {
4560 py0 = &y[i * nb];
4561 num_nnz_row = IA[i + 1] - IA[i];
4562 switch (num_nnz_row) {
4563
4564 case 3:
4565 k = IA[i];
4566 j = JA[k];
4567 px0 = x + j * nb; // &x[j*nb];
4568 py = py0;
4569 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4570
4571 k++;
4572 j = JA[k];
4573 px0 = x + j * nb; // &x[j*nb];
4574 py = py0;
4575 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4576
4577 k++;
4578 j = JA[k];
4579 px0 = x + j * nb; // &x[j*nb];
4580 py = py0;
4581 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4582
4583 break;
4584
4585 case 4:
4586 k = IA[i];
4587 j = JA[k];
4588 px0 = x + j * nb; // &x[j*nb];
4589 py = py0;
4590 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4591
4592 k++;
4593 j = JA[k];
4594 px0 = x + j * nb; // &x[j*nb];
4595 py = py0;
4596 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4597
4598 k++;
4599 j = JA[k];
4600 px0 = x + j * nb; // &x[j*nb];
4601 py = py0;
4602 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4603
4604 k++;
4605 j = JA[k];
4606 px0 = x + j * nb; // &x[j*nb];
4607 py = py0;
4608 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4609
4610 break;
4611
4612 case 5:
4613 k = IA[i];
4614 j = JA[k];
4615 px0 = x + j * nb; // &x[j*nb];
4616 py = py0;
4617 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4618
4619 k++;
4620 j = JA[k];
4621 px0 = x + j * nb; // &x[j*nb];
4622 py = py0;
4623 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4624
4625 k++;
4626 j = JA[k];
4627 px0 = x + j * nb; // &x[j*nb];
4628 py = py0;
4629 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4630
4631 k++;
4632 j = JA[k];
4633 px0 = x + j * nb; // &x[j*nb];
4634 py = py0;
4635 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4636
4637 k++;
4638 j = JA[k];
4639 px0 = x + j * nb; // &x[j*nb];
4640 py = py0;
4641 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4642
4643 break;
4644
4645 case 6:
4646 k = IA[i];
4647 j = JA[k];
4648 px0 = x + j * nb; // &x[j*nb];
4649 py = py0;
4650 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4651
4652 k++;
4653 j = JA[k];
4654 px0 = x + j * nb; // &x[j*nb];
4655 py = py0;
4656 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4657
4658 k++;
4659 j = JA[k];
4660 px0 = x + j * nb; // &x[j*nb];
4661 py = py0;
4662 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4663
4664 k++;
4665 j = JA[k];
4666 px0 = x + j * nb; // &x[j*nb];
4667 py = py0;
4668 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4669
4670 k++;
4671 j = JA[k];
4672 px0 = x + j * nb; // &x[j*nb];
4673 py = py0;
4674 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4675
4676 k++;
4677 j = JA[k];
4678 px0 = x + j * nb; // &x[j*nb];
4679 py = py0;
4680 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4681
4682 break;
4683
4684 case 7:
4685 k = IA[i];
4686 j = JA[k];
4687 px0 = x + j * nb; // &x[j*nb];
4688 py = py0;
4689 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4690
4691 k++;
4692 j = JA[k];
4693 px0 = x + j * nb; // &x[j*nb];
4694 py = py0;
4695 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4696
4697 k++;
4698 j = JA[k];
4699 px0 = x + j * nb; // &x[j*nb];
4700 py = py0;
4701 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4702
4703 k++;
4704 j = JA[k];
4705 px0 = x + j * nb; // &x[j*nb];
4706 py = py0;
4707 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4708
4709 k++;
4710 j = JA[k];
4711 px0 = x + j * nb; // &x[j*nb];
4712 py = py0;
4713 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4714
4715 k++;
4716 j = JA[k];
4717 px0 = x + j * nb; // &x[j*nb];
4718 py = py0;
4719 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4720
4721 k++;
4722 j = JA[k];
4723 px0 = x + j * nb; // &x[j*nb];
4724 py = py0;
4725 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4726
4727 break;
4728
4729 default:
4730 for (k = IA[i]; k < IA[i + 1]; ++k) {
4731 j = JA[k];
4732 px0 = x + j * nb; // &x[j*nb];
4733 py = py0;
4734 fasp_blas_darray_axpy(nb, 1.0, px0, py);
4735 }
4736 break;
4737 }
4738 }
4739 }
4740 }
4741 break;
4742 }
4743}
4744
4760void fasp_blas_dbsr_mxm2(const dBSRmat* A, const dBSRmat* B, dBSRmat* C)
4761{
4762
4763 INT i, j, k, l, count;
4764 INT* JD = (INT*)fasp_mem_calloc(B->COL, sizeof(INT));
4765
4766 const INT nb = A->nb;
4767 const INT nb2 = nb * nb;
4768
4769 // check A and B see if there are compatible for multiplication
4770 if ((A->COL != B->ROW) && (A->nb != B->nb)) {
4771 printf("### ERROR: Matrix sizes do not match!\n");
4772 fasp_chkerr(ERROR_MAT_SIZE, __FUNCTION__);
4773 }
4774
4775 C->ROW = A->ROW;
4776 C->COL = B->COL;
4777 C->nb = A->nb;
4779
4780 C->val = NULL;
4781 C->JA = NULL;
4782 C->IA = (INT*)fasp_mem_calloc(C->ROW + 1, sizeof(INT));
4783
4784 REAL* temp = (REAL*)fasp_mem_calloc(nb2, sizeof(REAL));
4785
4786 for (i = 0; i < B->COL; ++i) JD[i] = -1;
4787
4788 // step 1: Find first the structure IA of C
4789 for (i = 0; i < C->ROW; ++i) {
4790 count = 0;
4791
4792 for (k = A->IA[i]; k < A->IA[i + 1]; ++k) {
4793 for (j = B->IA[A->JA[k]]; j < B->IA[A->JA[k] + 1]; ++j) {
4794 for (l = 0; l < count; l++) {
4795 if (JD[l] == B->JA[j]) break;
4796 }
4797
4798 if (l == count) {
4799 JD[count] = B->JA[j];
4800 count++;
4801 }
4802 }
4803 }
4804 C->IA[i + 1] = count;
4805 for (j = 0; j < count; ++j) {
4806 JD[j] = -1;
4807 }
4808 }
4809
4810 for (i = 0; i < C->ROW; ++i) C->IA[i + 1] += C->IA[i];
4811
4812 // step 2: Find the structure JA of C
4813 INT countJD;
4814
4815 C->JA = (INT*)fasp_mem_calloc(C->IA[C->ROW], sizeof(INT));
4816
4817 for (i = 0; i < C->ROW; ++i) {
4818 countJD = 0;
4819 count = C->IA[i];
4820 for (k = A->IA[i]; k < A->IA[i + 1]; ++k) {
4821 for (j = B->IA[A->JA[k]]; j < B->IA[A->JA[k] + 1]; ++j) {
4822 for (l = 0; l < countJD; l++) {
4823 if (JD[l] == B->JA[j]) break;
4824 }
4825
4826 if (l == countJD) {
4827 C->JA[count] = B->JA[j];
4828 JD[countJD] = B->JA[j];
4829 count++;
4830 countJD++;
4831 }
4832 }
4833 }
4834
4835 // for (j=0;j<countJD;++j) JD[j]=-1;
4836 fasp_iarray_set(countJD, JD, -1);
4837 }
4838
4839 fasp_mem_free(JD);
4840 JD = NULL;
4841
4842 // step 3: Find the structure A of C
4843 C->val = (REAL*)fasp_mem_calloc((C->IA[C->ROW]) * nb2, sizeof(REAL));
4844
4845 for (i = 0; i < C->ROW; ++i) {
4846 for (j = C->IA[i]; j < C->IA[i + 1]; ++j) {
4847
4848 fasp_darray_set(nb2, C->val + (j * nb2), 0x0);
4849
4850 for (k = A->IA[i]; k < A->IA[i + 1]; ++k) {
4851 for (l = B->IA[A->JA[k]]; l < B->IA[A->JA[k] + 1]; l++) {
4852 if (B->JA[l] == C->JA[j]) {
4853 fasp_blas_smat_mul(A->val + (k * nb2), B->val + (l * nb2), temp,
4854 nb);
4855 fasp_blas_darray_axpy(nb2, 1.0, temp, C->val + (j * nb2));
4856 } // end if
4857 } // end for l
4858 } // end for k
4859 } // end for j
4860 } // end for i
4861
4862 C->NNZ = C->IA[C->ROW] - C->IA[0];
4863
4864 fasp_mem_free(temp);
4865 temp = NULL;
4866}
4867
4883void fasp_blas_dbsr_mxm(const dBSRmat* A, const dBSRmat* B, dBSRmat* C)
4884{
4885 REAL* A_data = A->val;
4886 INT* A_i = A->IA;
4887 INT* A_j = A->JA;
4888 INT nrows_A = A->ROW;
4889 INT ncols_A = A->COL;
4890
4891 REAL* B_data = B->val;
4892 INT* B_i = B->IA;
4893 INT* B_j = B->JA;
4894 INT nrows_B = B->ROW;
4895 INT ncols_B = B->COL;
4896
4897 INT ia, ib, ic, ja, jb;
4898 INT num_nonzeros = 0;
4899 INT row_start, counter;
4900 REAL *a_entry, *b_entry;
4901 INT* B_marker = NULL;
4902
4903 const INT nb = A->nb;
4904 const INT nb2 = nb * nb;
4905
4906 B_marker = (INT*)fasp_mem_calloc(ncols_B, sizeof(INT));
4907
4908 // check A and B see if there are compatible for multiplication
4909 if ((A->COL != B->ROW) && (A->nb != B->nb)) {
4910 printf("### ERROR: Matrix sizes do not match!\n");
4911 fasp_chkerr(ERROR_MAT_SIZE, __FUNCTION__);
4912 }
4913
4914 C->ROW = A->ROW;
4915 C->COL = B->COL;
4916 C->nb = A->nb;
4918
4919 C->val = NULL;
4920 C->JA = NULL;
4921 C->IA = (INT*)fasp_mem_calloc(C->ROW + 1, sizeof(INT));
4922
4923 /* initialize the marker array */
4924 fasp_iarray_set(ncols_B, B_marker, -1);
4925
4926 /* step 1: obtain the nonzero-structure of C */
4927 for (ic = 0; ic < nrows_A; ic++) {
4928 for (ia = A_i[ic]; ia < A_i[ic + 1]; ia++) {
4929 ja = A_j[ia];
4930 for (ib = B_i[ja]; ib < B_i[ja + 1]; ib++) {
4931 jb = B_j[ib];
4932 if (B_marker[jb] != ic) {
4933 B_marker[jb] = ic;
4934 num_nonzeros++;
4935 }
4936 }
4937 }
4938 C->IA[ic + 1] = num_nonzeros;
4939 }
4940
4941 C->NNZ = num_nonzeros;
4942 C->JA = (INT*)fasp_mem_calloc(num_nonzeros, sizeof(INT));
4943 C->val = (REAL*)fasp_mem_calloc(num_nonzeros * nb2, sizeof(REAL));
4944
4945 /* initialize the marker array again */
4946 fasp_iarray_set(ncols_B, B_marker, -1);
4947
4948 REAL* temp = (REAL*)fasp_mem_calloc(nb2, sizeof(REAL));
4949
4950 /* step 2: fill in the nonzero entries of C */
4951 counter = 0;
4952 for (ic = 0; ic < nrows_A; ic++) {
4953 row_start = C->IA[ic];
4954 for (ia = A_i[ic]; ia < A_i[ic + 1]; ia++) {
4955 ja = A_j[ia];
4956 a_entry = A_data + ia * nb2;
4957 for (ib = B_i[ja]; ib < B_i[ja + 1]; ib++) {
4958 jb = B_j[ib];
4959 b_entry = B_data + ib * nb2;
4960 // temp = a_entry * b_entry
4961 fasp_blas_smat_mul(a_entry, b_entry, temp, nb);
4962 if (B_marker[jb] < row_start) {
4963 B_marker[jb] = counter;
4964 C->JA[B_marker[jb]] = jb;
4965 // C->val[B_marker[jb]*nb2] = a_entry * b_entry;
4966 memcpy(C->val + B_marker[jb] * nb2, temp, nb2 * sizeof(REAL));
4967 counter++;
4968 } else {
4969 // C->val[B_marker[jb]*nb2] += a_entry * b_entry;
4970 fasp_blas_darray_axpy(nb2, 1.0, temp, C->val + B_marker[jb] * nb2);
4971 }
4972 }
4973 }
4974 }
4975
4976 fasp_mem_free(temp);
4977 temp = NULL;
4978 fasp_mem_free(B_marker);
4979 B_marker = NULL;
4980}
4981
4997void fasp_blas_dbsr_mxm_adb(const dBSRmat* A, dvector* D, const dBSRmat* B, dBSRmat* C)
4998{
4999 REAL* A_data = A->val;
5000 INT* A_i = A->IA;
5001 INT* A_j = A->JA;
5002 INT nrows_A = A->ROW;
5003 INT ncols_A = A->COL;
5004
5005 REAL* B_data = B->val;
5006 INT* B_i = B->IA;
5007 INT* B_j = B->JA;
5008 INT nrows_B = B->ROW;
5009 INT ncols_B = B->COL;
5010
5011 INT ia, ib, ic, ja, jb;
5012 INT num_nonzeros = 0;
5013 INT row_start, counter;
5014 REAL *a_entry, *b_entry, *d_entry;
5015 INT* B_marker = NULL;
5016
5017 const INT nb = A->nb;
5018 const INT nb2 = nb * nb;
5019
5020 B_marker = (INT*)fasp_mem_calloc(ncols_B, sizeof(INT));
5021
5022 // check A and B see if there are compatible for multiplication
5023 if ((A->COL != B->ROW) && (A->nb != B->nb) && (D->row / nb2 != B->ROW)) {
5024 printf("### ERROR: Matrix sizes do not match!\n");
5025 fasp_chkerr(ERROR_MAT_SIZE, __FUNCTION__);
5026 }
5027
5028 C->ROW = A->ROW;
5029 C->COL = B->COL;
5030 C->nb = A->nb;
5032
5033 C->val = NULL;
5034 C->JA = NULL;
5035 C->IA = (INT*)fasp_mem_calloc(C->ROW + 1, sizeof(INT));
5036
5037 /* initialize the marker array */
5038 fasp_iarray_set(ncols_B, B_marker, -1);
5039
5040 /* step 1: obtain the nonzero-structure of C */
5041 for (ic = 0; ic < nrows_A; ic++) {
5042 for (ia = A_i[ic]; ia < A_i[ic + 1]; ia++) {
5043 ja = A_j[ia];
5044 for (ib = B_i[ja]; ib < B_i[ja + 1]; ib++) {
5045 jb = B_j[ib];
5046 if (B_marker[jb] != ic) {
5047 B_marker[jb] = ic;
5048 num_nonzeros++;
5049 }
5050 }
5051 }
5052 C->IA[ic + 1] = num_nonzeros;
5053 }
5054
5055 C->NNZ = num_nonzeros;
5056 C->JA = (INT*)fasp_mem_calloc(num_nonzeros, sizeof(INT));
5057 C->val = (REAL*)fasp_mem_calloc(num_nonzeros * nb2, sizeof(REAL));
5058
5059 /* initialize the marker array again */
5060 fasp_iarray_set(ncols_B, B_marker, -1);
5061
5062 REAL* temp = (REAL*)fasp_mem_calloc(nb2, sizeof(REAL));
5063 REAL* temp1 = (REAL*)fasp_mem_calloc(nb2, sizeof(REAL));
5064
5065 /* step 2: fill in the nonzero entries of C */
5066 counter = 0;
5067 for (ic = 0; ic < nrows_A; ic++) {
5068 row_start = C->IA[ic];
5069 for (ia = A_i[ic]; ia < A_i[ic + 1]; ia++) {
5070 ja = A_j[ia];
5071 a_entry = A_data + ia * nb2;
5072 d_entry = D->val + ja * nb2; // d
5073 // temp1 = a_entry * d_entry
5074 fasp_blas_smat_mul(a_entry, d_entry, temp1, nb);
5075 for (ib = B_i[ja]; ib < B_i[ja + 1]; ib++) {
5076 jb = B_j[ib];
5077 b_entry = B_data + ib * nb2;
5078 // temp = (a_entry * d_entry) * b_entry = temp1 * b_entry
5079 fasp_blas_smat_mul(temp1, b_entry, temp, nb);
5080 if (B_marker[jb] < row_start) {
5081 B_marker[jb] = counter;
5082 C->JA[B_marker[jb]] = jb;
5083 // C->val[B_marker[jb]*nb2] = a_entry * b_entry;
5084 memcpy(C->val + B_marker[jb] * nb2, temp, nb2 * sizeof(REAL));
5085 counter++;
5086 } else {
5087 // C->val[B_marker[jb]*nb2] += a_entry * b_entry;
5088 fasp_blas_darray_axpy(nb2, 1.0, temp, C->val + B_marker[jb] * nb2);
5089 }
5090 }
5091 }
5092 }
5093
5094 fasp_mem_free(temp);
5095 temp = NULL;
5096 fasp_mem_free(temp1);
5097 temp1 = NULL;
5098 fasp_mem_free(B_marker);
5099 B_marker = NULL;
5100}
5101
5117void fasp_blas_dbsr_schur(
5118 dvector* D2, const dBSRmat* A, dvector* D1, const dBSRmat* B, dBSRmat* C)
5119{
5120 REAL* A_data = A->val;
5121 INT* A_i = A->IA;
5122 INT* A_j = A->JA;
5123 INT nrows_A = A->ROW;
5124 INT ncols_A = A->COL;
5125
5126 REAL* B_data = B->val;
5127 INT* B_i = B->IA;
5128 INT* B_j = B->JA;
5129 INT nrows_B = B->ROW;
5130 INT ncols_B = B->COL;
5131
5132 INT ia, ib, ic, ja, jb, i;
5133 INT num_nonzeros = 0;
5134 INT row_start, counter;
5135 REAL *a_entry, *b_entry, *d1_entry, *d2_entry;
5136 INT* B_marker = NULL;
5137
5138 const INT nb = A->nb;
5139 const INT nb2 = nb * nb;
5140
5141 B_marker = (INT*)fasp_mem_calloc(ncols_B, sizeof(INT));
5142
5143 // check A and B see if there are compatible for multiplication
5144 if ((A->COL != B->ROW) && (A->nb != B->nb) && (D1->row / nb2 != B->ROW)) {
5145 printf("### ERROR: Matrix sizes do not match!\n");
5146 fasp_chkerr(ERROR_MAT_SIZE, __FUNCTION__);
5147 }
5148
5149 C->ROW = A->ROW;
5150 C->COL = B->COL;
5151 C->nb = A->nb;
5153
5154 C->val = NULL;
5155 C->JA = NULL;
5156 C->IA = (INT*)fasp_mem_calloc(C->ROW + 1, sizeof(INT));
5157
5158 /* initialize the marker array */
5159 fasp_iarray_set(ncols_B, B_marker, -1);
5160
5161 /* step 1: obtain the nonzero-structure of C */
5162 for (ic = 0; ic < nrows_A; ic++) {
5163 for (ia = A_i[ic]; ia < A_i[ic + 1]; ia++) {
5164 ja = A_j[ia];
5165 for (ib = B_i[ja]; ib < B_i[ja + 1]; ib++) {
5166 jb = B_j[ib];
5167 if (B_marker[jb] != ic) {
5168 B_marker[jb] = ic;
5169 num_nonzeros++;
5170 }
5171 }
5172 }
5173 C->IA[ic + 1] = num_nonzeros;
5174 }
5175
5176 C->NNZ = num_nonzeros;
5177 C->JA = (INT*)fasp_mem_calloc(num_nonzeros, sizeof(INT));
5178 C->val = (REAL*)fasp_mem_calloc(num_nonzeros * nb2, sizeof(REAL));
5179
5180 /* initialize the marker array again */
5181 fasp_iarray_set(ncols_B, B_marker, -1);
5182
5183 REAL* temp = (REAL*)fasp_mem_calloc(nb2, sizeof(REAL));
5184 REAL* temp1 = (REAL*)fasp_mem_calloc(nb2, sizeof(REAL));
5185 // REAL* d1_entry_tmp = (REAL*)fasp_mem_calloc(nb2, sizeof(REAL));
5186
5187 dvector D1_neg = fasp_dvec_create(D1->row);
5188 for (i = 0; i < D1->row; i++) {
5189 D1_neg.val[i] = -D1->val[i];
5190 }
5191
5192 /* step 2: fill in the nonzero entries of C */
5193 counter = 0;
5194 for (ic = 0; ic < nrows_A; ic++) {
5195 row_start = C->IA[ic];
5196 d2_entry = D2->val + ic * nb2; // d
5197 for (ia = A_i[ic]; ia < A_i[ic + 1]; ia++) {
5198 ja = A_j[ia];
5199 a_entry = A_data + ia * nb2;
5200 d1_entry = D1_neg.val + ja * nb2; // - d1_entry
5201 // temp1 = - a_entry * d1_entry
5202 fasp_blas_smat_mul(a_entry, d1_entry, temp1, nb);
5203 for (ib = B_i[ja]; ib < B_i[ja + 1]; ib++) {
5204 jb = B_j[ib];
5205 b_entry = B_data + ib * nb2;
5206 // temp = - (a_entry * d1_entry) * b_entry = temp1 * b_entry
5207 fasp_blas_smat_mul(temp1, b_entry, temp, nb);
5208 if (B_marker[jb] < row_start) {
5209 B_marker[jb] = counter;
5210 C->JA[B_marker[jb]] = jb;
5211 if (ic == jb) { // diag entry
5212 // C->val[B_marker[jb] * nb2] = d2_entry - (a_entry * d1_entry)
5213 // * b_entry;
5214 fasp_blas_smat_add(d2_entry, temp, nb, 1.0, 1.0,
5215 C->val + B_marker[jb] * nb2);
5216 } else { // nondiag entries
5217 // C->val[B_marker[jb] * nb2] = -(a_entry * d1_entry) * b_entry;
5218 memcpy(C->val + B_marker[jb] * nb2, temp, nb2 * sizeof(REAL));
5219 }
5220 counter++;
5221 } else {
5222 // C->val[B_marker[jb]*nb2] += - (a_entry * d1_entry) * b_entry;
5223 fasp_blas_darray_axpy(nb2, 1.0, temp, C->val + B_marker[jb] * nb2);
5224 }
5225 }
5226 }
5227 }
5228
5229 fasp_mem_free(temp);
5230 temp = NULL;
5231 fasp_mem_free(temp1);
5232 temp1 = NULL;
5233 fasp_mem_free(B_marker);
5234 B_marker = NULL;
5235 fasp_dvec_free(&D1_neg);
5236}
5237
5256 const dBSRmat* A,
5257 const dBSRmat* P,
5258 dBSRmat* B)
5259{
5260 const INT row = R->ROW, col = P->COL, nb = A->nb, nb2 = A->nb * A->nb;
5261
5262 const REAL *rj = R->val, *aj = A->val, *pj = P->val;
5263 const INT * ir = R->IA, *ia = A->IA, *ip = P->IA;
5264 const INT * jr = R->JA, *ja = A->JA, *jp = P->JA;
5265
5266 REAL* acj;
5267 INT * iac, *jac;
5268
5269 INT nB = A->NNZ;
5270 INT i, i1, j, jj, k, length;
5271 INT begin_row, end_row, begin_rowA, end_rowA, begin_rowR, end_rowR;
5272 INT istart, iistart, count;
5273
5274 INT* index = (INT*)fasp_mem_calloc(A->COL, sizeof(INT));
5275
5276 REAL* smat_tmp = (REAL*)fasp_mem_calloc(nb2, sizeof(REAL));
5277
5278 INT* iindex = (INT*)fasp_mem_calloc(col, sizeof(INT));
5279
5280 for (i = 0; i < A->COL; ++i) index[i] = -2;
5281
5282 memcpy(iindex, index, col * sizeof(INT));
5283
5284 jac = (INT*)fasp_mem_calloc(nB, sizeof(INT));
5285
5286 iac = (INT*)fasp_mem_calloc(row + 1, sizeof(INT));
5287
5288 REAL* temp = (REAL*)fasp_mem_calloc(A->COL * nb2, sizeof(REAL));
5289
5290 iac[0] = 0;
5291
5292 // First loop: form sparsity pattern of R*A*P
5293 for (i = 0; i < row; ++i) {
5294 // reset istart and length at the beginning of each loop
5295 istart = -1;
5296 length = 0;
5297 i1 = i + 1;
5298
5299 // go across the rows in R
5300 begin_rowR = ir[i];
5301 end_rowR = ir[i1];
5302 for (jj = begin_rowR; jj < end_rowR; ++jj) {
5303 j = jr[jj];
5304 // for each column in A
5305 begin_rowA = ia[j];
5306 end_rowA = ia[j + 1];
5307 for (k = begin_rowA; k < end_rowA; ++k) {
5308 if (index[ja[k]] == -2) {
5309 index[ja[k]] = istart;
5310 istart = ja[k];
5311 ++length;
5312 }
5313 }
5314 }
5315
5316 // book-keeping [resetting length and setting iistart]
5317 count = length;
5318 iistart = -1;
5319 length = 0;
5320
5321 // use each column that would have resulted from R*A
5322 for (j = 0; j < count; ++j) {
5323 jj = istart;
5324 istart = index[istart];
5325 index[jj] = -2;
5326
5327 // go across the row of P
5328 begin_row = ip[jj];
5329 end_row = ip[jj + 1];
5330 for (k = begin_row; k < end_row; ++k) {
5331 // pull out the appropriate columns of P
5332 if (iindex[jp[k]] == -2) {
5333 iindex[jp[k]] = iistart;
5334 iistart = jp[k];
5335 ++length;
5336 }
5337 } // end for k
5338 } // end for j
5339
5340 // set B->IA
5341 iac[i1] = iac[i] + length;
5342
5343 if (iac[i1] > nB) {
5344 nB = nB * 2;
5345 jac = (INT*)fasp_mem_realloc(jac, nB * sizeof(INT));
5346 }
5347
5348 // put the correct columns of p into the column list of the products
5349 begin_row = iac[i];
5350 end_row = iac[i1];
5351 for (j = begin_row; j < end_row; ++j) {
5352 // put the value in B->JA
5353 jac[j] = iistart;
5354 // set istart to the next value
5355 iistart = iindex[iistart];
5356 // set the iindex spot to 0
5357 iindex[jac[j]] = -2;
5358 } // end j
5359 } // end i: First loop
5360
5361 jac = (INT*)fasp_mem_realloc(jac, (iac[row]) * sizeof(INT));
5362
5363 acj = (REAL*)fasp_mem_calloc(iac[row] * nb2, sizeof(REAL));
5364
5365 INT* BTindex = (INT*)fasp_mem_calloc(col, sizeof(INT));
5366
5367 // Second loop: compute entries of R*A*P
5368 for (i = 0; i < row; ++i) {
5369 i1 = i + 1;
5370 // each col of B
5371 begin_row = iac[i];
5372 end_row = iac[i1];
5373 for (j = begin_row; j < end_row; ++j) {
5374 BTindex[jac[j]] = j;
5375 }
5376 // reset istart and length at the beginning of each loop
5377 istart = -1;
5378 length = 0;
5379
5380 // go across the rows in R
5381 begin_rowR = ir[i];
5382 end_rowR = ir[i1];
5383 for (jj = begin_rowR; jj < end_rowR; ++jj) {
5384 j = jr[jj];
5385 // for each column in A
5386 begin_rowA = ia[j];
5387 end_rowA = ia[j + 1];
5388 for (k = begin_rowA; k < end_rowA; ++k) {
5389 if (index[ja[k]] == -2) {
5390 index[ja[k]] = istart;
5391 istart = ja[k];
5392 ++length;
5393 }
5394 fasp_blas_smat_mul(&rj[jj * nb2], &aj[k * nb2], smat_tmp, nb);
5395 // fasp_darray_xpy(nb2,&temp[ja[k]*nb2], smat_tmp );
5396 fasp_blas_darray_axpy(nb2, 1.0, smat_tmp, &temp[ja[k] * nb2]);
5397
5398 // temp[ja[k]]+=rj[jj]*aj[k];
5399 // change to X = X+Y*Z
5400 }
5401 }
5402
5403 // book-keeping [resetting length and setting iistart]
5404 // use each column that would have resulted from R*A
5405 for (j = 0; j < length; ++j) {
5406 jj = istart;
5407 istart = index[istart];
5408 index[jj] = -2;
5409
5410 // go across the row of P
5411 begin_row = ip[jj];
5412 end_row = ip[jj + 1];
5413 for (k = begin_row; k < end_row; ++k) {
5414 // pull out the appropriate columns of P
5415 // acj[BTindex[jp[k]]]+=temp[jj]*pj[k];
5416 fasp_blas_smat_mul(&temp[jj * nb2], &pj[k * nb2], smat_tmp, nb);
5417 // fasp_darray_xpy(nb2,&acj[BTindex[jp[k]]*nb2], smat_tmp );
5418 fasp_blas_darray_axpy(nb2, 1.0, smat_tmp, &acj[BTindex[jp[k]] * nb2]);
5419
5420 // change to X = X+Y*Z
5421 }
5422 // temp[jj]=0.0; // change to X[nb,nb] = 0;
5423 fasp_darray_set(nb2, &temp[jj * nb2], 0.0);
5424 }
5425 } // end for i: Second loop
5426 // setup coarse matrix B
5427 B->ROW = row;
5428 B->COL = col;
5429 B->IA = iac;
5430 B->JA = jac;
5431 B->val = acj;
5432 B->NNZ = B->IA[B->ROW] - B->IA[0];
5433
5434 B->nb = A->nb;
5436
5437 fasp_mem_free(temp);
5438 temp = NULL;
5439 fasp_mem_free(index);
5440 index = NULL;
5441 fasp_mem_free(iindex);
5442 iindex = NULL;
5443 fasp_mem_free(BTindex);
5444 BTindex = NULL;
5445 fasp_mem_free(smat_tmp);
5446 smat_tmp = NULL;
5447}
5448
5467 const dBSRmat* A,
5468 const dBSRmat* P,
5469 dBSRmat* B)
5470{
5471 const INT row = R->ROW, col = P->COL, nb = A->nb, nb2 = A->nb * A->nb;
5472
5473 const REAL *rj = R->val, *aj = A->val, *pj = P->val;
5474 const INT * ir = R->IA, *ia = A->IA, *ip = P->IA;
5475 const INT * jr = R->JA, *ja = A->JA, *jp = P->JA;
5476
5477 REAL* acj;
5478 INT * iac, *jac;
5479
5480 INT* Ps_marker = NULL;
5481 INT* As_marker = NULL;
5482
5483#ifdef _OPENMP
5484 INT* P_marker = NULL;
5485 INT* A_marker = NULL;
5486 REAL* smat_tmp = NULL;
5487#endif
5488
5489 INT i, i1, i2, i3, jj1, jj2, jj3;
5490 INT counter, jj_row_begining;
5491
5492 INT nthreads = 1;
5493
5494#ifdef _OPENMP
5495 INT myid, mybegin, myend, Ctemp;
5496 nthreads = fasp_get_num_threads();
5497#endif
5498
5499 INT n_coarse = row;
5500 INT n_fine = A->ROW;
5501 INT coarse_mul_nthreads = n_coarse * nthreads;
5502 INT fine_mul_nthreads = n_fine * nthreads;
5503 INT coarse_add_nthreads = n_coarse + nthreads;
5504 INT minus_one_length = coarse_mul_nthreads + fine_mul_nthreads;
5505 INT total_calloc = minus_one_length + coarse_add_nthreads + nthreads;
5506
5507 Ps_marker = (INT*)fasp_mem_calloc(total_calloc, sizeof(INT));
5508 As_marker = Ps_marker + coarse_mul_nthreads;
5509
5510 /*------------------------------------------------------*
5511 * First Pass: Determine size of B and set up B_i *
5512 *------------------------------------------------------*/
5513 iac = (INT*)fasp_mem_calloc(n_coarse + 1, sizeof(INT));
5514
5515 fasp_iarray_set(minus_one_length, Ps_marker, -1);
5516
5517 REAL* tmp = (REAL*)fasp_mem_calloc(2 * nthreads * nb2, sizeof(REAL));
5518
5519#ifdef _OPENMP
5520 INT* RAP_temp = As_marker + fine_mul_nthreads;
5521 INT* part_end = RAP_temp + coarse_add_nthreads;
5522
5523 if (n_coarse > OPENMP_HOLDS) {
5524#pragma omp parallel for private(myid, mybegin, myend, Ctemp, P_marker, A_marker, \
5525 counter, i, jj_row_begining, jj1, i1, jj2, i2, \
5526 jj3, i3)
5527 for (myid = 0; myid < nthreads; myid++) {
5528 fasp_get_start_end(myid, nthreads, n_coarse, &mybegin, &myend);
5529 P_marker = Ps_marker + myid * n_coarse;
5530 A_marker = As_marker + myid * n_fine;
5531 counter = 0;
5532 for (i = mybegin; i < myend; ++i) {
5533 P_marker[i] = counter;
5534 jj_row_begining = counter;
5535 counter++;
5536 for (jj1 = ir[i]; jj1 < ir[i + 1]; ++jj1) {
5537 i1 = jr[jj1];
5538 for (jj2 = ia[i1]; jj2 < ia[i1 + 1]; ++jj2) {
5539 i2 = ja[jj2];
5540 if (A_marker[i2] != i) {
5541 A_marker[i2] = i;
5542 for (jj3 = ip[i2]; jj3 < ip[i2 + 1]; ++jj3) {
5543 i3 = jp[jj3];
5544 if (P_marker[i3] < jj_row_begining) {
5545 P_marker[i3] = counter;
5546 counter++;
5547 }
5548 }
5549 }
5550 }
5551 }
5552 RAP_temp[i + myid] = jj_row_begining;
5553 }
5554 RAP_temp[myend + myid] = counter;
5555 part_end[myid] = myend + myid + 1;
5556 }
5557 fasp_iarray_cp(part_end[0], RAP_temp, iac);
5558 counter = part_end[0];
5559 Ctemp = 0;
5560 for (i1 = 1; i1 < nthreads; i1++) {
5561 Ctemp += RAP_temp[part_end[i1 - 1] - 1];
5562 for (jj1 = part_end[i1 - 1] + 1; jj1 < part_end[i1]; jj1++) {
5563 iac[counter] = RAP_temp[jj1] + Ctemp;
5564 counter++;
5565 }
5566 }
5567 } else {
5568#endif
5569 counter = 0;
5570 for (i = 0; i < row; ++i) {
5571 Ps_marker[i] = counter;
5572 jj_row_begining = counter;
5573 counter++;
5574
5575 for (jj1 = ir[i]; jj1 < ir[i + 1]; ++jj1) {
5576 i1 = jr[jj1];
5577 for (jj2 = ia[i1]; jj2 < ia[i1 + 1]; ++jj2) {
5578 i2 = ja[jj2];
5579 if (As_marker[i2] != i) {
5580 As_marker[i2] = i;
5581 for (jj3 = ip[i2]; jj3 < ip[i2 + 1]; ++jj3) {
5582 i3 = jp[jj3];
5583 if (Ps_marker[i3] < jj_row_begining) {
5584 Ps_marker[i3] = counter;
5585 counter++;
5586 }
5587 }
5588 }
5589 }
5590 }
5591 iac[i] = jj_row_begining;
5592 }
5593#ifdef _OPENMP
5594 }
5595#endif
5596
5597 iac[row] = counter;
5598
5599 jac = (INT*)fasp_mem_calloc(iac[row], sizeof(INT));
5600
5601 acj = (REAL*)fasp_mem_calloc(iac[row] * nb2, sizeof(REAL));
5602
5603 fasp_iarray_set(minus_one_length, Ps_marker, -1);
5604
5605 /*------------------------------------------------------*
5606 * Second Pass: compute entries of B=R*A*P *
5607 *------------------------------------------------------*/
5608#ifdef _OPENMP
5609 if (n_coarse > OPENMP_HOLDS) {
5610#pragma omp parallel for private(myid, mybegin, myend, Ctemp, P_marker, A_marker, \
5611 counter, i, jj_row_begining, jj1, i1, jj2, i2, \
5612 jj3, i3, smat_tmp)
5613 for (myid = 0; myid < nthreads; myid++) {
5614 fasp_get_start_end(myid, nthreads, n_coarse, &mybegin, &myend);
5615 P_marker = Ps_marker + myid * n_coarse;
5616 A_marker = As_marker + myid * n_fine;
5617 smat_tmp = tmp + myid * 2 * nb2;
5618 counter = iac[mybegin];
5619 for (i = mybegin; i < myend; ++i) {
5620 P_marker[i] = counter;
5621 jj_row_begining = counter;
5622 jac[counter] = i;
5623 fasp_darray_set(nb2, &acj[counter * nb2], 0x0);
5624 counter++;
5625
5626 for (jj1 = ir[i]; jj1 < ir[i + 1]; ++jj1) {
5627 i1 = jr[jj1];
5628 for (jj2 = ia[i1]; jj2 < ia[i1 + 1]; ++jj2) {
5629 fasp_blas_smat_mul(&rj[jj1 * nb2], &aj[jj2 * nb2], smat_tmp,
5630 nb);
5631 i2 = ja[jj2];
5632 if (A_marker[i2] != i) {
5633 A_marker[i2] = i;
5634 for (jj3 = ip[i2]; jj3 < ip[i2 + 1]; ++jj3) {
5635 i3 = jp[jj3];
5636 fasp_blas_smat_mul(smat_tmp, &pj[jj3 * nb2],
5637 smat_tmp + nb2, nb);
5638 if (P_marker[i3] < jj_row_begining) {
5639 P_marker[i3] = counter;
5640 fasp_darray_cp(nb2, smat_tmp + nb2,
5641 &acj[counter * nb2]);
5642 jac[counter] = i3;
5643 counter++;
5644 } else {
5645 fasp_blas_darray_axpy(nb2, 1.0, smat_tmp + nb2,
5646 &acj[P_marker[i3] * nb2]);
5647 }
5648 }
5649 } else {
5650 for (jj3 = ip[i2]; jj3 < ip[i2 + 1]; jj3++) {
5651 i3 = jp[jj3];
5652 fasp_blas_smat_mul(smat_tmp, &pj[jj3 * nb2],
5653 smat_tmp + nb2, nb);
5654 fasp_blas_darray_axpy(nb2, 1.0, smat_tmp + nb2,
5655 &acj[P_marker[i3] * nb2]);
5656 }
5657 }
5658 }
5659 }
5660 }
5661 }
5662 } else {
5663#endif
5664 counter = 0;
5665 for (i = 0; i < row; ++i) {
5666 Ps_marker[i] = counter;
5667 jj_row_begining = counter;
5668 jac[counter] = i;
5669 fasp_darray_set(nb2, &acj[counter * nb2], 0x0);
5670 counter++;
5671
5672 for (jj1 = ir[i]; jj1 < ir[i + 1]; ++jj1) {
5673 i1 = jr[jj1];
5674 for (jj2 = ia[i1]; jj2 < ia[i1 + 1]; ++jj2) {
5675 fasp_blas_smat_mul(&rj[jj1 * nb2], &aj[jj2 * nb2], tmp, nb);
5676 i2 = ja[jj2];
5677 if (As_marker[i2] != i) {
5678 As_marker[i2] = i;
5679 for (jj3 = ip[i2]; jj3 < ip[i2 + 1]; ++jj3) {
5680 i3 = jp[jj3];
5681 fasp_blas_smat_mul(tmp, &pj[jj3 * nb2], tmp + nb2, nb);
5682 if (Ps_marker[i3] < jj_row_begining) {
5683 Ps_marker[i3] = counter;
5684 fasp_darray_cp(nb2, tmp + nb2, &acj[counter * nb2]);
5685 jac[counter] = i3;
5686 counter++;
5687 } else {
5688 fasp_blas_darray_axpy(nb2, 1.0, tmp + nb2,
5689 &acj[Ps_marker[i3] * nb2]);
5690 }
5691 }
5692 } else {
5693 for (jj3 = ip[i2]; jj3 < ip[i2 + 1]; jj3++) {
5694 i3 = jp[jj3];
5695 fasp_blas_smat_mul(tmp, &pj[jj3 * nb2], tmp + nb2, nb);
5696 fasp_blas_darray_axpy(nb2, 1.0, tmp + nb2,
5697 &acj[Ps_marker[i3] * nb2]);
5698 }
5699 }
5700 }
5701 }
5702 }
5703#ifdef _OPENMP
5704 }
5705#endif
5706 // setup coarse matrix B
5707 B->ROW = row;
5708 B->COL = col;
5709 B->IA = iac;
5710 B->JA = jac;
5711 B->val = acj;
5712 B->NNZ = B->IA[B->ROW] - B->IA[0];
5713 B->nb = A->nb;
5715
5716 fasp_mem_free(Ps_marker);
5717 Ps_marker = NULL;
5718 fasp_mem_free(tmp);
5719 tmp = NULL;
5720}
5721
5740 const dBSRmat* A,
5741 const dBSRmat* P,
5742 dBSRmat* B)
5743{
5744 const INT row = R->ROW, col = P->COL, nb2 = A->nb * A->nb;
5745
5746 const REAL* aj = A->val;
5747 const INT * ir = R->IA, *ia = A->IA, *ip = P->IA;
5748 const INT * jr = R->JA, *ja = A->JA, *jp = P->JA;
5749
5750 INT * iac, *jac;
5751 REAL* acj;
5752 INT* Ps_marker = NULL;
5753 INT* As_marker = NULL;
5754
5755#ifdef _OPENMP
5756 INT* P_marker = NULL;
5757 INT* A_marker = NULL;
5758#endif
5759
5760 INT i, i1, i2, i3, jj1, jj2, jj3;
5761 INT counter, jj_row_begining;
5762 INT RAP_nnz; // for OpenMP, Li Zhao, 2023.06.17
5763
5764 INT nthreads = 1;
5765
5766#ifdef _OPENMP
5767 INT myid, mybegin, myend, Ctemp;
5768 nthreads = fasp_get_num_threads();
5769#endif
5770
5771 INT n_coarse = row;
5772 INT n_fine = A->ROW;
5773 INT coarse_mul_nthreads = n_coarse * nthreads;
5774 INT fine_mul_nthreads = n_fine * nthreads;
5775 INT coarse_add_nthreads = n_coarse + nthreads;
5776 INT minus_one_length = coarse_mul_nthreads + fine_mul_nthreads;
5777 INT total_calloc = minus_one_length + coarse_add_nthreads + nthreads;
5778
5779 Ps_marker = (INT*)fasp_mem_calloc(total_calloc, sizeof(INT));
5780 As_marker = Ps_marker + coarse_mul_nthreads;
5781
5782 /*------------------------------------------------------*
5783 * First Pass: Determine size of B and set up B_i *
5784 *------------------------------------------------------*/
5785 iac = (INT*)fasp_mem_calloc(n_coarse + 1, sizeof(INT));
5786
5787 fasp_iarray_set(minus_one_length, Ps_marker, -1);
5788
5789#ifdef _OPENMP
5790 INT* RAP_temp = As_marker + fine_mul_nthreads;
5791 INT* part_end = RAP_temp + coarse_add_nthreads;
5792
5793 if (n_coarse > OPENMP_HOLDS) {
5794#pragma omp parallel for private(myid, mybegin, myend, Ctemp, P_marker, A_marker, \
5795 counter, i, jj_row_begining, jj1, i1, jj2, i2, \
5796 jj3, i3)
5797 for (myid = 0; myid < nthreads; myid++) {
5798 fasp_get_start_end(myid, nthreads, n_coarse, &mybegin, &myend);
5799 P_marker = Ps_marker + myid * n_coarse;
5800 A_marker = As_marker + myid * n_fine;
5801 counter = 0;
5802 for (i = mybegin; i < myend; ++i) {
5803 P_marker[i] = counter;
5804 jj_row_begining = counter;
5805 counter++;
5806 for (jj1 = ir[i]; jj1 < ir[i + 1]; ++jj1) {
5807 i1 = jr[jj1];
5808 for (jj2 = ia[i1]; jj2 < ia[i1 + 1]; ++jj2) {
5809 i2 = ja[jj2];
5810 if (A_marker[i2] != i) {
5811 A_marker[i2] = i;
5812 for (jj3 = ip[i2]; jj3 < ip[i2 + 1]; ++jj3) {
5813 i3 = jp[jj3];
5814 if (P_marker[i3] < jj_row_begining) {
5815 P_marker[i3] = counter;
5816 counter++;
5817 }
5818 }
5819 }
5820 }
5821 }
5822 RAP_temp[i + myid] = jj_row_begining;
5823 }
5824 RAP_temp[myend + myid] = counter;
5825 part_end[myid] = myend + myid + 1;
5826 }
5827 fasp_iarray_cp(part_end[0], RAP_temp, iac);
5828 counter = part_end[0];
5829 Ctemp = 0;
5830 for (i1 = 1; i1 < nthreads; i1++) {
5831 Ctemp += RAP_temp[part_end[i1 - 1] - 1];
5832 for (jj1 = part_end[i1 - 1] + 1; jj1 < part_end[i1]; jj1++) {
5833 iac[counter] = RAP_temp[jj1] + Ctemp;
5834 counter++;
5835 }
5836 }
5837 RAP_nnz = iac[row]; // for OpenMP, Li Zhao, 2023.06.17
5838 } else {
5839#endif
5840 counter = 0;
5841 for (i = 0; i < row; ++i) {
5842 Ps_marker[i] = counter;
5843 jj_row_begining = counter;
5844 counter++;
5845
5846 for (jj1 = ir[i]; jj1 < ir[i + 1]; ++jj1) {
5847 i1 = jr[jj1];
5848 for (jj2 = ia[i1]; jj2 < ia[i1 + 1]; ++jj2) {
5849 i2 = ja[jj2];
5850 if (As_marker[i2] != i) {
5851 As_marker[i2] = i;
5852 for (jj3 = ip[i2]; jj3 < ip[i2 + 1]; ++jj3) {
5853 i3 = jp[jj3];
5854 if (Ps_marker[i3] < jj_row_begining) {
5855 Ps_marker[i3] = counter;
5856 counter++;
5857 }
5858 }
5859 }
5860 }
5861 }
5862 iac[i] = jj_row_begining;
5863 }
5864
5865 iac[row] = counter;
5866 RAP_nnz = counter; // for OpenMP, Li Zhao, 2023.06.17
5867#ifdef _OPENMP
5868 }
5869#endif
5870
5871 // iac[row] = counter; // this case only for sequetial
5872
5873 jac = (INT*)fasp_mem_calloc(RAP_nnz, sizeof(INT));
5874
5875 acj = (REAL*)fasp_mem_calloc(RAP_nnz * nb2, sizeof(REAL));
5876
5877 fasp_iarray_set(minus_one_length, Ps_marker, -1);
5878
5879 /*------------------------------------------------------*
5880 * Second Pass: compute entries of B=R*A*P *
5881 *------------------------------------------------------*/
5882#ifdef _OPENMP
5883 if (n_coarse > OPENMP_HOLDS) {
5884#pragma omp parallel for private(myid, mybegin, myend, P_marker, A_marker, counter, i, \
5885 jj_row_begining, jj1, i1, jj2, i2, jj3, i3)
5886 for (myid = 0; myid < nthreads; myid++) {
5887 fasp_get_start_end(myid, nthreads, n_coarse, &mybegin, &myend);
5888 P_marker = Ps_marker + myid * n_coarse;
5889 A_marker = As_marker + myid * n_fine;
5890 counter = iac[mybegin];
5891 for (i = mybegin; i < myend; ++i) {
5892 P_marker[i] = counter;
5893 jj_row_begining = counter;
5894 jac[counter] = i;
5895 fasp_darray_set(nb2, &acj[counter * nb2], 0x0);
5896 // printf("i=%d nb2=%d counter=%d s\n", i, nb2, counter); // zhaoli
5897 counter++;
5898
5899 for (jj1 = ir[i]; jj1 < ir[i + 1]; ++jj1) {
5900 i1 = jr[jj1];
5901 for (jj2 = ia[i1]; jj2 < ia[i1 + 1]; ++jj2) {
5902
5903 i2 = ja[jj2];
5904 if (A_marker[i2] != i) {
5905 A_marker[i2] = i;
5906 for (jj3 = ip[i2]; jj3 < ip[i2 + 1]; ++jj3) {
5907 i3 = jp[jj3];
5908
5909 if (P_marker[i3] < jj_row_begining) {
5910 P_marker[i3] = counter;
5911 fasp_darray_cp(nb2, &aj[jj2 * nb2],
5912 &acj[counter * nb2]);
5913 jac[counter] = i3;
5914 counter++;
5915 } else {
5916 fasp_blas_darray_axpy(nb2, 1.0, &aj[jj2 * nb2],
5917 &acj[P_marker[i3] * nb2]);
5918 }
5919 }
5920 } else {
5921 for (jj3 = ip[i2]; jj3 < ip[i2 + 1]; jj3++) {
5922 i3 = jp[jj3];
5923 fasp_blas_darray_axpy(nb2, 1.0, &aj[jj2 * nb2],
5924 &acj[P_marker[i3] * nb2]);
5925 }
5926 }
5927 }
5928 }
5929 }
5930 }
5931 } else {
5932#endif
5933 counter = 0;
5934 for (i = 0; i < row; ++i) {
5935 Ps_marker[i] = counter;
5936 jj_row_begining = counter;
5937 jac[counter] = i;
5938 fasp_darray_set(nb2, &acj[counter * nb2], 0x0);
5939 counter++;
5940
5941 for (jj1 = ir[i]; jj1 < ir[i + 1]; ++jj1) {
5942 i1 = jr[jj1];
5943 for (jj2 = ia[i1]; jj2 < ia[i1 + 1]; ++jj2) {
5944
5945 i2 = ja[jj2];
5946 if (As_marker[i2] != i) {
5947 As_marker[i2] = i;
5948 for (jj3 = ip[i2]; jj3 < ip[i2 + 1]; ++jj3) {
5949 i3 = jp[jj3];
5950 if (Ps_marker[i3] < jj_row_begining) {
5951 Ps_marker[i3] = counter;
5952 fasp_darray_cp(nb2, &aj[jj2 * nb2],
5953 &acj[counter * nb2]);
5954 jac[counter] = i3;
5955 counter++;
5956 } else {
5957 fasp_blas_darray_axpy(nb2, 1.0, &aj[jj2 * nb2],
5958 &acj[Ps_marker[i3] * nb2]);
5959 }
5960 }
5961 } else {
5962 for (jj3 = ip[i2]; jj3 < ip[i2 + 1]; jj3++) {
5963 i3 = jp[jj3];
5964 fasp_blas_darray_axpy(nb2, 1.0, &aj[jj2 * nb2],
5965 &acj[Ps_marker[i3] * nb2]);
5966 }
5967 }
5968 }
5969 }
5970 }
5971#ifdef _OPENMP
5972 }
5973#endif
5974
5975 // setup coarse matrix B
5976 B->ROW = row;
5977 B->COL = col;
5978 B->IA = iac;
5979 B->JA = jac;
5980 B->val = acj;
5981 B->NNZ = B->IA[B->ROW] - B->IA[0];
5982 B->nb = A->nb;
5984
5985 fasp_mem_free(Ps_marker);
5986 Ps_marker = NULL;
5987}
5988
5989/*---------------------------------*/
5990/*-- End of File --*/
5991/*---------------------------------*/
void fasp_darray_set(const INT n, REAL *x, const REAL val)
Set initial value for an array to be x=val.
Definition: AuxArray.c:41
void fasp_iarray_set(const INT n, INT *x, const INT val)
Set initial value for an array to be x=val.
Definition: AuxArray.c:98
void fasp_darray_cp(const INT n, const REAL *x, REAL *y)
Copy an array to the other y=x.
Definition: AuxArray.c:210
void fasp_iarray_cp(const INT n, const INT *x, INT *y)
Copy an array to the other y=x.
Definition: AuxArray.c:227
void fasp_mem_free(void *mem)
Free up previous allocated memory body and set pointer to NULL.
Definition: AuxMemory.c:152
void * fasp_mem_realloc(void *oldmem, const LONGLONG tsize)
Reallocate, initiate, and check memory.
Definition: AuxMemory.c:113
void * fasp_mem_calloc(const unsigned int size, const unsigned int type)
Allocate, initiate, and check memory.
Definition: AuxMemory.c:65
void fasp_chkerr(const SHORT status, const char *fctname)
Check error status and print out error messages before quit.
Definition: AuxMessage.c:213
void fasp_get_start_end(const INT procid, const INT nprocs, const INT n, INT *start, INT *end)
Assign Load to each thread.
Definition: AuxThreads.c:93
void fasp_dvec_free(dvector *u)
Free vector data space of REAL type.
Definition: AuxVector.c:145
dvector fasp_dvec_create(const INT m)
Create dvector data space of REAL type.
Definition: AuxVector.c:62
void fasp_blas_darray_ax(const INT n, const REAL a, REAL *x)
x = a*x
Definition: BlaArray.c:43
void fasp_blas_darray_axpy(const INT n, const REAL a, const REAL *x, REAL *y)
y = a*x + y
Definition: BlaArray.c:90
void fasp_blas_smat_ypAx(const REAL *A, const REAL *x, REAL *y, const INT n)
Compute y := y + Ax, where 'A' is a n*n dense matrix.
Definition: BlaSmallMat.c:779
void fasp_blas_smat_ypAx_nc3(const REAL *A, const REAL *x, REAL *y)
Compute y := y + Ax, where 'A' is a 3*3 dense matrix.
Definition: BlaSmallMat.c:673
void fasp_blas_smat_ypAx_nc2(const REAL *A, const REAL *x, REAL *y)
Compute y := y + Ax, where 'A' is a 2*2 dense matrix.
Definition: BlaSmallMat.c:651
void fasp_blas_smat_mul(const REAL *a, const REAL *b, REAL *c, const INT n)
Compute the matrix product of two small full matrices a and b, stored in c.
Definition: BlaSmallMat.c:596
void fasp_blas_smat_add(const REAL *a, const REAL *b, const INT n, const REAL alpha, const REAL beta, REAL *c)
Compute c = alpha*a + beta*b.
Definition: BlaSmallMat.c:86
void fasp_blas_smat_axm1(REAL *a, const INT n, const REAL alpha, REAL *b)
Compute b = alpha*a (in place)
Definition: BlaSmallMat.c:60
void fasp_blas_smat_ypAx_nc5(const REAL *A, const REAL *x, REAL *y)
Compute y := y + Ax, where 'A' is a 5*5 dense matrix.
Definition: BlaSmallMat.c:718
void fasp_blas_smat_ypAx_nc7(const REAL *A, const REAL *x, REAL *y)
Compute y := y + Ax, where 'A' is a 7*7 dense matrix.
Definition: BlaSmallMat.c:742
void fasp_dbsr_alloc(const INT ROW, const INT COL, const INT NNZ, const INT nb, const INT storage_manner, dBSRmat *A)
Allocate memory space for BSR format sparse matrix.
Definition: BlaSparseBSR.c:96
void fasp_blas_dbsr_rap1(const dBSRmat *R, const dBSRmat *A, const dBSRmat *P, dBSRmat *B)
dBSRmat sparse matrix multiplication B=R*A*P
Definition: BlaSpmvBSR.c:5255
void fasp_blas_dbsr_axm(dBSRmat *A, const REAL alpha)
Multiply a sparse matrix A in BSR format by a scalar alpha.
Definition: BlaSpmvBSR.c:215
void fasp_blas_dbsr_mxm(const dBSRmat *A, const dBSRmat *B, dBSRmat *C)
Sparse matrix multiplication C=A*B.
Definition: BlaSpmvBSR.c:4883
void fasp_blas_dbsr_rap_agg(const dBSRmat *R, const dBSRmat *A, const dBSRmat *P, dBSRmat *B)
dBSRmat sparse matrix multiplication B=R*A*P, where small block matrices in P and R are identity matr...
Definition: BlaSpmvBSR.c:5739
void fasp_blas_dbsr_aAxpby(const REAL alpha, dBSRmat *A, REAL *x, const REAL beta, REAL *y)
Compute y := alpha*A*x + beta*y.
Definition: BlaSpmvBSR.c:243
void fasp_blas_dbsr_mxv_agg(const dBSRmat *A, const REAL *x, REAL *y)
Compute y := A*x, where each small block matrices of A is an identity.
Definition: BlaSpmvBSR.c:2815
void fasp_blas_dbsr_mxm2(const dBSRmat *A, const dBSRmat *B, dBSRmat *C)
Sparse matrix multiplication C=A*B.
Definition: BlaSpmvBSR.c:4760
void fasp_blas_dbsr_mxm_adb(const dBSRmat *A, dvector *D, const dBSRmat *B, dBSRmat *C)
Sparse matrix multiplication C=A*D*B, where D is diagnal matrix.
Definition: BlaSpmvBSR.c:4997
void fasp_blas_dbsr_aAxpy(const REAL alpha, const dBSRmat *A, const REAL *x, REAL *y)
Compute y := alpha*A*x + y.
Definition: BlaSpmvBSR.c:514
void fasp_blas_dbsr_mxv(const dBSRmat *A, const REAL *x, REAL *y)
Compute y := A*x.
Definition: BlaSpmvBSR.c:1055
SHORT fasp_blas_dbsr_add(const dBSRmat *A, const REAL alpha, const dBSRmat *B, const REAL beta, dBSRmat *C)
compute C = alpha*A + beta*B in BSR format
Definition: BlaSpmvBSR.c:45
void fasp_blas_dbsr_aAxpy_agg(const REAL alpha, const dBSRmat *A, const REAL *x, REAL *y)
Compute y := alpha*A*x + y where each small block matrix is an identity matrix.
Definition: BlaSpmvBSR.c:778
void fasp_blas_dbsr_rap(const dBSRmat *R, const dBSRmat *A, const dBSRmat *P, dBSRmat *B)
dBSRmat sparse matrix multiplication B=R*A*P
Definition: BlaSpmvBSR.c:5466
Main header file for the FASP project.
#define REAL
Definition: fasp.h:75
#define SHORT
FASP integer and floating point numbers.
Definition: fasp.h:71
#define INT
Definition: fasp.h:72
#define FASP_SUCCESS
Definition of return status and error messages.
Definition: fasp_const.h:19
#define OPENMP_HOLDS
Definition: fasp_const.h:269
#define TRUE
Definition of logic type.
Definition: fasp_const.h:61
#define FALSE
Definition: fasp_const.h:62
#define ERROR_MAT_SIZE
Definition: fasp_const.h:26
Block sparse row storage matrix of REAL type.
Definition: fasp_block.h:34
INT COL
number of cols of sub-blocks in matrix A, N
Definition: fasp_block.h:40
INT NNZ
number of nonzero sub-blocks in matrix A, NNZ
Definition: fasp_block.h:43
REAL * val
Definition: fasp_block.h:57
INT nb
dimension of each sub-block
Definition: fasp_block.h:46
INT * IA
integer array of row pointers, the size is ROW+1
Definition: fasp_block.h:60
INT ROW
number of rows of sub-blocks in matrix A, M
Definition: fasp_block.h:37
INT * JA
Definition: fasp_block.h:64
INT storage_manner
storage manner for each sub-block
Definition: fasp_block.h:49
Vector with n entries of REAL type.
Definition: fasp.h:354
REAL * val
actual vector entries
Definition: fasp.h:360
INT row
number of rows
Definition: fasp.h:357