// Program 12.6 Release 1.3 on 30.08.2006
// The Parallel Gauss-Seidel Method (wavefront scheme, checkerboard decomposition)
  bool IsPrintEnabled;
  double *u;

int main(int argc, char *argv[])
{
  if (argc < 2)
  {
    printf("USAGE: GZ_ParWB.exe N\n");
    return 1;
  }
  if (argc == 2)
    IsPrintEnabled = 0;
  else
    IsPrintEnabled = atoi(argv[2]);

  int N = atoi(argv[1]);

  GZ_Par(N);  

  return 0;
}

int GZ_Par(int N)
{
  int i, j, ib, jb, nx;
  clock_t begin,end;  
  
  int NB; // block size
  
  double temp;
  double dmax;
  double * dm = NULL; 
  double d; 
  int Step;
  omp_lock_t dmax_lock;
  omp_init_lock(&dmax_lock);
  
  printf("The Gauss-Seidel Method (wavefront scheme, checkerboard decomposition). OpenMP Parallel version.\r\n");
  
  NB = (N - 2) / BS;   // get the first blocks' size
  
  if ((u = CreateMatrix(N)) == NULL)
    return 1;
  if (dm)
    delete [] dm;
  dm = new double [NB];
  
  Step = 0;
  
  printf("Matrix size: %d Block size %d, and Block number %d\r\n", N, BS, NB);
  begin = clock();
  
  do
  {
    dmax = 0;
    for (int idm = 0; idm < NB; idm++)
      dm[idm] = 0;
     // the first half of second diagonal (chess)
     // nb - count of current computing blocks
    for (nx = 1; nx <= NB; nx++)
    {
      //printf("Diagonal (first half) #%d\r\n", nx);
      // i, j - coors of curr block
#pragma omp parallel for shared(u, nx, dm) private(i, j, temp, d)
      for (i = 0; i < nx; i++)
      {
        //printf("First half of iteration on #%d thread of %d\r\n", omp_get_thread_num(), omp_get_num_threads());
        j = nx - i - 1;
        // ib, jb - coors of elem in each block
        int ii, jj;
        int ioffs = i * BS + 1;
        int joffs = j * BS + 1;
        // ii, jj - abs coors curr elem in matrix
        for (ib = 0; ib < BS; ib++)
        {
          for (jb = 0; jb < BS; jb++)
          {
            ii = ioffs + ib;
            jj = joffs + jb;
            temp = u[N * ii + jj];
            u[N * ii + jj] = 0.25 * (u[N * ii + jj + 1] + u[N * ii + jj - 1] + 
              u[N * (ii + 1) + jj] + u[N * (ii - 1) + jj]);
            d = fabs(u[N * ii + jj] - temp);
            if (dm[i] < d)
              dm[i] = d;
          }// for jb
        }// for ib
      }//for i
    }// for nx
    // the last half of second diagonal
    for (nx = NB - 1; nx > 0; nx--)
    {
      //printf("Diagonal (second half) #%d\r\n", nx);
      // i, j - coors of curr block
#pragma omp parallel for shared(u, nx, dm) private(i, j, temp, d)
      for (i = NB - nx; i < NB; i++)
      {
        //printf("Second half of iteration on #%d thread of %d\r\n", omp_get_thread_num(), omp_get_num_threads());
        j = NB - (i + nx) + NB - 1;
        // ib, jb - coors of elem in each block
        int ii, jj;
        int ioffs = i * BS + 1;
        int joffs = j * BS + 1;
        // ii, jj - abs coors curr elem in matrix
        for (ib = 0; ib < BS; ib++)
        {
          for (jb = 0; jb < BS; jb++)
          {
            ii = ioffs + ib;
            jj = joffs + jb;
            temp = u[N * ii + jj];
            u[N * ii + jj] = 0.25 * (u[N * ii + jj + 1] + u[N * ii + jj - 1] + 
              u[N * (ii + 1) + jj] + u[N * (ii - 1) + jj]);
            d = fabs(u[N * ii + jj] - temp);
            if (dm[i] < d)
              dm[i] = d;
          }// for jb
        }// for ib
      }//for i
    }// for nx
    
    //Looking for min element
    // printf("Looking for min element (%f, %f, %f, %f, %f, %f)->%f\r\n",dm[0],dm[1],dm[2],dm[3],dm[4],dm[5],dmax);
    for (i = 0; i < NB; i++ ) 
    {
      if (dmax < dm[i]) 
        dmax = dm[i];
    } 
    Step++;
  }
  while (dmax > EPS);
  
  end = clock();
  
  long tck = end - begin;
  
  printf("Results:\r\n- current eps = %f\r\n- required eps = %f\r\n- matrix Size = %d\r\n- iterations = %d\r\n- time = %f\r\n",
    dmax, EPS, N, Step, ((double)tck)/CLOCKS_PER_SEC);
  
  if (IsPrintEnabled)
    PrintMatrix(u, N);
  
  DeleteMatrix(u);
  
  return 0;
}