#include <iostream>
#include <stdlib.h>
#include <assert.h>
using namespace std;
typedef double FLOAT;

static void NaiveMatrixMultiply(FLOAT *A, FLOAT *B, FLOAT *C, int const n) {
   for(int i = 0; i < n; i++)
    for(int j = 0; j < n; j++)
     for(int k = 0; k < n; k++)
      C[i * n + j] += A[i * n + k] * B[k * n + j];
}


static void BlockedMatrixMultiply(FLOAT *A, FLOAT *B, FLOAT *C, int const n) {
   #pragma isat tuning name(tune_mm) scope(M1_begin, M1_end) measure(M2_begin, M2_end) variable(blk_k, range($L1_CACHE_LINESIZE, 16*$L1_CACHE_LINESIZE, $L1_CACHE_LINESIZE/2)) variable(blk_j, range($L1_CACHE_LINESIZE, 16*$L1_CACHE_LINESIZE, $L1_CACHE_LINESIZE/2)) variable(blk_i, range($L1_CACHE_LINESIZE, 16*$L1_CACHE_LINESIZE, $L1_CACHE_LINESIZE/2)) // add "search(dependent)" at the end of this pragma if you want to search the variables dependently // add "search(dependent)" at the end of this pragma if you want to search the variables dependently // add "search(dependent)" at the end of this pragma if you want to search the variables dependently // add "search(dependent)" at the end of this pragma if you want to search the variables dependently
   #pragma isat marker M1_begin
   int const blk_i = 64;
   int const blk_j = 64;
   int const blk_k = 64;
   for(int i = 0; i < n; i += blk_i)
    for(int j = 0; j < n; j += blk_j)
     for(int k = 0; k < n; k += blk_k)
      for(int ii = i; ii < (((i + blk_i) < (n)) ? (i + blk_i) : (n)); ii++)
       for(int jj = j; jj < (((j + blk_j) < (n)) ? (j + blk_j) : (n)); jj++)
        for(int kk = k; kk < (((k + blk_k) < (n)) ? (k + blk_k) : (n)); kk++)
         C[ii * n + jj] = C[ii * n + jj] + A[ii * n + kk] * B[kk * n + jj];
   #pragma isat marker M1_end
}


static void Initialize(FLOAT *X, int const n, FLOAT const v) {
   for(int i = 0; i < n; i++)
    for(int j = 0; j < n; j++)
     X[i * n + j] = v;
}


static FLOAT Sum(FLOAT *X, int const n) {
   FLOAT s = 0;
   for(int i = 0; i < n; i++)
    for(int j = 0; j < n; j++)
     s += X[i * n + j];
   
   return s;
}


int main(int argc, char **argv) {
   FLOAT *A = new FLOAT[1000 * 1000];
   FLOAT *B = new FLOAT[1000 * 1000];
   //FLOAT* C_naive = new FLOAT[N * N];
   //assert(C_naive);
   FLOAT *C_blocked = new FLOAT[1000 * 1000];
   Initialize(A, 1000, 1);
   Initialize(B, 1000, 2);
   //Initialize(C_naive, N, 0);
   Initialize(C_blocked, 1000, 0);
   #pragma isat marker M2_begin
   BlockedMatrixMultiply(A, B, C_blocked, 1000);
   #pragma isat marker M2_end
   FLOAT const s_blocked = Sum(C_blocked, 1000);
   //NaiveMatrixMultiply(A, B, C_naive, N);
   //const FLOAT s_naive = Sum(C_naive, N);
   //cout << "Sum-naive = " << s_naive << endl;
   cout << "Sum-blocked = " << s_blocked << endl;
}
