diff --git a/test2/.clang-format b/.clang-format
similarity index 97%
rename from test2/.clang-format
rename to .clang-format
index 1b9315e..ae6c3fd 100644
--- a/test2/.clang-format
+++ b/.clang-format
@@ -22,7 +22,7 @@ AlignConsecutiveDeclarations:
   AcrossEmptyLines: false
   AcrossComments: false
   AlignCompound: true
-  PadOperators: true
+  PadOperators: false
 AlignConsecutiveMacros:
   Enabled: true
diff --git a/test2/Makefile b/test2/Makefile
index 8e8b2ea..80efcdb 100644
--- a/test2/Makefile
+++ b/test2/Makefile
@@ -1,6 +1,9 @@
 CPPFLAGS = -Wall -Wextra -g
 # link kompute as a static library and the rest as dynamic
-LDFLAGS = -L/usr/local/lib -Wl,-Bstatic -lkompute -lkp_logger -Wl,-Bdynamic -lvulkan -lfmt -Wl,--as-needed
+LDFLAGS = -L/usr/local/lib \
+          -Wl,-Bstatic -lkompute -lkp_logger \
+          -Wl,-Bdynamic -lvulkan -lfmt \
+          -Wl,--as-needed
 
 test2: main.cpp
 	g++ ${CPPFLAGS} main.cpp -o test2 ${LDFLAGS}
diff --git a/test2/main.cpp b/test2/main.cpp
index 838e8f4..9d62028 100644
--- a/test2/main.cpp
+++ b/test2/main.cpp
@@ -1,7 +1,9 @@
+#include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -11,7 +13,7 @@
 #include
 
-#define MSIZE 128
+#define MSIZE 64
 
 static std::vector<uint32_t> compile_shader(const std::string &source)
 {
@@ -64,6 +66,12 @@ int main()
 {
     // create the kompute manager
     kp::Manager mgr;
+
+    // timestampPeriod is the number of nanoseconds required for a timestamp
+    // query to be incremented by 1.
+    auto device_properties = mgr.getDeviceProperties();
+    float device_timescale = device_properties.limits.timestampPeriod;
+
     // matrices are on the stack, this breaks for large MSIZE (1024)
     float matrixA[MSIZE][MSIZE] = {0};
     float matrixB[MSIZE][MSIZE] = {0};
@@ -71,13 +79,16 @@
     // fill an identity matrix
     for (int y = 0; y < MSIZE; y++) {
         matrixA[y][y] = 1.0;
+        matrixB[y][y] = 2.0;
     }
     // fill a matrix with data
+    /*
     for (int y = 0; y < MSIZE; y++) {
         for (int x = 0; x < MSIZE; x++) {
             matrixB[y][x] = x * 0.74 - y * 0.22;
         }
     }
+    */
 
     // create the tensors, tensors are just arrays, in the shader we will have
     // to describe how it translates to matrices
@@ -93,7 +104,12 @@
     // workgroup, dispatch a 2D array of workgroups (2D matrices)
     // TODO: determine the size of the workgroups by doing some calls to vk
-    const int wgrp_x = 32, wgrp_y = 32;
+    const int lcsize_x = 32;
+    const int lcsize_y = 32;
+    const int lcsize_z = 1;
+    const int wgrp_x = std::max(MSIZE / lcsize_x, 1);
+    const int wgrp_y = std::max(MSIZE / lcsize_y, 1);
+
     // this should call vkCmdDispatch(x, y, z)
     kp::Workgroup workgroup({wgrp_x, wgrp_y, 1});
@@ -103,31 +119,58 @@
     // substitute the value for the number of threads (xyz) per workgroup since
     // it has to be a compile-time constant
-    shader_str = replacewith("__lcsize_x__", 32, shader_str);
-    shader_str = replacewith("__lcsize_y__", 32, shader_str);
-    shader_str = replacewith("__lcsize_z__", 1, shader_str);
+    shader_str = replacewith("__lcsize_x__", lcsize_x, shader_str);
+    shader_str = replacewith("__lcsize_y__", lcsize_y, shader_str);
+    shader_str = replacewith("__lcsize_z__", lcsize_z, shader_str);
 
-    printf("%s\n", shader_str.c_str());
-    return 0;
-
-    const std::vector<uint32_t> shader =
-        compile_shader(shader_to_string("shader.comp"));
+    // compile the shader
+    const std::vector<uint32_t> shader = compile_shader(shader_str);
 
+    // prepare the algorithm with shader, parameters, workgroups to dispatch and
+    // a specialization constant to specify the size of each tensor
     std::shared_ptr<kp::Algorithm> algo =
         mgr.algorithm(params, shader, workgroup, {MSIZE});
 
-    mgr.sequence()
-        ->record(params)
+    // start a timer to measure CPU (host) time
+    auto start = std::chrono::high_resolution_clock::now();
+
+    // evaluate the sequence of events synchronously on queue index 0 and
+    // attach a maximum of 10 timestamps
+    std::shared_ptr<kp::Sequence> sq;
+    sq = mgr.sequence(0, 10);
+    sq->rerecord();
+    sq->record(params)
         ->record(algo)
         ->record(params)
         ->eval();
 
+    // stop all the timers and get the device (GPU) timestamps
+    auto end = std::chrono::high_resolution_clock::now();
+    auto total_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(end - start)
+            .count();
+    std::vector<std::uint64_t> timestamps = sq->getTimestamps();
+    std::adjacent_difference(
+        timestamps.begin(), timestamps.end(), timestamps.begin()
+    );
+
+    // print all the timing information
+    printf("device timescale: %f\n", device_timescale);
+    printf("cpu time: %ldus\ndevice times: ", total_time);
+    for (auto i = std::next(timestamps.begin()); i < timestamps.end(); i++) {
+        float op_us = (float)(*i * device_timescale) / 1000;
+        printf("%.2fus ", op_us);
+    }
+    printf("\n");
+
     // print the resulting matrix
-    std::cout << "Output: { ";
-    for (const float &elem : tensorC->vector()) {
-        printf("%.2f, ", elem);
+    for (int y = 0; y < MSIZE; y++) {
+        for (int x = 0; x < MSIZE; x++) {
+            float elem = tensorC->vector().at(y * MSIZE + x);
+            printf("%.1f ", elem);
+        }
+        printf("\n");
     }
-    std::cout << "}" << std::endl;
 
     return 0;
 }
diff --git a/test2/shader.comp b/test2/shader.comp
index 3316350..ac006f7 100644
--- a/test2/shader.comp
+++ b/test2/shader.comp
@@ -4,9 +4,9 @@
 // The number of threads spawned per-workgroup, these are substituted by the
 // program pre-compilation
 layout(
-    local_size_x = __lcsize_x__,
-    local_size_y = __lcsize_y__,
-    local_size_z = __lcsize_z__
+    local_size_x = __lcsize_x__,
+    local_size_y = __lcsize_y__,
+    local_size_z = __lcsize_z__
 ) in;
 
 // The buffers are provided via the tensors
@@ -14,11 +14,24 @@ layout(binding = 0) buffer tensorA { float matA[]; };
 layout(binding = 1) buffer tensorB { float matB[]; };
 layout(binding = 2) buffer tensorC { float matC[]; };
 
-// specialization constants
+// specialization constant
 layout(constant_id = 0) const float tensor_size_f = 0;
 
+// each thread calculates just matC[id.y][id.x]
 void main()
 {
-    uint index = gl_GlobalInvocationID.x;
-    o[index] = a[index] * b[index];
+    uint tensor_size_u = uint(tensor_size_f);
+    // thread ID in the workgroup and workgroup ID
+    uvec3 tid = gl_LocalInvocationID;
+    uvec3 gid = gl_WorkGroupID;
+    uvec3 id = gl_GlobalInvocationID;
+
+    // Cyx = sum(k, Ayk * Bkx)
+    float acc = 0;
+    uint y = id.y * tensor_size_u;
+    uint x = id.x;
+    for (uint k = 0; k < tensor_size_u; k++) {
+        acc += matA[y + k] * matB[x + k * tensor_size_u];
+    }
+    matC[y + id.x] = acc;
 }
\ No newline at end of file
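
Note on the replacewith() calls above: the helper itself is defined elsewhere in main.cpp and does not appear in these hunks. Purely as an illustration (the name and signature are inferred from the call sites; the real definition may differ), a placeholder-substitution helper along those lines could look like this:

    // Hypothetical sketch of a replacewith()-style helper: substitute every
    // occurrence of a placeholder such as "__lcsize_x__" with an integer value
    // so that the local_size_* values become compile-time literals in the GLSL
    // source. Not the actual definition from main.cpp.
    #include <string>

    static std::string replacewith(const std::string &placeholder, int value,
                                   std::string source)
    {
        const std::string replacement = std::to_string(value);
        for (size_t pos = source.find(placeholder); pos != std::string::npos;
             pos = source.find(placeholder, pos + replacement.size())) {
            source.replace(pos, placeholder.size(), replacement);
        }
        return source;
    }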
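
On the timing side, sq->getTimestamps() returns raw device ticks; std::adjacent_difference turns consecutive timestamps into per-operation deltas in place, and multiplying by timestampPeriod converts ticks to nanoseconds (divided by 1000 above to print microseconds). A self-contained sketch of that post-processing, with made-up tick values purely for illustration:

    // Standalone sketch of the timestamp post-processing used above: convert
    // raw device ticks into per-operation microseconds. Tick values are made up.
    #include <cstdint>
    #include <cstdio>
    #include <iterator>
    #include <numeric>
    #include <vector>

    int main()
    {
        float timestamp_period_ns = 1.0f; // would come from limits.timestampPeriod
        std::vector<std::uint64_t> ticks = {100, 2500, 7300}; // illustrative values

        // in-place adjacent differences: ticks[i] becomes ticks[i] - ticks[i - 1]
        std::adjacent_difference(ticks.begin(), ticks.end(), ticks.begin());

        // skip the first element: it is the untouched first timestamp, not a delta
        for (auto it = std::next(ticks.begin()); it != ticks.end(); ++it) {
            float op_us = (float)*it * timestamp_period_ns / 1000;
            printf("%.2fus ", op_us);
        }
        printf("\n");
        return 0;
    }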
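
Finally, the shader's flat indexing (matA[y*N + k], matB[x + k*N], matC[y*N + x] with N = tensor_size_u) is plain row-major matrix multiplication, Cyx = sum over k of Ayk * Bkx. A minimal host-side reference, assuming the same MSIZE and the identity / 2*identity inputs set up in main.cpp, can be used to sanity-check the GPU output:

    // Minimal host-side reference multiply mirroring the shader's row-major
    // indexing; a sanity-check sketch, not part of the patch. With A = I and
    // B = 2*I the product should again be 2*I.
    #include <cstdio>
    #include <vector>

    int main()
    {
        const int N = 64; // same as MSIZE

        std::vector<float> A(N * N, 0.0f), B(N * N, 0.0f), C(N * N, 0.0f);
        for (int i = 0; i < N; i++) {
            A[i * N + i] = 1.0f; // identity
            B[i * N + i] = 2.0f; // 2 * identity
        }

        for (int y = 0; y < N; y++) {
            for (int x = 0; x < N; x++) {
                float acc = 0.0f;
                for (int k = 0; k < N; k++) {
                    // matA[y + k] in the shader (y pre-multiplied by N) is A[y*N + k];
                    // matB[x + k * N] is B[k*N + x]
                    acc += A[y * N + k] * B[k * N + x];
                }
                C[y * N + x] = acc;
            }
        }

        printf("C[0][0] = %.1f, C[0][1] = %.1f\n", C[0], C[1]);
        return 0;
    }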