diff --git a/test2/.clang-format b/test2/.clang-format
new file mode 100644
index 0000000..1b9315e
--- /dev/null
+++ b/test2/.clang-format
@@ -0,0 +1,40 @@
+# Linux kernel style formatting, layered on the LLVM base style
+BasedOnStyle: LLVM
+IndentWidth: 8
+UseTab: Always
+
+BreakBeforeBraces: Linux
+AllowShortIfStatementsOnASingleLine: false
+IndentCaseLabels: false
+ColumnLimit: 85
+
+InsertBraces: true
+SortIncludes: Never
+BinPackParameters: false
+BinPackArguments: false
+Cpp11BracedListStyle: true
+SpaceBeforeCpp11BracedList: true
+SeparateDefinitionBlocks: Always
+AlignAfterOpenBracket: BlockIndent
+
+AlignConsecutiveDeclarations:
+  Enabled: true
+  AcrossEmptyLines: false
+  AcrossComments: false
+  AlignCompound: true
+  PadOperators: true
+
+AlignConsecutiveMacros:
+  Enabled: true
+  AcrossEmptyLines: false
+  AcrossComments: true
+
+AlignConsecutiveBitFields:
+  Enabled: true
+  AcrossEmptyLines: false
+  AcrossComments: true
+
+AlignConsecutiveAssignments:
+  Enabled: true
+  AcrossEmptyLines: false
+  AcrossComments: true
diff --git a/test2/.gitignore b/test2/.gitignore
index e37dbbe..9f35b14 100644
--- a/test2/.gitignore
+++ b/test2/.gitignore
@@ -1,3 +1,3 @@
+tmp_*
 *.spv
-*.comp
-test1
+test2
diff --git a/test2/main.cpp b/test2/main.cpp
index 274a3db..838e8f4 100644
--- a/test2/main.cpp
+++ b/test2/main.cpp
@@ -9,22 +9,29 @@
 #include <kompute/Kompute.hpp>
 #include <vulkan/vulkan_handles.hpp>
 
+#include <unistd.h>
 
-static std::vector<uint32_t> compile_shader(const std::string& source)
+constexpr int MSIZE = 128; // matrix dimension: every matrix is MSIZE x MSIZE
+
+// Compile GLSL `source` to SPIR-V by shelling out to glslangValidator and return
+// the binary as 32-bit words; throws std::runtime_error if the command fails.
+static std::vector<uint32_t> compile_shader(const std::string &source)
 {
 	std::ofstream fileOut("tmp_kp_shader.comp");
 	fileOut << source;
 	fileOut.close();
-	if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv").c_str()))
+	if (system("glslangValidator -V tmp_kp_shader.comp -o "
+		   "tmp_kp_shader.comp.spv")) {
 		throw std::runtime_error("Error running glslangValidator command");
-	std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary);
+	}
+	std::ifstream	  spvIn("tmp_kp_shader.comp.spv", std::ios::binary);
 	std::vector<char> buffer;
-	buffer.insert(buffer.begin(), std::istreambuf_iterator<char>(fileStream), {});
-	return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())};
+	buffer.insert(buffer.begin(), std::istreambuf_iterator<char>(spvIn), {});
+	// view the bytes as whole 32-bit words; a trailing partial word is dropped
+	const auto *words = reinterpret_cast<const uint32_t *>(buffer.data());
+	return {words, words + buffer.size() / sizeof(uint32_t)};
 }
 
-
-
 static std::string shader_to_string(const char *path)
 {
 	std::ifstream comp_file;
@@ -39,71 +46,36 @@ static std::string shader_to_string(const char *path)
 	return outstr.str();
 }
 
-
-// the sed replace command as a function
-// 1. /pattern/replace/
-// 2. /pattern//
-static std::string regex_replace(const char *expr_string, std::string str)
+// Return `str` with each occurrence of `needle` replaced by std::to_string(val).
+template <typename T>
+std::string replacewith(const char *needle, T val, std::string str)
 {
-	std::string expr(expr_string);
-	std::string pattern, replace;
-
-	if (expr.size() < 3 || expr[0] != '/' || expr.back() != '/') {
-		return str;
-	} else {
-		// shift 1
-		expr = expr.substr(1, expr.size()-2);
-	}
-	for (size_t pos = 0, nxpos = 0; ; pos = nxpos) {
-		nxpos = expr.find("/", pos);
-		if (nxpos == std::string::npos) {
-			break;
-		}
-		// skip escaped '/'
-		if (nxpos > 0 && expr[nxpos-1] == '\\') {
-			if (nxpos > 1 && expr[nxpos-2] != '\\') {
-				continue;
-			}
-		}
-		pattern = expr.substr(0, nxpos);
-		replace = expr.substr(nxpos+1);
-		break;
-	}
-	if (pattern.empty()) {
-		return str;
-	}
-
-	std::regex reg(pattern);
-	std::string newstr = std::regex_replace(str, reg, replace);
-	return newstr;
-}
-
-
-template <typename T> std::string regex_subst(const char *expr, T val,  std::string str)
-{
-	std::regex reg(expr);
-	std::regex_match();
-	return newstr;
+	const std::string replace = std::to_string(val);
+	const size_t	  len	  = strlen(needle); // 0: nothing to replace
+	for (size_t pos = str.find(needle); len && pos != std::string::npos;
+	     pos = str.find(needle, pos + replace.size())) { // skip inserted text
+		str.replace(pos, len, replace);
+	}
+	return str;
 }
 
-
+// compute C = A*B on the GPU
 int main()
 {
 	// create the kompute manager
 	kp::Manager mgr;
-
-	// C = A*B
-	float matrixA[1024][1024] = {0};
-	float matrixB[1024][1024] = {0};
-	float matrixC[1024][1024] = {0};
+	// matrices are on the stack, this breaks for large MSIZE (1024)
+	float matrixA[MSIZE][MSIZE] = {0};
+	float matrixB[MSIZE][MSIZE] = {0};
+	float matrixC[MSIZE][MSIZE] = {0};
 	// fill an identity matrix
-	for (int y = 0; y < 1024; y++) {
+	for (int y = 0; y < MSIZE; y++) {
 		matrixA[y][y] = 1.0;
 	}
 	// fill a matrix with data
-	for (int y = 0; y < 1024; y++) {
-		for (int x = 0; x < 1024; x++) {
-			matrixB[y][x] = x*0.74 - y*0.22;
+	for (int y = 0; y < MSIZE; y++) {
+		for (int x = 0; x < MSIZE; x++) {
+			matrixB[y][x] = x * 0.74 - y * 0.22;
 		}
 	}
 
@@ -112,13 +84,12 @@ int main()
 	kp::Tensor::TensorDataTypes dtype = kp::Tensor::TensorDataTypes::eFloat;
 
 	// auto because fuck C++
-	auto tensorA = mgr.tensor(matrixA, 1024*1024, sizeof(float), dtype);
-	auto tensorB = mgr.tensor(matrixB, 1024*1024, sizeof(float), dtype);
-	auto tensorC = mgr.tensor(matrixC, 1024*1024, sizeof(float), dtype);
+	auto tensorA = mgr.tensor(matrixA, MSIZE * MSIZE, sizeof(float), dtype);
+	auto tensorB = mgr.tensor(matrixB, MSIZE * MSIZE, sizeof(float), dtype);
+	auto tensorC = mgr.tensor(matrixC, MSIZE * MSIZE, sizeof(float), dtype);
 
 	const std::vector<std::shared_ptr<kp::Tensor>> params = {
-		tensorA, tensorB, tensorC
-	};
+	    tensorA, tensorB, tensorC};
 
 	// workgroup, dispatch a 2D array of workgroups (2D matrices)
 	// TODO: determine the size of the workgroups by doing some calls to vk
@@ -126,30 +97,34 @@ int main()
 	// this should call vkCmdDispatch(x, y, z)
 	kp::Workgroup workgroup({wgrp_x, wgrp_y, 1});
 
-	// substitute the values in the shader
+	// get the shader code into a string
 	const char *shader_path = "shader.comp";
-	std::string shader_str = shader_to_string(shader_path);
-	shader_str = regex_replace("/{lcsize_x}/", shader_str);
-	const std::vector<uint32_t> shader = compile_shader(shader_to_string("shader.comp"));
+	std::string shader_str	= shader_to_string(shader_path);
 
+	// substitute the value for the number of threads (xyz) per workgroup since
+	// it has to be a compile-time constant
+	shader_str = replacewith<int>("__lcsize_x__", 32, shader_str);
+	shader_str = replacewith<int>("__lcsize_y__", 32, shader_str);
+	shader_str = replacewith<int>("__lcsize_z__", 1, shader_str);
 
-	std::shared_ptr<kp::Algorithm> algo = mgr.algorithm(
-		params,
-		shader,
-		workgroup,
-		{1024.0}
-	);
+	printf("%s\n", shader_str.c_str());
+	return 0; // NOTE(review): unconditional; the rest of main() is unreachable
 
-	mgr.sequence()
-		->record<kp::OpTensorSyncDevice>(params)
-		->record<kp::OpAlgoDispatch>(algo)
-		->record<kp::OpTensorSyncLocal>(params)
-		->eval();
+	// compile the substituted source, not the raw template read from disk
+	const std::vector<uint32_t> shader = compile_shader(shader_str);
 
+	std::shared_ptr<kp::Algorithm> algo =
+	    mgr.algorithm(params, shader, workgroup, {MSIZE});
+
+	mgr.sequence()
+	    ->record<kp::OpTensorSyncDevice>(params)
+	    ->record<kp::OpAlgoDispatch>(algo)
+	    ->record<kp::OpTensorSyncLocal>(params)
+	    ->eval();
 
 	// print the resulting matrix
 	std::cout << "Output: {  ";
-	for (const float& elem : tensorC->vector<float>()) {
+	for (const float &elem : tensorC->vector<float>()) {
 		printf("%.2f, ", elem);
 	}
 	std::cout << "}" << std::endl;
diff --git a/test2/shader.comp b/test2/shader.comp
new file mode 100644
index 0000000..3316350
--- /dev/null
+++ b/test2/shader.comp
@@ -0,0 +1,24 @@
+#version 450
+// clang-format off
+
+// The number of threads spawned per workgroup; the host program substitutes
+// these placeholders before the shader is compiled
+layout(
+    local_size_x = __lcsize_x__,
+    local_size_y = __lcsize_y__,
+    local_size_z = __lcsize_z__
+) in;
+
+// The buffers are provided via the tensors
+layout(binding = 0) buffer tensorA { float matA[]; };
+layout(binding = 1) buffer tensorB { float matB[]; };
+layout(binding = 2) buffer tensorC { float matC[]; };
+
+// specialization constants
+layout(constant_id = 0) const float tensor_size_f = 0;
+
+void main()
+{
+	uint index  = gl_GlobalInvocationID.x;
+	matC[index] = matA[index] * matB[index]; // element-wise product
+}
\ No newline at end of file
diff --git a/test2/test2 b/test2/test2
deleted file mode 100755
index 401d056..0000000
Binary files a/test2/test2 and /dev/null differ