// kompute_tests/test2/main.cpp

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <memory>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <kompute/Kompute.hpp>
#include <vulkan/vulkan_handles.hpp>

#include <unistd.h>

// Side length of the square matrices built in main(): each one is
// MSIZE x MSIZE floats.
#define MSIZE 128

/**
 * Compile GLSL compute shader source to SPIR-V by shelling out to
 * glslangValidator.
 *
 * The source is written to tmp_kp_shader.comp in the current working
 * directory, glslangValidator is run on it, and the resulting
 * tmp_kp_shader.comp.spv is read back. Both temporary files are left in place.
 *
 * @param source GLSL source text of the shader.
 * @return The compiled module as a sequence of 32-bit words.
 * @throws std::runtime_error if a temporary file cannot be written or read,
 *         if glslangValidator exits with a non-zero status, or if the
 *         produced file is empty or not a whole number of 32-bit words.
 */
static std::vector<uint32_t> compile_shader(const std::string &source)
{
	{
		std::ofstream fileOut("tmp_kp_shader.comp");
		if (!fileOut.is_open()) {
			throw std::runtime_error(
			    "Could not open tmp_kp_shader.comp for writing"
			);
		}
		fileOut << source;
		fileOut.close();
		// close() flushes; a failed flush means the compiler would see a
		// truncated shader
		if (fileOut.fail()) {
			throw std::runtime_error("Could not write tmp_kp_shader.comp");
		}
	}

	if (std::system("glslangValidator -V tmp_kp_shader.comp -o "
			"tmp_kp_shader.comp.spv")) {
		throw std::runtime_error("Error running glslangValidator command");
	}

	std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary);
	if (!fileStream.is_open()) {
		throw std::runtime_error(
		    "Could not open tmp_kp_shader.comp.spv for reading"
		);
	}
	const std::vector<char> buffer(
	    (std::istreambuf_iterator<char>(fileStream)),
	    std::istreambuf_iterator<char>()
	);

	// the result is handed out as 32-bit words, so a byte count that is not
	// a multiple of 4 (or is zero) cannot be a usable module
	if (buffer.empty() || buffer.size() % sizeof(uint32_t) != 0) {
		throw std::runtime_error(
		    "tmp_kp_shader.comp.spv is empty or not a multiple of 4 bytes"
		);
	}

	// copy the bytes instead of casting the char buffer to uint32_t*, which
	// would rely on the buffer's alignment and break strict aliasing
	std::vector<uint32_t> spirv(buffer.size() / sizeof(uint32_t));
	std::memcpy(spirv.data(), buffer.data(), buffer.size());
	return spirv;
}

// Read the entire file at `path` into a string.
// If the file cannot be opened, the placeholder text "// bad code" is
// returned instead of signalling an error.
static std::string shader_to_string(const char *path)
{
	std::ifstream source_file(path);

	// guard clause: hand back the placeholder when there is nothing to read
	if (!source_file.is_open()) {
		return "// bad code";
	}

	// slurp every character up to end-of-file
	return std::string(
	    std::istreambuf_iterator<char>(source_file),
	    std::istreambuf_iterator<char>()
	);
}

/**
 * Replace every occurrence of `needle` in `str` with the textual form of
 * `val` (as produced by std::to_string).
 *
 * The string is scanned once from left to right and the search resumes after
 * each inserted replacement, so the call terminates even when the replacement
 * text itself contains `needle`.
 *
 * @param needle Placeholder text to look for; a null or empty needle matches
 *               nothing.
 * @param val    Value whose std::to_string() form is substituted.
 * @param str    Input text (taken by value and modified locally).
 * @return       The text with all placeholders substituted.
 */
template <typename T>
std::string replacewith(const char *needle, T val, std::string str)
{
	// an empty pattern would match at every position forever
	if (needle == nullptr || *needle == '\0') {
		return str;
	}

	const std::string pattern(needle);
	const std::string replacement = std::to_string(val);

	// continue searching past the text just inserted instead of restarting
	// from the beginning of the string
	for (size_t pos = str.find(pattern); pos != std::string::npos;
	     pos	= str.find(pattern, pos + replacement.size())) {
		str.replace(pos, pattern.size(), replacement);
	}
	return str;
}

// compute C = A*B on the GPU
//
// Builds an identity matrix A and a data matrix B on the host, wraps them and
// an output matrix C in Kompute tensors, substitutes the workgroup-size
// placeholders in shader.comp, compiles the result, dispatches it and prints
// the contents of tensor C.
int main()
{
	// create the kompute manager
	kp::Manager mgr;
	// matrices are on the stack, this breaks for large MSIZE (1024)
	float matrixA[MSIZE][MSIZE] = {0};
	float matrixB[MSIZE][MSIZE] = {0};
	float matrixC[MSIZE][MSIZE] = {0};
	// fill an identity matrix
	for (int y = 0; y < MSIZE; y++) {
		matrixA[y][y] = 1.0;
	}
	// fill a matrix with data
	for (int y = 0; y < MSIZE; y++) {
		for (int x = 0; x < MSIZE; x++) {
			matrixB[y][x] = x * 0.74 - y * 0.22;
		}
	}

	// create the tensors, tensors are just arrays, in the shader we will have
	// to describe how it translates to matrices
	kp::Tensor::TensorDataTypes dtype = kp::Tensor::TensorDataTypes::eFloat;

	// auto keeps the tensor handle type returned by the manager out of the
	// way
	auto tensorA = mgr.tensor(matrixA, MSIZE * MSIZE, sizeof(float), dtype);
	auto tensorB = mgr.tensor(matrixB, MSIZE * MSIZE, sizeof(float), dtype);
	auto tensorC = mgr.tensor(matrixC, MSIZE * MSIZE, sizeof(float), dtype);

	const std::vector<std::shared_ptr<kp::Tensor>> params = {
	    tensorA, tensorB, tensorC};

	// workgroup, dispatch a 2D array of workgroups (2D matrices)
	// TODO: determine the size of the workgroups by doing some calls to vk
	// NOTE(review): 32x32 workgroups of 32x32 threads is 1024x1024
	// invocations, while the matrices are MSIZE x MSIZE -- confirm that
	// shader.comp bounds-checks its indices or shrink the dispatch
	const int wgrp_x = 32, wgrp_y = 32;
	// this should call vkCmdDispatch(x, y, z)
	kp::Workgroup workgroup({wgrp_x, wgrp_y, 1});

	// threads (xyz) per workgroup, substituted into the shader source below
	const int lcsize_x = 32;
	const int lcsize_y = 32;
	const int lcsize_z = 1;

	// get the shader code into a string
	const char *shader_path = "shader.comp";
	std::string shader_str	= shader_to_string(shader_path);

	// substitute the value for the number of threads (xyz) per workgroup since
	// it has to be a compile-time constant
	shader_str = replacewith<int>("__lcsize_x__", lcsize_x, shader_str);
	shader_str = replacewith<int>("__lcsize_y__", lcsize_y, shader_str);
	shader_str = replacewith<int>("__lcsize_z__", lcsize_z, shader_str);

	// echo the substituted shader source (debugging aid)
	printf("%s\n", shader_str.c_str());

	// compile the substituted source; re-reading shader.comp here would throw
	// away the placeholder substitution done above
	const std::vector<uint32_t> shader = compile_shader(shader_str);

	std::shared_ptr<kp::Algorithm> algo =
	    mgr.algorithm(params, shader, workgroup, {MSIZE});

	// upload the tensors, run the shader, download the tensors
	mgr.sequence()
	    ->record<kp::OpTensorSyncDevice>(params)
	    ->record<kp::OpAlgoDispatch>(algo)
	    ->record<kp::OpTensorSyncLocal>(params)
	    ->eval();

	// print the resulting matrix
	std::cout << "Output: {  ";
	for (const float &elem : tensorC->vector<float>()) {
		printf("%.2f, ", elem);
	}
	std::cout << "}" << std::endl;

	return 0;
}