#include #include #include #include #include #include #include #include #include #include #define MSIZE 128 static std::vector compile_shader(const std::string &source) { std::ofstream fileOut("tmp_kp_shader.comp"); fileOut << source; fileOut.close(); if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o " "tmp_kp_shader.comp.spv") .c_str())) { throw std::runtime_error("Error running glslangValidator command"); } std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary); std::vector buffer; buffer.insert( buffer.begin(), std::istreambuf_iterator(fileStream), {} ); return { (uint32_t *)buffer.data(), (uint32_t *)(buffer.data() + buffer.size())}; } static std::string shader_to_string(const char *path) { std::ifstream comp_file; comp_file.open(path); if (comp_file.is_open() == false) { return std::string("// bad code"); } std::ostringstream outstr; outstr << comp_file.rdbuf(); return outstr.str(); } template std::string replacewith(const char *needle, T val, std::string str) { std::string replace = std::to_string(val); size_t len = strlen(needle); for (size_t pos = 0; (pos = str.find(needle)) != std::string::npos;) { str.replace(pos, len, replace); } return str; } // compute C = A*B on the GPU int main() { // create the kompute manager kp::Manager mgr; // matrices are on the stack, this breaks for large MSIZE (1024) float matrixA[MSIZE][MSIZE] = {0}; float matrixB[MSIZE][MSIZE] = {0}; float matrixC[MSIZE][MSIZE] = {0}; // fill an identity matrix for (int y = 0; y < MSIZE; y++) { matrixA[y][y] = 1.0; } // fill a matrix with data for (int y = 0; y < MSIZE; y++) { for (int x = 0; x < MSIZE; x++) { matrixB[y][x] = x * 0.74 - y * 0.22; } } // create the tensors, tensors are just arrays, in the shader we will have // to describe how it translates to matrices kp::Tensor::TensorDataTypes dtype = kp::Tensor::TensorDataTypes::eFloat; // auto because fuck C++ auto tensorA = mgr.tensor(matrixA, MSIZE * MSIZE, sizeof(float), dtype); auto tensorB = mgr.tensor(matrixB, MSIZE * MSIZE, sizeof(float), dtype); auto tensorC = mgr.tensor(matrixC, MSIZE * MSIZE, sizeof(float), dtype); const std::vector> params = { tensorA, tensorB, tensorC}; // workgroup, dispatch a 2D array of workgroups (2D matrices) // TODO: determine the size of the workgroups by doing some calls to vk const int wgrp_x = 32, wgrp_y = 32; // this should call vkCmdDispatch(x, y, z) kp::Workgroup workgroup({wgrp_x, wgrp_y, 1}); // get the shader code into a string const char *shader_path = "shader.comp"; std::string shader_str = shader_to_string(shader_path); // substitute the value for the number of threads (xyz) per workgroup since // it has to be a compile-time constant shader_str = replacewith("__lcsize_x__", 32, shader_str); shader_str = replacewith("__lcsize_y__", 32, shader_str); shader_str = replacewith("__lcsize_z__", 1, shader_str); printf("%s\n", shader_str.c_str()); return 0; const std::vector shader = compile_shader(shader_to_string("shader.comp")); std::shared_ptr algo = mgr.algorithm(params, shader, workgroup, {MSIZE}); mgr.sequence() ->record(params) ->record(algo) ->record(params) ->eval(); // print the resulting matrix std::cout << "Output: { "; for (const float &elem : tensorC->vector()) { printf("%.2f, ", elem); } std::cout << "}" << std::endl; return 0; }