#include #include #include #include #include #include #include #include #include #include #include #include #define MSIZE 32 static std::vector compile_shader(const std::string &source) { std::ofstream fileOut("tmp_kp_shader.comp"); fileOut << source; fileOut.close(); if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o " "tmp_kp_shader.comp.spv") .c_str())) { throw std::runtime_error("Error running glslangValidator command"); } std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary); std::vector buffer; buffer.insert( buffer.begin(), std::istreambuf_iterator(fileStream), {} ); return { (uint32_t *)buffer.data(), (uint32_t *)(buffer.data() + buffer.size())}; } static std::string shader_to_string(const char *path) { std::ifstream comp_file; comp_file.open(path); if (comp_file.is_open() == false) { return std::string("// bad code"); } std::ostringstream outstr; outstr << comp_file.rdbuf(); return outstr.str(); } template std::string replacewith(const char *needle, T val, std::string str) { std::string replace = std::to_string(val); size_t len = strlen(needle); for (size_t pos = 0; (pos = str.find(needle)) != std::string::npos;) { str.replace(pos, len, replace); } return str; } void print_matrix(float m[], size_t w, size_t h) { for (size_t y = 0; y < h; y++) { for (size_t x = 0; x < w; x++) { printf("%.1f ", m[y * w + x]); } printf("\n"); } } void fill_identity(float m[], size_t w, size_t h) { for (size_t y = 0; y < h; y++) { m[y * w + y] = 1.0; } } void fill_garbage(float m[], size_t w, size_t h) { for (size_t y = 0; y < h; y++) { for (size_t x = 0; x < w; x++) { m[y * w + x] = x * 0.74 - y * 0.22; } } } // compute C = A*B on the GPU int main() { // create the kompute manager kp::Manager mgr; // timestampPeriod is the number of nanoseconds required for a timestamp // query to be incremented by 1. auto device_proprieties = mgr.getDeviceProperties(); float device_timescale = device_proprieties.limits.timestampPeriod; // matrices are on the stack, this breaks for large MSIZE (1024) float matrixA[MSIZE][MSIZE] = {0}; float matrixB[MSIZE][MSIZE] = {0}; float matrixC[MSIZE][MSIZE] = {0}; fill_garbage((float *)matrixA, MSIZE, MSIZE); matrixB[0][0] = 1.0; // create the tensors, tensors are just arrays, in the shader we will have // to describe how it translates to matrices kp::Tensor::TensorDataTypes dtype = kp::Tensor::TensorDataTypes::eFloat; // auto because fuck C++ auto tensorA = mgr.tensor(matrixA, MSIZE * MSIZE, sizeof(float), dtype); auto tensorB = mgr.tensor(matrixB, MSIZE * MSIZE, sizeof(float), dtype); auto tensorC = mgr.tensor(matrixC, MSIZE * MSIZE, sizeof(float), dtype); const std::vector> params = { tensorA, tensorB, tensorC}; // workgroup, dispatch a 2D array of workgroups (2D matrices) // TODO: determine the size of the workgroups by doing some calls to vk const int lcsize_x = 32; const int lcsize_y = 32; const int lcsize_z = 1; const int wgrp_x = std::max(MSIZE / lcsize_x, 1); const int wgrp_y = std::max(MSIZE / lcsize_y, 1); // this should call vkCmdDispatch(x, y, z) kp::Workgroup workgroup({wgrp_x, wgrp_y, 1}); // get the shader code into a string const char *shader_path = "shader.comp"; std::string shader_str = shader_to_string(shader_path); // substitute the value for the number of threads (xyz) per workgroup since // it has to be a compile-time constant shader_str = replacewith("__lcsize_x__", lcsize_x, shader_str); shader_str = replacewith("__lcsize_y__", lcsize_y, shader_str); shader_str = replacewith("__lcsize_z__", lcsize_z, shader_str); // compile the shader const std::vector shader = compile_shader(shader_str); // prepare the algorithm with shader, parameters, workgroups to dispatch and // a specialization constant constant to specify the size of each tensor std::shared_ptr algo = mgr.algorithm(params, shader, workgroup, {MSIZE}); // start a timer to measure CPU (host) time auto start = std::chrono::high_resolution_clock::now(); // evaluate the sequence of events synchronously on queue index 0 and // attaching a maximum of 10 timestamp std::shared_ptr sq; sq = mgr.sequence(0, 10); sq->rerecord(); sq->record(params) ->record(algo) ->record(params) ->eval(); // stop all the timers and get the device (GPU) timestamps auto end = std::chrono::high_resolution_clock::now(); auto total_time = std::chrono::duration_cast(end - start) .count(); std::vector timestamps = sq->getTimestamps(); std::adjacent_difference( timestamps.begin(), timestamps.end(), timestamps.begin() ); // print all the timing information printf("device timescale: %f\n", device_timescale); printf("cpu time: %ldus\ndevice times: ", total_time); for (auto i = std::next(timestamps.begin()); i < timestamps.end(); i++) { float op_us = (float)(*i * device_timescale) / 1000; printf("%.2fus ", op_us); } printf("\n"); // print the resulting matrix printf("matrixA:\n"); print_matrix(&tensorA->vector()[0], MSIZE, MSIZE); printf("matrixB:\n"); print_matrix(&tensorB->vector()[0], MSIZE, MSIZE); printf("matrixC:\n"); print_matrix(&tensorC->vector()[0], MSIZE, MSIZE); return 0; }