|
|
|
@ -9,22 +9,29 @@ |
|
|
|
|
#include <kompute/Kompute.hpp> |
|
|
|
|
#include <vulkan/vulkan_handles.hpp> |
|
|
|
|
|
|
|
|
|
#include <unistd.h> |
|
|
|
|
|
|
|
|
|
#define MSIZE 128 |
|
|
|
|
|
|
|
|
|
// Compile GLSL compute shader source to a binary by shelling out to
// glslangValidator.
//
// The source text is written to "tmp_kp_shader.comp" in the current working
// directory, glslangValidator is run on it to produce
// "tmp_kp_shader.comp.spv", and that file is read back as 32-bit words.
// Both temporary files are left on disk.
//
// source: the shader source text to compile.
// returns: the contents of the generated .spv file as 32-bit words.
// throws std::runtime_error if the source file cannot be written, if the
//        glslangValidator command returns non-zero, or if the output file
//        cannot be read or its size is not a whole number of 32-bit words.
static std::vector<uint32_t> compile_shader(const std::string &source)
{
    {
        std::ofstream fileOut("tmp_kp_shader.comp");
        fileOut << source;
        fileOut.close();
        // a failed open, write or close all leave the stream in a failed state
        if (!fileOut) {
            throw std::runtime_error("Error writing tmp_kp_shader.comp");
        }
    }

    if (system("glslangValidator -V tmp_kp_shader.comp -o "
               "tmp_kp_shader.comp.spv")) {
        throw std::runtime_error("Error running glslangValidator command");
    }

    // open positioned at the end so tellg() yields the file size
    std::ifstream fileStream("tmp_kp_shader.comp.spv",
                             std::ios::binary | std::ios::ate);
    if (!fileStream) {
        throw std::runtime_error("Error opening tmp_kp_shader.comp.spv");
    }

    const std::streamsize byteCount = fileStream.tellg();
    const std::streamsize wordSize =
        static_cast<std::streamsize>(sizeof(uint32_t));
    if (byteCount < 0 || byteCount % wordSize != 0) {
        throw std::runtime_error(
            "tmp_kp_shader.comp.spv is not a whole number of 32-bit words");
    }

    // read straight into uint32_t storage instead of reinterpreting a char
    // buffer, which avoids the aliasing/alignment problems of casting
    std::vector<uint32_t> words(
        static_cast<std::vector<uint32_t>::size_type>(byteCount / wordSize));
    fileStream.seekg(0);
    if (!fileStream.read(reinterpret_cast<char *>(words.data()), byteCount)) {
        throw std::runtime_error("Error reading tmp_kp_shader.comp.spv");
    }
    return words;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static std::string shader_to_string(const char *path) |
|
|
|
|
{ |
|
|
|
|
std::ifstream comp_file; |
|
|
|
@ -39,70 +46,35 @@ static std::string shader_to_string(const char *path) |
|
|
|
|
return outstr.str(); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// The sed substitute command as a function.
//
// expr_string must have one of the forms
//   1. /pattern/replace/   replace every match of pattern with replace
//   2. /pattern//          delete every match of pattern
// A '/' inside the pattern can be escaped with a backslash ("\/").
// pattern is compiled as a std::regex and the substitution is done by
// std::regex_replace, so that function's replacement-format rules apply to
// the replace text.
//
// str is returned unchanged if expr_string is not delimited by '/' on both
// ends, has no unescaped '/' separating pattern and replacement, or has an
// empty pattern. std::regex may throw std::regex_error for a bad pattern.
static std::string regex_replace(const char *expr_string, std::string str)
{
    std::string expr(expr_string);
    if (expr.size() < 3 || expr.front() != '/' || expr.back() != '/') {
        return str;
    }
    // drop the leading and trailing delimiter
    expr = expr.substr(1, expr.size() - 2);

    std::string pattern, replace;
    for (std::string::size_type from = 0;;) {
        const std::string::size_type slash = expr.find('/', from);
        if (slash == std::string::npos) {
            break;
        }
        // a '/' preceded by an odd number of backslashes is escaped
        std::string::size_type backslashes = 0;
        while (backslashes < slash &&
               expr[slash - 1 - backslashes] == '\\') {
            ++backslashes;
        }
        if (backslashes % 2 == 1) {
            // resume after the escaped '/', otherwise find() would return
            // the same position again and the loop would never end
            from = slash + 1;
            continue;
        }
        pattern = expr.substr(0, slash);
        replace = expr.substr(slash + 1);
        break;
    }
    if (pattern.empty()) {
        return str;
    }

    return std::regex_replace(str, std::regex(pattern), replace);
}

// Replace every occurrence of needle in str with std::to_string(val).
//
// needle: the literal text to search for (not a regex). A null or empty
//         needle leaves str unchanged.
// val:    the value to substitute; T must be accepted by std::to_string.
// str:    the text to search, taken by value.
// returns: the text with all occurrences substituted.
template <typename T>
std::string replacewith(const char *needle, T val, std::string str)
{
    if (needle == nullptr || *needle == '\0') {
        return str;
    }
    const std::string target(needle);
    const std::string replace = std::to_string(val);

    // continue searching after the inserted text so that a replacement which
    // itself contains the needle is not matched again
    for (std::string::size_type pos = str.find(target);
         pos != std::string::npos;
         pos = str.find(target, pos + replace.size())) {
        str.replace(pos, target.size(), replace);
    }
    return str;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// compute C = A*B on the GPU
|
|
|
|
|
int main() |
|
|
|
|
{ |
|
|
|
|
// create the kompute manager
|
|
|
|
|
kp::Manager mgr; |
|
|
|
|
|
|
|
|
|
// C = A*B
|
|
|
|
|
float matrixA[1024][1024] = {0}; |
|
|
|
|
float matrixB[1024][1024] = {0}; |
|
|
|
|
float matrixC[1024][1024] = {0}; |
|
|
|
|
// matrices are on the stack, this breaks for large MSIZE (1024)
|
|
|
|
|
float matrixA[MSIZE][MSIZE] = {0}; |
|
|
|
|
float matrixB[MSIZE][MSIZE] = {0}; |
|
|
|
|
float matrixC[MSIZE][MSIZE] = {0}; |
|
|
|
|
// fill an identity matrix
|
|
|
|
|
for (int y = 0; y < 1024; y++) { |
|
|
|
|
for (int y = 0; y < MSIZE; y++) { |
|
|
|
|
matrixA[y][y] = 1.0; |
|
|
|
|
} |
|
|
|
|
// fill a matrix with data
|
|
|
|
|
for (int y = 0; y < 1024; y++) { |
|
|
|
|
for (int x = 0; x < 1024; x++) { |
|
|
|
|
for (int y = 0; y < MSIZE; y++) { |
|
|
|
|
for (int x = 0; x < MSIZE; x++) { |
|
|
|
|
matrixB[y][x] = x * 0.74 - y * 0.22; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
@ -112,13 +84,12 @@ int main() |
|
|
|
|
kp::Tensor::TensorDataTypes dtype = kp::Tensor::TensorDataTypes::eFloat; |
|
|
|
|
|
|
|
|
|
// use auto so the tensor handle type returned by mgr.tensor() need not be spelled out
|
|
|
|
|
auto tensorA = mgr.tensor(matrixA, 1024*1024, sizeof(float), dtype); |
|
|
|
|
auto tensorB = mgr.tensor(matrixB, 1024*1024, sizeof(float), dtype); |
|
|
|
|
auto tensorC = mgr.tensor(matrixC, 1024*1024, sizeof(float), dtype); |
|
|
|
|
auto tensorA = mgr.tensor(matrixA, MSIZE * MSIZE, sizeof(float), dtype); |
|
|
|
|
auto tensorB = mgr.tensor(matrixB, MSIZE * MSIZE, sizeof(float), dtype); |
|
|
|
|
auto tensorC = mgr.tensor(matrixC, MSIZE * MSIZE, sizeof(float), dtype); |
|
|
|
|
|
|
|
|
|
const std::vector<std::shared_ptr<kp::Tensor>> params = { |
|
|
|
|
tensorA, tensorB, tensorC |
|
|
|
|
}; |
|
|
|
|
tensorA, tensorB, tensorC}; |
|
|
|
|
|
|
|
|
|
// workgroup, dispatch a 2D array of workgroups (2D matrices)
|
|
|
|
|
// TODO: determine the size of the workgroups by doing some calls to vk
|
|
|
|
@ -126,19 +97,24 @@ int main() |
|
|
|
|
// this should call vkCmdDispatch(x, y, z)
|
|
|
|
|
kp::Workgroup workgroup({wgrp_x, wgrp_y, 1}); |
|
|
|
|
|
|
|
|
|
// substitute the values in the shader
|
|
|
|
|
// get the shader code into a string
|
|
|
|
|
const char *shader_path = "shader.comp"; |
|
|
|
|
std::string shader_str = shader_to_string(shader_path); |
|
|
|
|
shader_str = regex_replace("/{lcsize_x}/", shader_str); |
|
|
|
|
const std::vector<uint32_t> shader = compile_shader(shader_to_string("shader.comp")); |
|
|
|
|
|
|
|
|
|
// substitute the value for the number of threads (xyz) per workgroup since
|
|
|
|
|
// it has to be a compile-time constant
|
|
|
|
|
shader_str = replacewith<int>("__lcsize_x__", 32, shader_str); |
|
|
|
|
shader_str = replacewith<int>("__lcsize_y__", 32, shader_str); |
|
|
|
|
shader_str = replacewith<int>("__lcsize_z__", 1, shader_str); |
|
|
|
|
|
|
|
|
|
std::shared_ptr<kp::Algorithm> algo = mgr.algorithm( |
|
|
|
|
params, |
|
|
|
|
shader, |
|
|
|
|
workgroup, |
|
|
|
|
{1024.0} |
|
|
|
|
); |
|
|
|
|
printf("%s\n", shader_str.c_str()); |
|
|
|
|
return 0; |
|
|
|
|
|
|
|
|
|
const std::vector<uint32_t> shader = |
|
|
|
|
compile_shader(shader_to_string("shader.comp")); |
|
|
|
|
|
|
|
|
|
std::shared_ptr<kp::Algorithm> algo = |
|
|
|
|
mgr.algorithm(params, shader, workgroup, {MSIZE}); |
|
|
|
|
|
|
|
|
|
mgr.sequence() |
|
|
|
|
->record<kp::OpTensorSyncDevice>(params) |
|
|
|
@ -146,7 +122,6 @@ int main() |
|
|
|
|
->record<kp::OpTensorSyncLocal>(params) |
|
|
|
|
->eval(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// print the resulting matrix
|
|
|
|
|
std::cout << "Output: { "; |
|
|
|
|
for (const float &elem : tensorC->vector<float>()) { |
|
|
|
|