|
|
|
#include <chrono>
|
|
|
|
#include <fstream>
|
|
|
|
#include <iostream>
|
|
|
|
#include <iterator>
|
|
|
|
#include <memory>
|
|
|
|
#include <numeric>
|
|
|
|
#include <regex>
|
|
|
|
#include <sstream>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include <kompute/Kompute.hpp>
|
|
|
|
#include <vulkan/vulkan_handles.hpp>
|
|
|
|
|
|
|
|
#include <unistd.h>
|
|
|
|
|
|
|
|
#define MSIZE 32
|
|
|
|
|
|
|
|
// Compile a GLSL compute-shader source string to SPIR-V words.
//
// Writes `source` to a temporary file, shells out to glslangValidator to
// compile it, then reads the resulting .spv binary back in.
//
// @param source  GLSL compute-shader source code.
// @return        SPIR-V binary as a vector of 32-bit words.
// @throws std::runtime_error if the compiler fails, its output cannot be
//         opened, or the output size is not a whole number of words.
static std::vector<uint32_t> compile_shader(const std::string &source)
{
    std::ofstream fileOut("tmp_kp_shader.comp");
    fileOut << source;
    fileOut.close();

    if (system("glslangValidator -V tmp_kp_shader.comp -o "
               "tmp_kp_shader.comp.spv")) {
        throw std::runtime_error("Error running glslangValidator command");
    }

    // ios::ate opens at end-of-file so tellg() gives us the size directly.
    std::ifstream fileStream("tmp_kp_shader.comp.spv",
                             std::ios::binary | std::ios::ate);
    if (!fileStream.is_open()) {
        throw std::runtime_error("Error opening compiled shader output");
    }

    // SPIR-V is a stream of 32-bit words; any size that is not a multiple
    // of 4 means the file is truncated or not SPIR-V at all.
    const std::streamsize size = fileStream.tellg();
    if (size <= 0 ||
        size % static_cast<std::streamsize>(sizeof(uint32_t)) != 0) {
        throw std::runtime_error("Invalid SPIR-V output size");
    }

    // Read straight into uint32_t storage: avoids the unaligned/aliasing
    // pointer casts of reading into a char buffer and reinterpreting it.
    std::vector<uint32_t> spirv(size / sizeof(uint32_t));
    fileStream.seekg(0);
    fileStream.read(reinterpret_cast<char *>(spirv.data()), size);
    return spirv;
}
|
|
|
|
|
|
|
|
// Load the whole file at `path` into a string.
//
// @param path  filesystem path to the shader source file.
// @return      the file contents, or the placeholder "// bad code" when the
//              file cannot be opened (keeps callers on a non-crashing path).
static std::string shader_to_string(const char *path)
{
    std::ifstream source_file(path);

    if (!source_file.is_open()) {
        return std::string("// bad code");
    }

    std::ostringstream contents;
    contents << source_file.rdbuf();
    return contents.str();
}
|
|
|
|
|
|
|
|
// Replace every occurrence of `needle` in `str` with the decimal rendering
// of `val` (via std::to_string).
//
// @param needle  placeholder substring to substitute (e.g. "__lcsize_x__").
// @param val     numeric value whose std::to_string form is inserted.
// @param str     haystack, taken by value and returned modified.
// @return        the string with all placeholders substituted.
template <typename T>
std::string replacewith(const char *needle, T val, std::string str)
{
    const std::string replacement = std::to_string(val);
    // Using std::string for the needle avoids strlen(), which required a
    // <cstring> include this file never had.
    const std::string pattern(needle);

    // Guard: find("") matches at every position and would never terminate.
    if (pattern.empty()) {
        return str;
    }

    // Resume the search just past each insertion: a replacement that happens
    // to contain the needle can no longer cause an infinite loop, and the
    // scan is a single left-to-right pass instead of restarting at 0.
    size_t pos = 0;
    while ((pos = str.find(pattern, pos)) != std::string::npos) {
        str.replace(pos, pattern.size(), replacement);
        pos += replacement.size();
    }
    return str;
}
|
|
|
|
|
|
|
|
// Dump a row-major w-by-h float matrix to stdout, one row per line, each
// value formatted with one decimal digit and a trailing space.
void print_matrix(float m[], size_t w, size_t h)
{
    for (size_t row = 0; row < h; ++row) {
        const float *line = m + row * w;
        for (size_t col = 0; col < w; ++col) {
            printf("%.1f ", line[col]);
        }
        printf("\n");
    }
}
|
|
|
|
|
|
|
|
// Fill the row-major w-by-h matrix `m` with the identity: 1.0 on the main
// diagonal, 0.0 everywhere else.
//
// Writing every cell (instead of only the diagonal) makes the result an
// identity regardless of the buffer's prior contents, and keeps all indices
// inside the w*h allocation even for non-square shapes — the diagonal-only
// version indexed past the end whenever h > w.
//
// @param m  pointer to w*h floats, overwritten in full.
// @param w  matrix width (columns).
// @param h  matrix height (rows).
void fill_identity(float m[], size_t w, size_t h)
{
    for (size_t y = 0; y < h; y++) {
        for (size_t x = 0; x < w; x++) {
            m[y * w + x] = (x == y) ? 1.0f : 0.0f;
        }
    }
}
|
|
|
|
|
|
|
|
// Fill the row-major w-by-h matrix with a deterministic, position-dependent
// pattern (value = col * 0.74 - row * 0.22) so later GPU results can be
// eyeballed against predictable inputs.
void fill_garbage(float m[], size_t w, size_t h)
{
    size_t idx = 0;
    for (size_t row = 0; row < h; ++row) {
        for (size_t col = 0; col < w; ++col) {
            m[idx++] = col * 0.74 - row * 0.22;
        }
    }
}
|
|
|
|
|
|
|
|
// compute C = A*B on the GPU
//
// Demo driver: fills two MSIZE x MSIZE host matrices, uploads them as
// Kompute tensors, dispatches a compute shader loaded from "shader.comp",
// then prints host-side and device-side timings plus all three matrices.
int main()
{
    // create the kompute manager (owns the Vulkan instance/device/queues)
    kp::Manager mgr;

    // timestampPeriod is the number of nanoseconds required for a timestamp
    // query to be incremented by 1.
    auto device_proprieties = mgr.getDeviceProperties();
    float device_timescale = device_proprieties.limits.timestampPeriod;

    // matrices are on the stack, this breaks for large MSIZE (1024)
    float matrixA[MSIZE][MSIZE] = {0};
    float matrixB[MSIZE][MSIZE] = {0};
    float matrixC[MSIZE][MSIZE] = {0};

    // A gets a deterministic pattern; B stays all zeros except B[0][0] = 1,
    // which makes the expected product C easy to eyeball in the printout.
    fill_garbage((float *)matrixA, MSIZE, MSIZE);
    matrixB[0][0] = 1.0;

    // create the tensors, tensors are just arrays, in the shader we will have
    // to describe how it translates to matrices
    kp::Tensor::TensorDataTypes dtype = kp::Tensor::TensorDataTypes::eFloat;

    // each tensor copies MSIZE*MSIZE floats from the corresponding host array
    auto tensorA = mgr.tensor(matrixA, MSIZE * MSIZE, sizeof(float), dtype);
    auto tensorB = mgr.tensor(matrixB, MSIZE * MSIZE, sizeof(float), dtype);
    auto tensorC = mgr.tensor(matrixC, MSIZE * MSIZE, sizeof(float), dtype);

    // NOTE(review): the order here presumably must match the shader's
    // binding order — confirm against shader.comp
    const std::vector<std::shared_ptr<kp::Tensor>> params = {
        tensorA, tensorB, tensorC};

    // workgroup, dispatch a 2D array of workgroups (2D matrices)
    // TODO: determine the size of the workgroups by doing some calls to vk
    const int lcsize_x = 32;
    const int lcsize_y = 32;
    const int lcsize_z = 1;
    // max(..., 1) keeps at least one workgroup when MSIZE < lcsize
    const int wgrp_x = std::max(MSIZE / lcsize_x, 1);
    const int wgrp_y = std::max(MSIZE / lcsize_y, 1);

    // this should call vkCmdDispatch(x, y, z)
    kp::Workgroup workgroup({wgrp_x, wgrp_y, 1});

    // get the shader code into a string
    const char *shader_path = "shader.comp";
    std::string shader_str = shader_to_string(shader_path);

    // substitute the value for the number of threads (xyz) per workgroup since
    // it has to be a compile-time constant
    shader_str = replacewith<int>("__lcsize_x__", lcsize_x, shader_str);
    shader_str = replacewith<int>("__lcsize_y__", lcsize_y, shader_str);
    shader_str = replacewith<int>("__lcsize_z__", lcsize_z, shader_str);

    // compile the shader (shells out to glslangValidator)
    const std::vector<uint32_t> shader = compile_shader(shader_str);

    // prepare the algorithm with shader, parameters, workgroups to dispatch
    // and a specialization constant to specify the size of each tensor
    std::shared_ptr<kp::Algorithm> algo =
        mgr.algorithm(params, shader, workgroup, {MSIZE});

    // start a timer to measure CPU (host) time
    auto start = std::chrono::high_resolution_clock::now();

    // evaluate the sequence of events synchronously on queue index 0 and
    // attaching a maximum of 10 timestamps
    std::shared_ptr<kp::Sequence> sq;
    sq = mgr.sequence(0, 10);
    sq->rerecord();
    // upload inputs, run the shader, download results — one synchronous submit
    sq->record<kp::OpTensorSyncDevice>(params)
        ->record<kp::OpAlgoDispatch>(algo)
        ->record<kp::OpTensorSyncLocal>(params)
        ->eval();

    // stop all the timers and get the device (GPU) timestamps
    auto end = std::chrono::high_resolution_clock::now();
    auto total_time =
        std::chrono::duration_cast<std::chrono::microseconds>(end - start)
            .count();
    std::vector<std::uint64_t> timestamps = sq->getTimestamps();
    // turn the absolute device timestamps into per-operation deltas, in place
    std::adjacent_difference(
        timestamps.begin(), timestamps.end(), timestamps.begin()
    );

    // print all the timing information; device ticks are converted to
    // microseconds via timestampPeriod (ns per tick) / 1000
    printf("device timescale: %f\n", device_timescale);
    printf("cpu time: %ldus\ndevice times: ", total_time);
    // skip the first element: adjacent_difference leaves the raw first
    // timestamp there, which is not a delta
    for (auto i = std::next(timestamps.begin()); i < timestamps.end(); i++) {
        float op_us = (float)(*i * device_timescale) / 1000;
        printf("%.2fus ", op_us);
    }
    printf("\n");

    // print the matrices as read back from the device-side tensors
    printf("matrixA:\n");
    print_matrix(&tensorA->vector<float>()[0], MSIZE, MSIZE);
    printf("matrixB:\n");
    print_matrix(&tensorB->vector<float>()[0], MSIZE, MSIZE);
    printf("matrixC:\n");
    print_matrix(&tensorC->vector<float>()[0], MSIZE, MSIZE);

    return 0;
}
|