|
|
|
#include <chrono>
|
|
|
|
#include <fstream>
|
|
|
|
#include <iostream>
|
|
|
|
#include <iterator>
|
|
|
|
#include <memory>
|
|
|
|
#include <numeric>
|
|
|
|
#include <regex>
|
|
|
|
#include <sstream>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include <kompute/Kompute.hpp>
|
|
|
|
#include <vulkan/vulkan_handles.hpp>
|
|
|
|
|
|
|
|
#include <unistd.h>
|
|
|
|
|
|
|
|
#define MSIZE 32
|
|
|
|
|
|
|
|
// Compile a GLSL compute-shader source string to SPIR-V words.
//
// Writes `source` to a temporary file, shells out to glslangValidator to
// compile it, then reads the resulting .spv binary back in.
//
// @param source  GLSL compute-shader source code.
// @return        SPIR-V binary as a vector of 32-bit words.
// @throws std::runtime_error if the compiler fails, its output cannot be
//         opened, or the output size is not a whole number of words.
static std::vector<uint32_t> compile_shader(const std::string &source)
{
    std::ofstream fileOut("tmp_kp_shader.comp");
    fileOut << source;
    fileOut.close();

    if (system("glslangValidator -V tmp_kp_shader.comp -o "
               "tmp_kp_shader.comp.spv")) {
        throw std::runtime_error("Error running glslangValidator command");
    }

    // ios::ate opens at end-of-file so tellg() gives us the size directly.
    std::ifstream fileStream("tmp_kp_shader.comp.spv",
                             std::ios::binary | std::ios::ate);
    if (!fileStream.is_open()) {
        throw std::runtime_error("Error opening compiled shader output");
    }

    // SPIR-V is a stream of 32-bit words; any size that is not a multiple
    // of 4 means the file is truncated or not SPIR-V at all.
    const std::streamsize size = fileStream.tellg();
    if (size <= 0 ||
        size % static_cast<std::streamsize>(sizeof(uint32_t)) != 0) {
        throw std::runtime_error("Invalid SPIR-V output size");
    }

    // Read straight into uint32_t storage: avoids the unaligned/aliasing
    // pointer casts of reading into a char buffer and reinterpreting it.
    std::vector<uint32_t> spirv(size / sizeof(uint32_t));
    fileStream.seekg(0);
    fileStream.read(reinterpret_cast<char *>(spirv.data()), size);
    return spirv;
}
|
|
|
|
|
|
|
|
// Load the whole file at `path` into a string.
//
// @param path  filesystem path to the shader source file.
// @return      the file contents, or the placeholder "// bad code" when the
//              file cannot be opened (keeps callers on a non-crashing path).
static std::string shader_to_string(const char *path)
{
    std::ifstream source_file(path);

    if (!source_file.is_open()) {
        return std::string("// bad code");
    }

    std::ostringstream contents;
    contents << source_file.rdbuf();
    return contents.str();
}
|
|
|
|
|
|
|
|
// Replace every occurrence of `needle` in `str` with the decimal rendering
// of `val` (via std::to_string).
//
// @param needle  placeholder substring to substitute (e.g. "__lcsize_x__").
// @param val     numeric value whose std::to_string form is inserted.
// @param str     haystack, taken by value and returned modified.
// @return        the string with all placeholders substituted.
template <typename T>
std::string replacewith(const char *needle, T val, std::string str)
{
    const std::string replacement = std::to_string(val);
    // Using std::string for the needle avoids strlen(), which required a
    // <cstring> include this file never had.
    const std::string pattern(needle);

    // Guard: find("") matches at every position and would never terminate.
    if (pattern.empty()) {
        return str;
    }

    // Resume the search just past each insertion: a replacement that happens
    // to contain the needle can no longer cause an infinite loop, and the
    // scan is a single left-to-right pass instead of restarting at 0.
    size_t pos = 0;
    while ((pos = str.find(pattern, pos)) != std::string::npos) {
        str.replace(pos, pattern.size(), replacement);
        pos += replacement.size();
    }
    return str;
}
|
|
|
|
|
|
|
|
// Dump a row-major w-by-h float matrix to stdout, one row per line, each
// value formatted with one decimal digit and a trailing space.
void print_matrix(float m[], size_t w, size_t h)
{
    for (size_t row = 0; row < h; ++row) {
        const float *line = m + row * w;
        for (size_t col = 0; col < w; ++col) {
            printf("%.1f ", line[col]);
        }
        printf("\n");
    }
}
|
|
|
|
|
|
|
|
// Fill the row-major w-by-h matrix `m` with the identity: 1.0 on the main
// diagonal, 0.0 everywhere else.
//
// Writing every cell (instead of only the diagonal) makes the result an
// identity regardless of the buffer's prior contents, and keeps all indices
// inside the w*h allocation even for non-square shapes — the diagonal-only
// version indexed past the end whenever h > w.
//
// @param m  pointer to w*h floats, overwritten in full.
// @param w  matrix width (columns).
// @param h  matrix height (rows).
void fill_identity(float m[], size_t w, size_t h)
{
    for (size_t y = 0; y < h; y++) {
        for (size_t x = 0; x < w; x++) {
            m[y * w + x] = (x == y) ? 1.0f : 0.0f;
        }
    }
}
|
|
|
|
|
|
|
|
// Fill the row-major w-by-h matrix with a deterministic, position-dependent
// pattern (value = col * 0.74 - row * 0.22) so later GPU results can be
// eyeballed against predictable inputs.
void fill_garbage(float m[], size_t w, size_t h)
{
    size_t idx = 0;
    for (size_t row = 0; row < h; ++row) {
        for (size_t col = 0; col < w; ++col) {
            m[idx++] = col * 0.74 - row * 0.22;
        }
    }
}
|
|
|
|
|
|
|
|
// compute C = A*B on the GPU
//
// Demo driver: fills two MSIZE x MSIZE host matrices, uploads them as
// Kompute tensors, dispatches a compute shader loaded from "shader.comp",
// then prints host-side and device-side timings plus all three matrices.
int main()
{
    // create the kompute manager (owns the Vulkan instance/device/queues)
    kp::Manager mgr;

    // timestampPeriod is the number of nanoseconds required for a timestamp
    // query to be incremented by 1.
    auto device_proprieties = mgr.getDeviceProperties();
    float device_timescale = device_proprieties.limits.timestampPeriod;

    // matrices are on the stack, this breaks for large MSIZE (1024)
    float matrixA[MSIZE][MSIZE] = {0};
    float matrixB[MSIZE][MSIZE] = {0};
    float matrixC[MSIZE][MSIZE] = {0};

    // A gets a deterministic pattern; B stays all zeros except B[0][0] = 1,
    // which makes the expected product C easy to eyeball in the printout.
    fill_garbage((float *)matrixA, MSIZE, MSIZE);
    matrixB[0][0] = 1.0;

    // create the tensors, tensors are just arrays, in the shader we will have
    // to describe how it translates to matrices
    kp::Tensor::TensorDataTypes dtype = kp::Tensor::TensorDataTypes::eFloat;

    // each tensor copies MSIZE*MSIZE floats from the corresponding host array
    auto tensorA = mgr.tensor(matrixA, MSIZE * MSIZE, sizeof(float), dtype);
    auto tensorB = mgr.tensor(matrixB, MSIZE * MSIZE, sizeof(float), dtype);
    auto tensorC = mgr.tensor(matrixC, MSIZE * MSIZE, sizeof(float), dtype);

    // NOTE(review): the order here presumably must match the shader's
    // binding order — confirm against shader.comp
    const std::vector<std::shared_ptr<kp::Tensor>> params = {
        tensorA, tensorB, tensorC};

    // workgroup, dispatch a 2D array of workgroups (2D matrices)
    // TODO: determine the size of the workgroups by doing some calls to vk
    const int lcsize_x = 32;
    const int lcsize_y = 32;
    const int lcsize_z = 1;
    // max(..., 1) keeps at least one workgroup when MSIZE < lcsize
    const int wgrp_x = std::max(MSIZE / lcsize_x, 1);
    const int wgrp_y = std::max(MSIZE / lcsize_y, 1);

    // this should call vkCmdDispatch(x, y, z)
    kp::Workgroup workgroup({wgrp_x, wgrp_y, 1});

    // get the shader code into a string
    const char *shader_path = "shader.comp";
    std::string shader_str = shader_to_string(shader_path);

    // substitute the value for the number of threads (xyz) per workgroup since
    // it has to be a compile-time constant
    shader_str = replacewith<int>("__lcsize_x__", lcsize_x, shader_str);
    shader_str = replacewith<int>("__lcsize_y__", lcsize_y, shader_str);
    shader_str = replacewith<int>("__lcsize_z__", lcsize_z, shader_str);

    // compile the shader (shells out to glslangValidator)
    const std::vector<uint32_t> shader = compile_shader(shader_str);

    // prepare the algorithm with shader, parameters, workgroups to dispatch
    // and a specialization constant to specify the size of each tensor
    std::shared_ptr<kp::Algorithm> algo =
        mgr.algorithm(params, shader, workgroup, {MSIZE});

    // start a timer to measure CPU (host) time
    auto start = std::chrono::high_resolution_clock::now();

    // evaluate the sequence of events synchronously on queue index 0 and
    // attaching a maximum of 10 timestamps
    std::shared_ptr<kp::Sequence> sq;
    sq = mgr.sequence(0, 10);
    sq->rerecord();
    // upload inputs, run the shader, download results — one synchronous submit
    sq->record<kp::OpTensorSyncDevice>(params)
        ->record<kp::OpAlgoDispatch>(algo)
        ->record<kp::OpTensorSyncLocal>(params)
        ->eval();

    // stop all the timers and get the device (GPU) timestamps
    auto end = std::chrono::high_resolution_clock::now();
    auto total_time =
        std::chrono::duration_cast<std::chrono::microseconds>(end - start)
            .count();
    std::vector<std::uint64_t> timestamps = sq->getTimestamps();
    // turn the absolute device timestamps into per-operation deltas, in place
    std::adjacent_difference(
        timestamps.begin(), timestamps.end(), timestamps.begin()
    );

    // print all the timing information; device ticks are converted to
    // microseconds via timestampPeriod (ns per tick) / 1000
    printf("device timescale: %f\n", device_timescale);
    printf("cpu time: %ldus\ndevice times: ", total_time);
    // skip the first element: adjacent_difference leaves the raw first
    // timestamp there, which is not a delta
    for (auto i = std::next(timestamps.begin()); i < timestamps.end(); i++) {
        float op_us = (float)(*i * device_timescale) / 1000;
        printf("%.2fus ", op_us);
    }
    printf("\n");

    // print the matrices as read back from the device-side tensors
    printf("matrixA:\n");
    print_matrix(&tensorA->vector<float>()[0], MSIZE, MSIZE);
    printf("matrixB:\n");
    print_matrix(&tensorB->vector<float>()[0], MSIZE, MSIZE);
    printf("matrixC:\n");
    print_matrix(&tensorC->vector<float>()[0], MSIZE, MSIZE);

    return 0;
}
|