@@ -1,7 +1,9 @@
#include <chrono>
#include <fstream>
#include <iostream>
#include <iterator>
#include <memory>
#include <numeric>
#include <regex>
#include <sstream>
#include <vector>
@@ -11,7 +13,7 @@
#include <unistd.h>

#define MSIZE 128
#define MSIZE 64

static std::vector<uint32_t> compile_shader(const std::string &source)
{
@@ -64,6 +66,12 @@ int main()
{
    // create the Kompute manager
    kp::Manager mgr;

    // timestampPeriod is the number of nanoseconds required for a timestamp
    // query to be incremented by 1.
    auto device_properties = mgr.getDeviceProperties();
    float device_timescale = device_properties.limits.timestampPeriod;
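    // e.g. with a timestampPeriod of 1.0, a raw timestamp delta of 1500 ticks
    // corresponds to 1500 ns (1.5 us)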

    // the matrices live on the stack; this overflows the stack for large MSIZE (e.g. 1024)
    float matrixA[MSIZE][MSIZE] = {0};
    float matrixB[MSIZE][MSIZE] = {0};
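    // at MSIZE = 1024 each matrix is 1024 * 1024 * 4 B = 4 MiB, so these two
    // arrays alone already take 8 MiB, a typical default stack size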
@@ -71,13 +79,16 @@
    // fill diagonal matrices: A is the identity, B is twice the identity
    for (int y = 0; y < MSIZE; y++) {
        matrixA[y][y] = 1.0;
        matrixB[y][y] = 2.0;
    }
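    // with these inputs A = I and B = 2*I, so if shader.comp computes the
    // matrix product the expected tensorC is 2.0 on the diagonal and 0.0
    // everywhere else, which makes the printed output easy to check by eye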

    // alternative: fill matrixB with non-trivial data (currently disabled)
    /*
    for (int y = 0; y < MSIZE; y++) {
        for (int x = 0; x < MSIZE; x++) {
            matrixB[y][x] = x * 0.74 - y * 0.22;
        }
    }
    */

    // create the tensors; tensors are just flat arrays, so the shader has to
    // describe how they map onto matrices
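    // e.g. element (y, x) of an MSIZE x MSIZE matrix sits at flat index
    // y * MSIZE + x (row-major), the same mapping used below when printing
    // tensorC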
@@ -93,7 +104,12 @@
    // workgroups: dispatch a 2D array of workgroups, since we work on 2D matrices
    // TODO: determine the workgroup size by querying the Vulkan device limits
    const int wgrp_x = 32, wgrp_y = 32;
    const int lcsize_x = 32;
    const int lcsize_y = 32;
    const int lcsize_z = 1;
    const int wgrp_x = std::max(MSIZE / lcsize_x, 1);
    const int wgrp_y = std::max(MSIZE / lcsize_y, 1);

    // this should call vkCmdDispatch(x, y, z)
    kp::Workgroup workgroup({wgrp_x, wgrp_y, 1});
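    // e.g. with MSIZE = 64 and a 32x32 local size this dispatches
    // max(64 / 32, 1) = 2 x 2 workgroups, i.e. 64 x 64 invocations in total
    // (presumably one per output element); note that the integer division
    // truncates when MSIZE is not a multiple of the local size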
@@ -103,31 +119,58 @@ int main()
    // substitute the number of threads (x, y, z) per workgroup into the shader
    // source, since the local size has to be a compile-time constant
    shader_str = replacewith<int>("__lcsize_x__", 32, shader_str);
    shader_str = replacewith<int>("__lcsize_y__", 32, shader_str);
    shader_str = replacewith<int>("__lcsize_z__", 1, shader_str);
    shader_str = replacewith<int>("__lcsize_x__", lcsize_x, shader_str);
    shader_str = replacewith<int>("__lcsize_y__", lcsize_y, shader_str);
    shader_str = replacewith<int>("__lcsize_z__", lcsize_z, shader_str);
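    // "replacewith" is not defined in this hunk; a minimal sketch, assuming it
    // only substitutes every occurrence of a placeholder token, could be:
    //   template <typename T>
    //   static std::string replacewith(const std::string &token, T value,
    //                                  std::string str)
    //   {
    //       std::ostringstream oss;
    //       oss << value;
    //       return std::regex_replace(str, std::regex(token), oss.str());
    //   }
    // in shader.comp the placeholders presumably sit in the local-size
    // declaration, e.g.
    //   layout(local_size_x = __lcsize_x__, local_size_y = __lcsize_y__,
    //          local_size_z = __lcsize_z__) in;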

    printf("%s\n", shader_str.c_str());
    return 0;

    const std::vector<uint32_t> shader =
        compile_shader(shader_to_string("shader.comp"));
    // compile the shader
    const std::vector<uint32_t> shader = compile_shader(shader_str);

    // prepare the algorithm with the shader, parameters, workgroups to dispatch
    // and a specialization constant to specify the size of each tensor
    std::shared_ptr<kp::Algorithm> algo =
        mgr.algorithm(params, shader, workgroup, {MSIZE});
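    // the {MSIZE} specialization constant is presumably consumed in the shader
    // via a declaration like layout(constant_id = 0) const uint tensor_size = 1;
    // (the identifier here is only illustrative)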

    mgr.sequence()
        ->record<kp::OpTensorSyncDevice>(params)
    // start a timer to measure CPU (host) time
    auto start = std::chrono::high_resolution_clock::now();

    // evaluate the sequence of operations synchronously on queue index 0,
    // attaching a maximum of 10 timestamps
    std::shared_ptr<kp::Sequence> sq;
    sq = mgr.sequence(0, 10);
    sq->rerecord();
    sq->record<kp::OpTensorSyncDevice>(params)
        ->record<kp::OpAlgoDispatch>(algo)
        ->record<kp::OpTensorSyncLocal>(params)
        ->eval();

    // stop the host timer and get the device (GPU) timestamps
    auto end = std::chrono::high_resolution_clock::now();
    auto total_time =
        std::chrono::duration_cast<std::chrono::microseconds>(end - start)
            .count();
    std::vector<std::uint64_t> timestamps = sq->getTimestamps();
    std::adjacent_difference(
        timestamps.begin(), timestamps.end(), timestamps.begin()
    );
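    // after adjacent_difference, timestamps[0] still holds the first absolute
    // timestamp and every later element holds the tick delta of one recorded
    // operation, which is why the print loop below starts at std::next(...)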

    // print all the timing information
    printf("device timescale: %f\n", device_timescale);
    printf("cpu time: %ldus\ndevice times: ", total_time);
    for (auto i = std::next(timestamps.begin()); i < timestamps.end(); i++) {
        float op_us = (float)(*i * device_timescale) / 1000;
        printf("%.2fus ", op_us);
    }
    printf("\n");

    // print the resulting matrix
    std::cout << "Output: { ";
    for (const float &elem : tensorC->vector<float>()) {
        printf("%.2f, ", elem);
    for (int y = 0; y < MSIZE; y++) {
        for (int x = 0; x < MSIZE; x++) {
            float elem = tensorC->vector<float>().at(y * MSIZE + x);
            printf("%.1f ", elem);
        }
        printf("\n");
    }
    std::cout << "}" << std::endl;

    return 0;
}