diff --git a/test2/.clang-format b/.clang-format
similarity index 97%
rename from test2/.clang-format
rename to .clang-format
index 1b9315e..ae6c3fd 100644
--- a/test2/.clang-format
+++ b/.clang-format
@@ -22,7 +22,7 @@ AlignConsecutiveDeclarations:
   AcrossEmptyLines: false
   AcrossComments: false
   AlignCompound: true
-  PadOperators: true
+  PadOperators: false
 AlignConsecutiveMacros:
   Enabled: true
diff --git a/test2/Makefile b/test2/Makefile
index 8e8b2ea..80efcdb 100644
--- a/test2/Makefile
+++ b/test2/Makefile
@@ -1,6 +1,9 @@
 CPPFLAGS = -Wall -Wextra -g
 # link kompute as a static library and the rest as dynamic
-LDFLAGS = -L/usr/local/lib -Wl,-Bstatic -lkompute -lkp_logger -Wl,-Bdynamic -lvulkan -lfmt -Wl,--as-needed
+LDFLAGS = -L/usr/local/lib \
+          -Wl,-Bstatic -lkompute -lkp_logger \
+          -Wl,-Bdynamic -lvulkan -lfmt \
+          -Wl,--as-needed
 
 test2: main.cpp
 	g++ ${CPPFLAGS} main.cpp -o test2 ${LDFLAGS}
diff --git a/test2/main.cpp b/test2/main.cpp
index 838e8f4..9d62028 100644
--- a/test2/main.cpp
+++ b/test2/main.cpp
@@ -1,7 +1,9 @@
+#include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -11,7 +13,7 @@
 #include
 
-#define MSIZE 128
+#define MSIZE 64
 
 static std::vector<uint32_t> compile_shader(const std::string &source)
 {
@@ -64,6 +66,12 @@ int main()
 {
     // create the kompute manager
     kp::Manager mgr;
+
+    // timestampPeriod is the number of nanoseconds required for a timestamp
+    // query to be incremented by 1.
+    auto device_properties = mgr.getDeviceProperties();
+    float device_timescale = device_properties.limits.timestampPeriod;
+
     // matrices are on the stack, this breaks for large MSIZE (1024)
     float matrixA[MSIZE][MSIZE] = {0};
     float matrixB[MSIZE][MSIZE] = {0};
@@ -71,13 +79,16 @@
     // fill an identity matrix
     for (int y = 0; y < MSIZE; y++) {
         matrixA[y][y] = 1.0;
+        matrixB[y][y] = 2.0;
     }
     // fill a matrix with data
+    /*
     for (int y = 0; y < MSIZE; y++) {
         for (int x = 0; x < MSIZE; x++) {
             matrixB[y][x] = x * 0.74 - y * 0.22;
         }
     }
+    */
 
     // create the tensors, tensors are just arrays, in the shader we will have
     // to describe how it translates to matrices
@@ -93,7 +104,12 @@
     // workgroup, dispatch a 2D array of workgroups (2D matrices)
     // TODO: determine the size of the workgroups by doing some calls to vk
-    const int wgrp_x = 32, wgrp_y = 32;
+    const int lcsize_x = 32;
+    const int lcsize_y = 32;
+    const int lcsize_z = 1;
+    const int wgrp_x = std::max(MSIZE / lcsize_x, 1);
+    const int wgrp_y = std::max(MSIZE / lcsize_y, 1);
+
     // this should call vkCmdDispatch(x, y, z)
     kp::Workgroup workgroup({wgrp_x, wgrp_y, 1});
@@ -103,31 +119,58 @@
     // substitute the value for the number of threads (xyz) per workgroup since
     // it has to be a compile-time constant
-    shader_str = replacewith("__lcsize_x__", 32, shader_str);
-    shader_str = replacewith("__lcsize_y__", 32, shader_str);
-    shader_str = replacewith("__lcsize_z__", 1, shader_str);
+    shader_str = replacewith("__lcsize_x__", lcsize_x, shader_str);
+    shader_str = replacewith("__lcsize_y__", lcsize_y, shader_str);
+    shader_str = replacewith("__lcsize_z__", lcsize_z, shader_str);
 
-    printf("%s\n", shader_str.c_str());
-    return 0;
-
-    const std::vector<uint32_t> shader =
-        compile_shader(shader_to_string("shader.comp"));
+    // compile the shader
+    const std::vector<uint32_t> shader = compile_shader(shader_str);
 
+    // prepare the algorithm with shader, parameters, workgroups to dispatch and
+    // a specialization constant to specify the size of each tensor
     std::shared_ptr<kp::Algorithm> algo =
         mgr.algorithm(params, shader, workgroup, {MSIZE});
 
-    mgr.sequence()
-        ->record(params)
+    // start a timer to measure CPU (host) time
+    auto start = std::chrono::high_resolution_clock::now();
+
+    // evaluate the sequence of events synchronously on queue index 0 and
+    // attach a maximum of 10 timestamps
+    std::shared_ptr<kp::Sequence> sq;
+    sq = mgr.sequence(0, 10);
+    sq->rerecord();
+    sq->record(params)
         ->record(algo)
         ->record(params)
         ->eval();
 
+    // stop all the timers and get the device (GPU) timestamps
+    auto end = std::chrono::high_resolution_clock::now();
+    auto total_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(end - start)
+            .count();
+    std::vector<std::uint64_t> timestamps = sq->getTimestamps();
+    std::adjacent_difference(
+        timestamps.begin(), timestamps.end(), timestamps.begin()
+    );
+
+    // print all the timing information
+    printf("device timescale: %f\n", device_timescale);
+    printf("cpu time: %ldus\ndevice times: ", total_time);
+    for (auto i = std::next(timestamps.begin()); i < timestamps.end(); i++) {
+        float op_us = (float)(*i * device_timescale) / 1000;
+        printf("%.2fus ", op_us);
+    }
+    printf("\n");
+
     // print the resulting matrix
-    std::cout << "Output: { ";
-    for (const float &elem : tensorC->vector()) {
-        printf("%.2f, ", elem);
+    for (int y = 0; y < MSIZE; y++) {
+        for (int x = 0; x < MSIZE; x++) {
+            float elem = tensorC->vector().at(y * MSIZE + x);
+            printf("%.1f ", elem);
+        }
+        printf("\n");
     }
-    std::cout << "}" << std::endl;
 
     return 0;
 }
diff --git a/test2/shader.comp b/test2/shader.comp
index 3316350..ac006f7 100644
--- a/test2/shader.comp
+++ b/test2/shader.comp
@@ -4,9 +4,9 @@
 // The number of threads spawned per-workgroup, these are substituted by the
 // program pre-compilation
 layout(
-    local_size_x = __lcsize_x__,
-    local_size_y = __lcsize_y__,
-    local_size_z = __lcsize_z__
+    local_size_x = __lcsize_x__,
+    local_size_y = __lcsize_y__,
+    local_size_z = __lcsize_z__
 ) in;
 
 // The buffers are provided via the tensors
@@ -14,11 +14,24 @@ layout(binding = 0) buffer tensorA { float matA[]; };
 layout(binding = 1) buffer tensorB { float matB[]; };
 layout(binding = 2) buffer tensorC { float matC[]; };
 
-// specialization constants
+// specialization constant
 layout(constant_id = 0) const float tensor_size_f = 0;
 
+// each thread calculates just matC[id.y][id.x]
 void main()
 {
-    uint index = gl_GlobalInvocationID.x;
-    o[index] = a[index] * b[index];
+    uint tensor_size_u = uint(tensor_size_f);
+    // thread ID in the workgroup and workgroup ID
+    uvec3 tid = gl_LocalInvocationID;
+    uvec3 gid = gl_WorkGroupID;
+    uvec3 id = gl_GlobalInvocationID;
+
+    // Cyx = sum(k, Ayk * Bkx)
+    float acc = 0;
+    uint y = id.y * tensor_size_u;
+    uint x = id.x;
+    for (uint k = 0; k < tensor_size_u; k++) {
+        acc += matA[y + k] * matB[x + k * tensor_size_u];
+    }
+    matC[y + id.x] = acc;
 }
\ No newline at end of file
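
Note on the replacewith() calls above: the helper itself is defined elsewhere in main.cpp and does not appear in these hunks. Purely as an illustration (the name and signature are inferred from the call sites; the real definition may differ), a placeholder-substitution helper along those lines could look like this:

    // Hypothetical sketch of a replacewith()-style helper: substitute every
    // occurrence of a placeholder such as "__lcsize_x__" with an integer value
    // so that the local_size_* values become compile-time literals in the GLSL
    // source. Not the actual definition from main.cpp.
    #include <string>

    static std::string replacewith(const std::string &placeholder, int value,
                                   std::string source)
    {
        const std::string replacement = std::to_string(value);
        for (size_t pos = source.find(placeholder); pos != std::string::npos;
             pos = source.find(placeholder, pos + replacement.size())) {
            source.replace(pos, placeholder.size(), replacement);
        }
        return source;
    }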
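
On the timing side, sq->getTimestamps() returns raw device ticks; std::adjacent_difference turns consecutive timestamps into per-operation deltas in place, and multiplying by timestampPeriod converts ticks to nanoseconds (divided by 1000 above to print microseconds). A self-contained sketch of that post-processing, with made-up tick values purely for illustration:

    // Standalone sketch of the timestamp post-processing used above: convert
    // raw device ticks into per-operation microseconds. Tick values are made up.
    #include <cstdint>
    #include <cstdio>
    #include <iterator>
    #include <numeric>
    #include <vector>

    int main()
    {
        float timestamp_period_ns = 1.0f; // would come from limits.timestampPeriod
        std::vector<std::uint64_t> ticks = {100, 2500, 7300}; // illustrative values

        // in-place adjacent differences: ticks[i] becomes ticks[i] - ticks[i - 1]
        std::adjacent_difference(ticks.begin(), ticks.end(), ticks.begin());

        // skip the first element: it is the untouched first timestamp, not a delta
        for (auto it = std::next(ticks.begin()); it != ticks.end(); ++it) {
            float op_us = (float)*it * timestamp_period_ns / 1000;
            printf("%.2fus ", op_us);
        }
        printf("\n");
        return 0;
    }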
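
Finally, the shader's flat indexing (matA[y*N + k], matB[x + k*N], matC[y*N + x] with N = tensor_size_u) is plain row-major matrix multiplication, Cyx = sum over k of Ayk * Bkx. A minimal host-side reference, assuming the same MSIZE and the identity / 2*identity inputs set up in main.cpp, can be used to sanity-check the GPU output:

    // Minimal host-side reference multiply mirroring the shader's row-major
    // indexing; a sanity-check sketch, not part of the patch. With A = I and
    // B = 2*I the product should again be 2*I.
    #include <cstdio>
    #include <vector>

    int main()
    {
        const int N = 64; // same as MSIZE

        std::vector<float> A(N * N, 0.0f), B(N * N, 0.0f), C(N * N, 0.0f);
        for (int i = 0; i < N; i++) {
            A[i * N + i] = 1.0f; // identity
            B[i * N + i] = 2.0f; // 2 * identity
        }

        for (int y = 0; y < N; y++) {
            for (int x = 0; x < N; x++) {
                float acc = 0.0f;
                for (int k = 0; k < N; k++) {
                    // matA[y + k] in the shader (y pre-multiplied by N) is A[y*N + k];
                    // matB[x + k * N] is B[k*N + x]
                    acc += A[y * N + k] * B[k * N + x];
                }
                C[y * N + x] = acc;
            }
        }

        printf("C[0][0] = %.1f, C[0][1] = %.1f\n", C[0], C[1]);
        return 0;
    }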