fuck it, we multiply

master
Alessandro Mauri 12 months ago
parent 366c7d2c03
commit 4ce1d245e3
  1. .clang-format (2 changed lines)
  2. test2/Makefile (5 changed lines)
  3. test2/main.cpp (75 changed lines)
  4. test2/shader.comp (25 changed lines)

@@ -22,7 +22,7 @@ AlignConsecutiveDeclarations:
   AcrossEmptyLines: false
   AcrossComments: false
   AlignCompound: true
-  PadOperators: true
+  PadOperators: false
 AlignConsecutiveMacros:
   Enabled: true

@@ -1,6 +1,9 @@
 CPPFLAGS = -Wall -Wextra -g
 # link kompute as a static library and the rest as dynamic
-LDFLAGS = -L/usr/local/lib -Wl,-Bstatic -lkompute -lkp_logger -Wl,-Bdynamic -lvulkan -lfmt -Wl,--as-needed
+LDFLAGS = -L/usr/local/lib \
+	-Wl,-Bstatic -lkompute -lkp_logger \
+	-Wl,-Bdynamic -lvulkan -lfmt \
+	-Wl,--as-needed
 
 test2: main.cpp
 	g++ ${CPPFLAGS} main.cpp -o test2 ${LDFLAGS}

@@ -1,7 +1,9 @@
+#include <chrono>
 #include <fstream>
 #include <iostream>
 #include <iterator>
 #include <memory>
+#include <numeric>
 #include <regex>
 #include <sstream>
 #include <vector>
@@ -11,7 +13,7 @@
 #include <unistd.h>
 
-#define MSIZE 128
+#define MSIZE 64
 
 static std::vector<uint32_t> compile_shader(const std::string &source)
 {
@@ -64,6 +66,12 @@ int main()
 {
 	// create the kompute manager
 	kp::Manager mgr;
+
+	// timestampPeriod is the number of nanoseconds required for a timestamp
+	// query to be incremented by 1.
+	auto device_proprieties = mgr.getDeviceProperties();
+	float device_timescale = device_proprieties.limits.timestampPeriod;
+
 	// matrices are on the stack, this breaks for large MSIZE (1024)
 	float matrixA[MSIZE][MSIZE] = {0};
 	float matrixB[MSIZE][MSIZE] = {0};
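
Side note on what timestampPeriod is used for: the values returned by a timestamp query are raw ticks, and multiplying a tick delta by timestampPeriod gives nanoseconds. A minimal, self-contained sketch of the conversion done further down in main.cpp, using made-up numbers (a period of 52.08 ns and two arbitrary tick values, for illustration only):

#include <cstdint>
#include <cstdio>

int main()
{
	// hypothetical values, for illustration only
	std::uint64_t tick_start = 1000000, tick_end = 1038400; // raw query ticks
	float timestamp_period = 52.08f;                         // ns per tick
	float delta_us = (tick_end - tick_start) * timestamp_period / 1000.0f;
	printf("%.2fus\n", delta_us); // about 2000us of device time between the queries
	return 0;
}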
@@ -71,13 +79,16 @@ int main()
 	// fill an identity matrix
 	for (int y = 0; y < MSIZE; y++) {
 		matrixA[y][y] = 1.0;
+		matrixB[y][y] = 2.0;
 	}
 
 	// fill a matrix with data
+	/*
 	for (int y = 0; y < MSIZE; y++) {
 		for (int x = 0; x < MSIZE; x++) {
 			matrixB[y][x] = x * 0.74 - y * 0.22;
 		}
 	}
+	*/
 	// create the tensors, tensors are just arrays, in the shader we will have
 	// to describe how it translates to matrices
@@ -93,7 +104,12 @@ int main()
 	// workgroup, dispatch a 2D array of workgroups (2D matrices)
 	// TODO: determine the size of the workgroups by doing some calls to vk
-	const int wgrp_x = 32, wgrp_y = 32;
+	const int lcsize_x = 32;
+	const int lcsize_y = 32;
+	const int lcsize_z = 1;
+	const int wgrp_x = std::max(MSIZE / lcsize_x, 1);
+	const int wgrp_y = std::max(MSIZE / lcsize_y, 1);
 	// this should call vkCmdDispatch(x, y, z)
 	kp::Workgroup workgroup({wgrp_x, wgrp_y, 1});
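
A note on the dispatch arithmetic above: MSIZE / lcsize_x truncates, so the grid only covers the whole matrix when MSIZE is a multiple of the local size (it is here, 64 / 32 = 2). A small sketch contrasting that with a ceiling division, using a hypothetical MSIZE of 100:

#include <algorithm>
#include <cstdio>

int main()
{
	const int msize = 100; // hypothetical size that is not a multiple of 32
	const int lcsize = 32;
	// truncating division: 100 / 32 = 3 workgroups -> only 96 threads per axis
	const int truncated = std::max(msize / lcsize, 1);
	// ceiling division: (100 + 31) / 32 = 4 workgroups -> covers every row and column
	const int covering = (msize + lcsize - 1) / lcsize;
	printf("truncated: %d, covering: %d\n", truncated, covering);
	return 0;
}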
@@ -103,31 +119,58 @@ int main()
 	// substitute the value for the number of threads (xyz) per workgroup since
 	// it has to be a compile-time constant
-	shader_str = replacewith<int>("__lcsize_x__", 32, shader_str);
-	shader_str = replacewith<int>("__lcsize_y__", 32, shader_str);
-	shader_str = replacewith<int>("__lcsize_z__", 1, shader_str);
+	shader_str = replacewith<int>("__lcsize_x__", lcsize_x, shader_str);
+	shader_str = replacewith<int>("__lcsize_y__", lcsize_y, shader_str);
+	shader_str = replacewith<int>("__lcsize_z__", lcsize_z, shader_str);
 
-	printf("%s\n", shader_str.c_str());
-	return 0;
-
-	const std::vector<uint32_t> shader =
-		compile_shader(shader_to_string("shader.comp"));
+	// compile the shader
+	const std::vector<uint32_t> shader = compile_shader(shader_str);
 
 	// prepare the algorithm with shader, parameters, workgroups to dispatch and
 	// a specialization constant to specify the size of each tensor
 	std::shared_ptr<kp::Algorithm> algo =
 		mgr.algorithm(params, shader, workgroup, {MSIZE});
 
-	mgr.sequence()
-		->record<kp::OpTensorSyncDevice>(params)
+	// start a timer to measure CPU (host) time
+	auto start = std::chrono::high_resolution_clock::now();
+
+	// evaluate the sequence of events synchronously on queue index 0,
+	// attaching a maximum of 10 timestamps
+	std::shared_ptr<kp::Sequence> sq;
+	sq = mgr.sequence(0, 10);
+	sq->rerecord();
+	sq->record<kp::OpTensorSyncDevice>(params)
 		->record<kp::OpAlgoDispatch>(algo)
 		->record<kp::OpTensorSyncLocal>(params)
 		->eval();
+
+	// stop all the timers and get the device (GPU) timestamps
+	auto end = std::chrono::high_resolution_clock::now();
+	auto total_time =
+		std::chrono::duration_cast<std::chrono::microseconds>(end - start)
+			.count();
+	std::vector<std::uint64_t> timestamps = sq->getTimestamps();
+	std::adjacent_difference(
+		timestamps.begin(), timestamps.end(), timestamps.begin()
+	);
+
+	// print all the timing information
+	printf("device timescale: %f\n", device_timescale);
+	printf("cpu time: %ldus\ndevice times: ", total_time);
+	for (auto i = std::next(timestamps.begin()); i < timestamps.end(); i++) {
+		float op_us = (float)(*i * device_timescale) / 1000;
+		printf("%.2fus ", op_us);
+	}
+	printf("\n");
 
 	// print the resulting matrix
 	std::cout << "Output: { ";
-	for (const float &elem : tensorC->vector<float>()) {
-		printf("%.2f, ", elem);
+	for (int y = 0; y < MSIZE; y++) {
+		for (int x = 0; x < MSIZE; x++) {
+			float elem = tensorC->vector<float>().at(y * MSIZE + x);
+			printf("%.1f ", elem);
+		}
+		printf("\n");
 	}
 	std::cout << "}" << std::endl;
 
 	return 0;
 }
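
The replacewith<T> helper lives in an unchanged part of main.cpp and does not appear in this diff; judging from the call sites it takes a placeholder, a value and the shader source, and returns the substituted string. A minimal sketch of such a helper (an assumption, not the actual definition) using the <regex> and <sstream> headers the file already includes:

#include <regex>
#include <sstream>
#include <string>

// hypothetical stand-in for the real helper: replace every occurrence of
// `placeholder` in `str` with the textual form of `value`
template <typename T>
static std::string replacewith(const std::string &placeholder, T value,
                               const std::string &str)
{
	std::ostringstream ss;
	ss << value;
	return std::regex_replace(str, std::regex(placeholder), ss.str());
}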

@@ -4,9 +4,9 @@
 
 // The number of threads spawned per-workgroup, these are substituted by the
 // program pre-compilation
 layout(
-	local_size_x = __lcsize_x__,
-	local_size_y = __lcsize_y__,
-	local_size_z = __lcsize_z__
+	local_size_x = __lcsize_x__,
+	local_size_y = __lcsize_y__,
+	local_size_z = __lcsize_z__
 ) in;
// The buffers are provided via the tensors
@@ -14,11 +14,24 @@ layout(binding = 0) buffer tensorA { float matA[]; };
 layout(binding = 1) buffer tensorB { float matB[]; };
 layout(binding = 2) buffer tensorC { float matC[]; };
 
-// specialization constants
+// specialization constant
 layout(constant_id = 0) const float tensor_size_f = 0;
 
+// each thread calculates just matC[id.y][id.x]
 void main()
 {
-	uint index = gl_GlobalInvocationID.x;
-	o[index] = a[index] * b[index];
+	uint tensor_size_u = uint(tensor_size_f);
+
+	// thread ID in the workgroup and workgroup ID
+	uvec3 tid = gl_LocalInvocationID;
+	uvec3 gid = gl_WorkGroupID;
+	uvec3 id = gl_GlobalInvocationID;
+
+	// Cyx = sum(k, Ayk * Bkx)
+	float acc = 0;
+	uint y = id.y * tensor_size_u;
+	uint x = id.x;
+	for (uint k = 0; k < tensor_size_u; k++) {
+		acc += matA[y + k] * matB[x + k * tensor_size_u];
+	}
+	matC[y + id.x] = acc;
 }
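
The shader stores each matrix row-major in a flat buffer, so matA[y + k] reads A[id.y][k] and matB[x + k * tensor_size_u] reads B[k][id.x]. A small host-side reference of the same indexing, handy for checking the GPU output (with A set to the identity and B to twice the identity, as in main.cpp, the product is twice the identity):

#include <cstdio>
#include <vector>

// host-side reference of the shader's row-major indexing:
// C[y][x] = sum_k A[y][k] * B[k][x], with M[y][x] stored at M[y * N + x]
static std::vector<float> matmul_ref(const std::vector<float> &A,
                                     const std::vector<float> &B, unsigned N)
{
	std::vector<float> C(N * N, 0.0f);
	for (unsigned y = 0; y < N; y++) {
		for (unsigned x = 0; x < N; x++) {
			float acc = 0.0f;
			for (unsigned k = 0; k < N; k++)
				acc += A[y * N + k] * B[x + k * N];
			C[y * N + x] = acc;
		}
	}
	return C;
}

int main()
{
	// A = identity, B = 2 * identity, as in main.cpp; the product is 2 * identity
	const unsigned N = 4;
	std::vector<float> A(N * N, 0.0f), B(N * N, 0.0f);
	for (unsigned i = 0; i < N; i++) {
		A[i * N + i] = 1.0f;
		B[i * N + i] = 2.0f;
	}
	std::vector<float> C = matmul_ref(A, B, N);
	for (unsigned y = 0; y < N; y++) {
		for (unsigned x = 0; x < N; x++)
			printf("%.1f ", C[y * N + x]);
		printf("\n");
	}
	return 0;
}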