fuck it, we multiply
commit 4ce1d245e3 (parent 366c7d2c03)
@@ -22,7 +22,7 @@ AlignConsecutiveDeclarations:
   AcrossEmptyLines: false
   AcrossComments: false
   AlignCompound: true
-  PadOperators: true
+  PadOperators: false
 
 AlignConsecutiveMacros:
   Enabled: true
@@ -1,6 +1,9 @@
 CPPFLAGS = -Wall -Wextra -g
 # link kompute as a static library and the rest as dynamic
-LDFLAGS = -L/usr/local/lib -Wl,-Bstatic -lkompute -lkp_logger -Wl,-Bdynamic -lvulkan -lfmt -Wl,--as-needed
+LDFLAGS = -L/usr/local/lib \
+          -Wl,-Bstatic -lkompute -lkp_logger \
+          -Wl,-Bdynamic -lvulkan -lfmt \
+          -Wl,--as-needed
 
 test2: main.cpp
 	g++ ${CPPFLAGS} main.cpp -o test2 ${LDFLAGS}
@@ -1,7 +1,9 @@
+#include <chrono>
 #include <fstream>
 #include <iostream>
 #include <iterator>
 #include <memory>
+#include <numeric>
 #include <regex>
 #include <sstream>
 #include <vector>
@@ -11,7 +13,7 @@
 
 #include <unistd.h>
 
-#define MSIZE 128
+#define MSIZE 64
 
 static std::vector<uint32_t> compile_shader(const std::string &source)
 {
@@ -64,6 +66,12 @@ int main()
 {
     // create the kompute manager
     kp::Manager mgr;
+
+    // timestampPeriod is the number of nanoseconds required for a timestamp
+    // query to be incremented by 1.
+    auto device_proprieties = mgr.getDeviceProperties();
+    float device_timescale = device_proprieties.limits.timestampPeriod;
+
     // matrices are on the stack, this breaks for large MSIZE (1024)
     float matrixA[MSIZE][MSIZE] = {0};
     float matrixB[MSIZE][MSIZE] = {0};
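The timestampPeriod limit is what turns raw GPU timestamp ticks into wall-clock time: elapsed nanoseconds = (tick difference) x timestampPeriod. A minimal standalone sketch of that conversion, with made-up tick values and period (not part of the commit):

#include <cstdint>
#include <cstdio>

int main()
{
    // hypothetical values: raw ticks from a timestamp query, and the
    // device's timestampPeriod in nanoseconds per tick
    uint64_t tick_start = 1000, tick_end = 5200;
    float timestampPeriod = 52.08f;

    double elapsed_ns = double(tick_end - tick_start) * timestampPeriod;
    printf("elapsed: %.0f ns (%.2f us)\n", elapsed_ns, elapsed_ns / 1000.0);
    return 0;
}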
@@ -71,13 +79,16 @@ int main()
     // fill an identity matrix
     for (int y = 0; y < MSIZE; y++) {
         matrixA[y][y] = 1.0;
+        matrixB[y][y] = 2.0;
     }
     // fill a matrix with data
+    /*
     for (int y = 0; y < MSIZE; y++) {
         for (int x = 0; x < MSIZE; x++) {
             matrixB[y][x] = x * 0.74 - y * 0.22;
         }
     }
+    */
 
     // create the tensors, tensors are just arrays, in the shader we will have
     // to describe how it translates to matrices
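With matrixA filled as the identity and matrixB as 2*I, the product C = A*B is again 2*I, so the correct GPU output is easy to recognize: 2.0 on the diagonal, 0.0 everywhere else. A small CPU check of that expectation (illustrative only, N stands in for MSIZE):

#include <cassert>

int main()
{
    const int N = 4; // small stand-in for MSIZE
    float A[N][N] = {0}, B[N][N] = {0}, C[N][N] = {0};
    for (int i = 0; i < N; i++) { A[i][i] = 1.0f; B[i][i] = 2.0f; }

    // plain triple-loop reference multiply
    for (int y = 0; y < N; y++)
        for (int x = 0; x < N; x++)
            for (int k = 0; k < N; k++)
                C[y][x] += A[y][k] * B[k][x];

    // identity times 2*I stays 2*I
    for (int y = 0; y < N; y++)
        for (int x = 0; x < N; x++)
            assert(C[y][x] == (y == x ? 2.0f : 0.0f));
    return 0;
}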
@@ -93,7 +104,12 @@ int main()
 
     // workgroup, dispatch a 2D array of workgroups (2D matrices)
     // TODO: determine the size of the workgroups by doing some calls to vk
-    const int wgrp_x = 32, wgrp_y = 32;
+    const int lcsize_x = 32;
+    const int lcsize_y = 32;
+    const int lcsize_z = 1;
+    const int wgrp_x = std::max(MSIZE / lcsize_x, 1);
+    const int wgrp_y = std::max(MSIZE / lcsize_y, 1);
 
     // this should call vkCmdDispatch(x, y, z)
     kp::Workgroup workgroup({wgrp_x, wgrp_y, 1});
 
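With MSIZE = 64 and a 32x32 local size, this dispatches 2x2 workgroups, i.e. 64x64 invocations, one per output element. Plain integer division under-dispatches when MSIZE is not a multiple of the local size; a ceil-division is the usual guard. A sketch of the arithmetic (the ceil_div helper is an assumption, not in the commit):

#include <cstdio>

constexpr int ceil_div(int n, int d) { return (n + d - 1) / d; }

int main()
{
    const int MSIZE = 64, lcsize = 32;
    printf("truncating: %d groups\n", MSIZE / lcsize);          // 2
    printf("ceil_div:   %d groups\n", ceil_div(MSIZE, lcsize)); // 2
    // for a non-multiple, truncation loses the last partial workgroup
    printf("MSIZE=100 -> %d vs %d\n", 100 / lcsize, ceil_div(100, lcsize)); // 3 vs 4
    return 0;
}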
@@ -103,31 +119,58 @@ int main()
 
     // substitute the value for the number of threads (xyz) per workgroup since
     // it has to be a compile-time constant
-    shader_str = replacewith<int>("__lcsize_x__", 32, shader_str);
-    shader_str = replacewith<int>("__lcsize_y__", 32, shader_str);
-    shader_str = replacewith<int>("__lcsize_z__", 1, shader_str);
+    shader_str = replacewith<int>("__lcsize_x__", lcsize_x, shader_str);
+    shader_str = replacewith<int>("__lcsize_y__", lcsize_y, shader_str);
+    shader_str = replacewith<int>("__lcsize_z__", lcsize_z, shader_str);
 
-    printf("%s\n", shader_str.c_str());
-    return 0;
-
-    const std::vector<uint32_t> shader =
-        compile_shader(shader_to_string("shader.comp"));
+    // compile the shader
+    const std::vector<uint32_t> shader = compile_shader(shader_str);
 
     // prepare the algorithm with shader, parameters, workgroups to dispatch and
     // a specialization constant constant to specify the size of each tensor
     std::shared_ptr<kp::Algorithm> algo =
        mgr.algorithm(params, shader, workgroup, {MSIZE});
 
-    mgr.sequence()
-        ->record<kp::OpTensorSyncDevice>(params)
+    // start a timer to measure CPU (host) time
+    auto start = std::chrono::high_resolution_clock::now();
+
+    // evaluate the sequence of events synchronously on queue index 0 and
+    // attaching a maximum of 10 timestamp
+    std::shared_ptr<kp::Sequence> sq;
+    sq = mgr.sequence(0, 10);
+    sq->rerecord();
+    sq->record<kp::OpTensorSyncDevice>(params)
         ->record<kp::OpAlgoDispatch>(algo)
         ->record<kp::OpTensorSyncLocal>(params)
         ->eval();
 
-    // print the resulting matrix
-    std::cout << "Output: { ";
-    for (const float &elem : tensorC->vector<float>()) {
-        printf("%.2f, ", elem);
-    }
-    std::cout << "}" << std::endl;
+    // stop all the timers and get the device (GPU) timestamps
+    auto end = std::chrono::high_resolution_clock::now();
+    auto total_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(end - start)
+            .count();
+    std::vector<std::uint64_t> timestamps = sq->getTimestamps();
+    std::adjacent_difference(
+        timestamps.begin(), timestamps.end(), timestamps.begin()
+    );
+
+    // print all the timing information
+    printf("device timescale: %f\n", device_timescale);
+    printf("cpu time: %ldus\ndevice times: ", total_time);
+    for (auto i = std::next(timestamps.begin()); i < timestamps.end(); i++) {
+        float op_us = (float)(*i * device_timescale) / 1000;
+        printf("%.2fus ", op_us);
+    }
+    printf("\n");
+
+    // print the resulting matrix
+    for (int y = 0; y < MSIZE; y++) {
+        for (int x = 0; x < MSIZE; x++) {
+            float elem = tensorC->vector<float>().at(y * MSIZE + x);
+            printf("%.1f ", elem);
+        }
+        printf("\n");
+    }
 
     return 0;
 }
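The code treats the values returned by getTimestamps() as absolute tick values written around each recorded operation: std::adjacent_difference rewrites the vector in place so every element after the first becomes a per-operation delta, which is why the print loop starts at std::next. A standalone sketch with made-up numbers showing the same transformation and the microsecond conversion:

#include <cstdint>
#include <cstdio>
#include <iterator>
#include <numeric>
#include <vector>

int main()
{
    // hypothetical raw ticks and device timescale (ns per tick)
    std::vector<std::uint64_t> ts = {1000, 1400, 9400, 9900};
    const float device_timescale = 52.08f;

    // in place: ts becomes {1000, 400, 8000, 500}; element 0 stays absolute
    std::adjacent_difference(ts.begin(), ts.end(), ts.begin());

    // skip element 0, convert each delta to microseconds
    for (auto i = std::next(ts.begin()); i != ts.end(); ++i)
        printf("%.2fus ", (float)(*i) * device_timescale / 1000);
    printf("\n");
    return 0;
}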
@@ -4,9 +4,9 @@
 // The number of threads spawned per-workgroup, these are substituted by the
 // program pre-compilation
 layout(
-    local_size_x = __lcsize_x__,
-    local_size_y = __lcsize_y__,
-    local_size_z = __lcsize_z__
+    local_size_x = __lcsize_x__,
+    local_size_y = __lcsize_y__,
+    local_size_z = __lcsize_z__
 ) in;
 
 // The buffers are provided via the tensors
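The __lcsize_*__ placeholders are substituted as plain text before the shader is compiled at runtime, since local_size_x/y/z must be compile-time constants in GLSL. The commit's replacewith<int> helper is not shown in this diff; a minimal sketch of what such a helper could look like using std::regex (main.cpp already includes <regex>), under the assumption that it performs a straightforward textual replace:

#include <cstdio>
#include <regex>
#include <string>

// sketch of a placeholder-substitution helper in the spirit of
// replacewith<int>(); the real implementation is not part of this diff
template <typename T>
std::string replacewith_sketch(const std::string &placeholder, T value,
                               const std::string &text)
{
    // replace every occurrence of the placeholder with the value's text form
    return std::regex_replace(text, std::regex(placeholder),
                              std::to_string(value));
}

int main()
{
    std::string src = "layout(local_size_x = __lcsize_x__) in;";
    src = replacewith_sketch<int>("__lcsize_x__", 32, src);
    printf("%s\n", src.c_str()); // layout(local_size_x = 32) in;
    return 0;
}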
@@ -14,11 +14,24 @@ layout(binding = 0) buffer tensorA { float matA[]; };
 layout(binding = 1) buffer tensorB { float matB[]; };
 layout(binding = 2) buffer tensorC { float matC[]; };
 
-// specialization constants
+// specialization constant
 layout(constant_id = 0) const float tensor_size_f = 0;
 
+// each thread calculates just matC[id.y][id.x]
 void main()
 {
-    uint index = gl_GlobalInvocationID.x;
-    o[index] = a[index] * b[index];
+    uint tensor_size_u = uint(tensor_size_f);
+    // thread ID in the workgroup and workgroup ID
+    uvec3 tid = gl_LocalInvocationID;
+    uvec3 gid = gl_WorkGroupID;
+    uvec3 id = gl_GlobalInvocationID;
+
+    // Cyx = sum(k, Ayk * Bkx)
+    float acc = 0;
+    uint y = id.y * tensor_size_u;
+    uint x = id.x;
+    for (uint k = 0; k < tensor_size_u; k++) {
+        acc += matA[y + k] * matB[x + k * tensor_size_u];
+    }
+    matC[y + id.x] = acc;
 }
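The tensors are flat 1D buffers, so the shader addresses element (row, col) of an MSIZE x MSIZE matrix as row * MSIZE + col: matA[y + k] is A[row][k] and matB[x + k * tensor_size_u] is B[k][col]. A CPU sketch of the same flattened indexing for a single output element (illustrative, not from the commit):

#include <cstdio>
#include <vector>

int main()
{
    const unsigned N = 4; // stands in for MSIZE / tensor_size_u
    std::vector<float> A(N * N, 0.0f), B(N * N, 0.0f), C(N * N, 0.0f);
    for (unsigned i = 0; i < N; i++) { A[i * N + i] = 1.0f; B[i * N + i] = 2.0f; }

    unsigned row = 1, col = 3;     // what the shader calls id.y and id.x
    unsigned y = row * N, x = col; // same names as in the shader
    float acc = 0.0f;
    for (unsigned k = 0; k < N; k++)
        acc += A[y + k] * B[x + k * N]; // A[row][k] * B[k][col]
    C[y + col] = acc;

    printf("C[%u][%u] = %.1f\n", row, col, C[y + col]); // 0.0 off the diagonal
    return 0;
}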