fuck it, we multiply
commit 4ce1d245e3 (parent 366c7d2c03)
.clang-format
@@ -22,7 +22,7 @@ AlignConsecutiveDeclarations:
   AcrossEmptyLines: false
   AcrossComments: false
   AlignCompound: true
-  PadOperators: true
+  PadOperators: false

 AlignConsecutiveMacros:
   Enabled: true
Makefile
@@ -1,6 +1,9 @@
 CPPFLAGS = -Wall -Wextra -g
 # link kompute as a static library and the rest as dynamic
-LDFLAGS = -L/usr/local/lib -Wl,-Bstatic -lkompute -lkp_logger -Wl,-Bdynamic -lvulkan -lfmt -Wl,--as-needed
+LDFLAGS = -L/usr/local/lib \
+          -Wl,-Bstatic -lkompute -lkp_logger \
+          -Wl,-Bdynamic -lvulkan -lfmt \
+          -Wl,--as-needed

 test2: main.cpp
 	g++ ${CPPFLAGS} main.cpp -o test2 ${LDFLAGS}
main.cpp
@@ -1,7 +1,9 @@
+#include <chrono>
 #include <fstream>
 #include <iostream>
 #include <iterator>
 #include <memory>
+#include <numeric>
 #include <regex>
 #include <sstream>
 #include <vector>
@@ -11,7 +13,7 @@

 #include <unistd.h>

-#define MSIZE 128
+#define MSIZE 64

 static std::vector<uint32_t> compile_shader(const std::string &source)
 {
@@ -64,6 +66,12 @@ int main()
 {
     // create the kompute manager
     kp::Manager mgr;
+
+    // timestampPeriod is the number of nanoseconds required for a timestamp
+    // query to be incremented by 1.
+    auto device_proprieties = mgr.getDeviceProperties();
+    float device_timescale = device_proprieties.limits.timestampPeriod;
+
     // matrices are on the stack, this breaks for large MSIZE (1024)
     float matrixA[MSIZE][MSIZE] = {0};
     float matrixB[MSIZE][MSIZE] = {0};
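The "breaks for large MSIZE" comment is a stack overflow: two 1024x1024 float arrays alone are 8 MiB, at or past the usual default stack limit. A minimal sketch of a heap-backed alternative, not part of this commit; it keeps the same flat row-major layout the tensors expect:

#include <vector>

#define MSIZE 1024 // large sizes are fine on the heap

int main()
{
    // flat, row-major matrices: element (y, x) lives at y * MSIZE + x
    std::vector<float> matrixA(MSIZE * MSIZE, 0.0f);
    std::vector<float> matrixB(MSIZE * MSIZE, 0.0f);

    for (int y = 0; y < MSIZE; y++) {
        matrixA[y * MSIZE + y] = 1.0f; // identity
        matrixB[y * MSIZE + y] = 2.0f; // 2 * identity
    }
    // the vectors can then back the kompute tensors as before
    return 0;
}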
@@ -71,13 +79,16 @@ int main()
     // fill an identity matrix
     for (int y = 0; y < MSIZE; y++) {
         matrixA[y][y] = 1.0;
+        matrixB[y][y] = 2.0;
     }
     // fill a matrix with data
+    /*
     for (int y = 0; y < MSIZE; y++) {
         for (int x = 0; x < MSIZE; x++) {
             matrixB[y][x] = x * 0.74 - y * 0.22;
         }
     }
+    */

     // create the tensors, tensors are just arrays, in the shader we will have
     // to describe how it translates to matrices
@@ -93,7 +104,12 @@ int main()

     // workgroup, dispatch a 2D array of workgroups (2D matrices)
     // TODO: determine the size of the workgroups by doing some calls to vk
-    const int wgrp_x = 32, wgrp_y = 32;
+    const int lcsize_x = 32;
+    const int lcsize_y = 32;
+    const int lcsize_z = 1;
+    const int wgrp_x = std::max(MSIZE / lcsize_x, 1);
+    const int wgrp_y = std::max(MSIZE / lcsize_y, 1);
+

     // this should call vkCmdDispatch(x, y, z)
     kp::Workgroup workgroup({wgrp_x, wgrp_y, 1});
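One thing the integer division above hides: MSIZE / lcsize_x truncates, so for an MSIZE that is not a multiple of 32 the last partial tile would get no workgroup at all. A sketch of the usual ceiling-division fix (illustrative only, not what this commit does):

// round up so every element is covered when size is not a multiple of local
constexpr int ceil_div(int size, int local)
{
    return (size + local - 1) / local;
}

static_assert(ceil_div(64, 32) == 2, "exact multiple");
static_assert(ceil_div(100, 32) == 4, "rounds up: 128 threads, 28 idle");

// the shader would then need a bounds check, e.g.
//     if (id.x >= tensor_size_u || id.y >= tensor_size_u) return;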
@@ -103,31 +119,58 @@ int main()

     // substitute the value for the number of threads (xyz) per workgroup since
     // it has to be a compile-time constant
-    shader_str = replacewith<int>("__lcsize_x__", 32, shader_str);
-    shader_str = replacewith<int>("__lcsize_y__", 32, shader_str);
-    shader_str = replacewith<int>("__lcsize_z__", 1, shader_str);
+    shader_str = replacewith<int>("__lcsize_x__", lcsize_x, shader_str);
+    shader_str = replacewith<int>("__lcsize_y__", lcsize_y, shader_str);
+    shader_str = replacewith<int>("__lcsize_z__", lcsize_z, shader_str);

-    printf("%s\n", shader_str.c_str());
-    return 0;
+    // compile the shader
+    const std::vector<uint32_t> shader = compile_shader(shader_str);

-    const std::vector<uint32_t> shader =
-        compile_shader(shader_to_string("shader.comp"));

+    // prepare the algorithm with shader, parameters, workgroups to dispatch and
+    // a specialization constant to specify the size of each tensor
     std::shared_ptr<kp::Algorithm> algo =
         mgr.algorithm(params, shader, workgroup, {MSIZE});

-    mgr.sequence()
-        ->record<kp::OpTensorSyncDevice>(params)
+    // start a timer to measure CPU (host) time
+    auto start = std::chrono::high_resolution_clock::now();
+
+    // evaluate the sequence of events synchronously on queue index 0,
+    // attaching a maximum of 10 timestamps
+    std::shared_ptr<kp::Sequence> sq;
+    sq = mgr.sequence(0, 10);
+    sq->rerecord();
+    sq->record<kp::OpTensorSyncDevice>(params)
         ->record<kp::OpAlgoDispatch>(algo)
         ->record<kp::OpTensorSyncLocal>(params)
         ->eval();

-    // print the resulting matrix
-    std::cout << "Output: { ";
-    for (const float &elem : tensorC->vector<float>()) {
-        printf("%.2f, ", elem);
+    // stop all the timers and get the device (GPU) timestamps
+    auto end = std::chrono::high_resolution_clock::now();
+    auto total_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(end - start)
+            .count();
+    std::vector<std::uint64_t> timestamps = sq->getTimestamps();
+    std::adjacent_difference(
+        timestamps.begin(), timestamps.end(), timestamps.begin()
+    );
+
+    // print all the timing information
+    printf("device timescale: %f\n", device_timescale);
+    printf("cpu time: %ldus\ndevice times: ", total_time);
+    for (auto i = std::next(timestamps.begin()); i < timestamps.end(); i++) {
+        float op_us = (float)(*i * device_timescale) / 1000;
+        printf("%.2fus ", op_us);
+    }
+    printf("\n");
+
+    // print the resulting matrix
+    for (int y = 0; y < MSIZE; y++) {
+        for (int x = 0; x < MSIZE; x++) {
+            float elem = tensorC->vector<float>().at(y * MSIZE + x);
+            printf("%.1f ", elem);
+        }
+        printf("\n");
     }
-    std::cout << "}" << std::endl;

     return 0;
 }
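The timestamp handling deserves a worked example: getTimestamps() yields absolute tick counts, std::adjacent_difference rewrites them in place into per-operation deltas (element 0 keeps the raw first value, which is why the print loop starts at std::next), and timestampPeriod scales ticks to nanoseconds. A self-contained sketch with made-up numbers; the timescale and tick values are illustrative, not from a real device:

#include <cstdint>
#include <cstdio>
#include <iterator>
#include <numeric>
#include <vector>

int main()
{
    float device_timescale = 52.08f; // ns per tick, hypothetical value

    // absolute ticks, as a sequence might report them (made-up data)
    std::vector<std::uint64_t> ts = {1000, 3000, 45000, 46000};

    // in place: {1000, 2000, 42000, 1000}; element 0 keeps the raw start
    std::adjacent_difference(ts.begin(), ts.end(), ts.begin());

    for (auto i = std::next(ts.begin()); i != ts.end(); ++i) {
        // ticks * ns-per-tick / 1000 = microseconds
        printf("%.2fus ", (float)(*i * device_timescale) / 1000);
    }
    printf("\n"); // prints: 104.16us 2187.36us 52.08us
    return 0;
}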
shader.comp
@@ -14,11 +14,24 @@ layout(binding = 0) buffer tensorA { float matA[]; };
 layout(binding = 1) buffer tensorB { float matB[]; };
 layout(binding = 2) buffer tensorC { float matC[]; };

-// specialization constants
+// specialization constant
 layout(constant_id = 0) const float tensor_size_f = 0;

+// each thread calculates just matC[id.y][id.x]
 void main()
 {
-    uint index = gl_GlobalInvocationID.x;
-    o[index] = a[index] * b[index];
+    uint tensor_size_u = uint(tensor_size_f);
+    // thread ID in the workgroup and workgroup ID
+    uvec3 tid = gl_LocalInvocationID;
+    uvec3 gid = gl_WorkGroupID;
+    uvec3 id = gl_GlobalInvocationID;
+
+    // Cyx = sum(k, Ayk * Bkx)
+    float acc = 0;
+    uint y = id.y * tensor_size_u;
+    uint x = id.x;
+    for (uint k = 0; k < tensor_size_u; k++) {
+        acc += matA[y + k] * matB[x + k * tensor_size_u];
+    }
+    matC[y + id.x] = acc;
 }
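Since both matrices are flattened row-major, the shader's indexing (matA[y + k] with y pre-multiplied by the size, matB[x + k * size]) is exactly Cyx = sum(k, Ayk * Bkx). A small CPU reference with the same layout can sanity-check the GPU output; this is a sketch for testing, not part of the commit:

#include <cstdio>
#include <vector>

// same row-major indexing as the shader: element (y, x) lives at y * n + x
static std::vector<float> matmul_ref(const std::vector<float> &a,
                                     const std::vector<float> &b, int n)
{
    std::vector<float> c(n * n, 0.0f);
    for (int y = 0; y < n; y++)
        for (int x = 0; x < n; x++) {
            float acc = 0;
            for (int k = 0; k < n; k++)
                acc += a[y * n + k] * b[k * n + x];
            c[y * n + x] = acc;
        }
    return c;
}

int main()
{
    const int n = 4;
    std::vector<float> a(n * n, 0.0f), b(n * n, 0.0f);
    for (int y = 0; y < n; y++) {
        a[y * n + y] = 1.0f; // identity, as in main.cpp
        b[y * n + y] = 2.0f; // 2 * identity
    }
    // I * (2I) = 2I, so the diagonal of c should be all 2.0
    std::vector<float> c = matmul_ref(a, b, n);
    printf("c[0][0] = %.1f\n", c[0]); // 2.0
    return 0;
}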