Commit 4dfba2ea authored by Gustavo Valiente

Multithread support

Tensor padding removed
catch-mini updated
parent 162571a0
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE QtCreatorProject>
- <!-- Written by QtCreator 4.4.1, 2018-05-20T16:39:39. -->
+ <!-- Written by QtCreator 4.4.1, 2018-07-13T17:03:54. -->
<qtcreator>
<data>
<variable>EnvironmentId</variable>
@@ -115,7 +115,7 @@
<value type="QString">CMAKE_CXX_COMPILER:STRING=%{Compiler:Executable:Cxx}</value>
<value type="QString">CMAKE_C_COMPILER:STRING=%{Compiler:Executable:C}</value>
<value type="QString">CMAKE_PREFIX_PATH:STRING=%{Qt:QT_INSTALL_PREFIX}</value>
<value type="QString">PT_BUILD_ALL:BOOL=ON</value>
<value type="QString">PT_BUILD_ALL:BOOL=OFF</value>
<value type="QString">QT_QMAKE_EXECUTABLE:STRING=%{Qt:qmakeExecutable}</value>
</valuelist>
<value type="QString" key="ProjectExplorer.BuildConfiguration.BuildDirectory">C:/dev/builds/pocket-tensor-Release</value>
@@ -219,7 +219,7 @@
<value type="QString" key="CMakeProjectManager.CMakeRunConfiguration.UserWorkingDirectory.default">C:/dev/builds/pocket-tensor-Release/tests</value>
<value type="int" key="PE.EnvironmentAspect.Base">2</value>
<valuelist type="QVariantList" key="PE.EnvironmentAspect.Changes"/>
<value type="QString" key="ProjectExplorer.ProjectConfiguration.DefaultDisplayName">pocket-tensor-tests</value>
<value type="QString" key="ProjectExplorer.ProjectConfiguration.DefaultDisplayName">pocket-tensor-tests (disabled)</value>
<value type="QString" key="ProjectExplorer.ProjectConfiguration.DisplayName"></value>
<value type="QString" key="ProjectExplorer.ProjectConfiguration.Id">CMakeProjectManager.CMakeRunConfiguration.pocket-tensor-tests</value>
<value type="uint" key="RunConfiguration.QmlDebugServerPort">3768</value>
......
@@ -4,6 +4,7 @@ project(pocket-tensor)
# Define sources:
set(SOURCES
src/pt_tensor.cpp
+ src/pt_dispatcher.cpp
src/pt_layer.cpp
src/pt_dense_layer.cpp
src/pt_conv_1d_layer.cpp
@@ -30,3 +31,8 @@ target_include_directories(${PROJECT_NAME}
target_include_directories(${PROJECT_NAME}
PUBLIC ${PT_LIBSIMDPP_PATH}
)
+ # Include pthread (GCC only):
+ if(CMAKE_COMPILER_IS_GNUCC)
+ target_link_libraries(${PROJECT_NAME} pthread)
+ endif()
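Note: CMAKE_COMPILER_IS_GNUCC only matches the GNU C compiler, so this block skips Clang builds that may also need the threading library. A more portable alternative (not part of this commit) would be find_package(Threads REQUIRED) followed by target_link_libraries(${PROJECT_NAME} Threads::Threads), which resolves the correct threading flags on every platform.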
/*
* pocket-tensor (c) 2018 Gustavo Valiente gustavo.valiente.m@gmail.com
* Kerasify (c) 2016 Robert W. Rose
*
* MIT License, see LICENSE file.
*/
#ifndef PT_DISPATCHER_H
#define PT_DISPATCHER_H
#include <cstddef>
#include <vector>
#include <deque>
#include <thread>
#include <mutex>
#include <functional>
#include <condition_variable>
namespace pt
{
class Dispatcher
{
public:
using Task = std::function<void(void)>;
Dispatcher();
explicit Dispatcher(std::size_t threads);
~Dispatcher();
std::size_t threads() const noexcept
{
return _threadsCount;
}
void add(Task&& task); // enqueues one task for the worker threads
std::size_t pendingTasks() noexcept;
void join(); // blocks until every queued task has finished (see dispatcher.join() usage below)
protected:
std::mutex _mutex;
std::condition_variable _condition;
std::deque<Task> _tasks;
std::vector<std::thread> _threads;
std::size_t _threadsCount;
bool _exit;
std::mutex _pendingTasksMutex;
std::condition_variable _pendingTasksCondition;
std::size_t _pendingTasks;
};
}
#endif
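pt_dispatcher.cpp is registered in CMakeLists.txt above but its body is not part of this excerpt. A minimal sketch of the implementation this header implies (the member names follow the header; the loop logic itself is an assumption, not the commit's actual code):

#include "pt_dispatcher.h"

namespace pt
{

Dispatcher::Dispatcher(std::size_t threads) :
    _threadsCount(threads),
    _exit(false),
    _pendingTasks(0)
{
    for(std::size_t i = 0; i != threads; ++i)
    {
        // Each worker pops tasks until _exit is set and the queue drains:
        _threads.emplace_back([this]
        {
            while(true)
            {
                Task task;
                {
                    std::unique_lock<std::mutex> lock(_mutex);
                    _condition.wait(lock, [this]{ return _exit || ! _tasks.empty(); });

                    if(_tasks.empty())
                    {
                        return; // _exit was set and no work is left
                    }

                    task = std::move(_tasks.front());
                    _tasks.pop_front();
                }

                task(); // run outside the lock so other workers can dequeue

                {
                    std::lock_guard<std::mutex> lock(_pendingTasksMutex);
                    --_pendingTasks;
                }
                _pendingTasksCondition.notify_all();
            }
        });
    }
}

Dispatcher::~Dispatcher()
{
    {
        std::lock_guard<std::mutex> lock(_mutex);
        _exit = true;
    }
    _condition.notify_all();

    for(std::thread& thread : _threads)
    {
        thread.join();
    }
}

void Dispatcher::add(Task&& task)
{
    {
        std::lock_guard<std::mutex> lock(_pendingTasksMutex);
        ++_pendingTasks;
    }
    {
        std::lock_guard<std::mutex> lock(_mutex);
        _tasks.push_back(std::move(task));
    }
    _condition.notify_one();
}

void Dispatcher::join()
{
    // Waits on the pending-task counter, not for thread exit, so the
    // dispatcher stays reusable for the next layer:
    std::unique_lock<std::mutex> lock(_pendingTasksMutex);
    _pendingTasksCondition.wait(lock, [this]{ return _pendingTasks == 0; });
}

// (The default constructor and pendingTasks() are omitted for brevity.)

}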
@@ -14,8 +14,7 @@
namespace pt
{
class Tensor;
- class Config;
+ struct LayerData;
class Layer
{
@@ -25,7 +24,7 @@ public:
virtual ~Layer() noexcept;
- virtual bool apply(const Config& config, Tensor&& in, Tensor& out) const = 0;
+ virtual bool apply(LayerData& layerData) const = 0;
protected:
Layer() = default;
......
/*
* pocket-tensor (c) 2018 Gustavo Valiente gustavo.valiente.m@gmail.com
* Kerasify (c) 2016 Robert W. Rose
*
* MIT License, see LICENSE file.
*/
#ifndef PT_LAYER_DATA_H
#define PT_LAYER_DATA_H
#include "pt_tensor.h"
namespace pt
{
class Config;
class Dispatcher;
struct LayerData
{
Tensor in;
Tensor& out;
Dispatcher& dispatcher;
const Config& config;
};
}
#endif
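For orientation, this is roughly how the new struct could travel through a forward pass. The real driver lives in pt_model.cpp, which this excerpt does not show; runLayers and its loop are illustrative assumptions only:

#include <memory>
#include <vector>
#include "pt_layer.h"
#include "pt_layer_data.h"

// Hypothetical driver loop: each layer consumes layerData.in and writes
// layerData.out, and the produced tensor is fed forward.
bool runLayers(const std::vector<std::unique_ptr<pt::Layer>>& layers,
               pt::Dispatcher& dispatcher, const pt::Config& config,
               pt::Tensor in, pt::Tensor& out)
{
    pt::LayerData layerData{ std::move(in), out, dispatcher, config };

    for(const auto& layer : layers)
    {
        if(! layer->apply(layerData))
        {
            return false;
        }

        // out is a reference member, so move the result back into the
        // input slot for the next layer:
        layerData.in = std::move(layerData.out);
    }

    out = std::move(layerData.in);
    return true;
}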
@@ -15,6 +15,9 @@
namespace pt
{
class Tensor;
+ class Dispatcher;
class Model
{
@@ -25,6 +28,8 @@ public:
bool predict(Tensor in, Tensor& out) const;
+ bool predict(Dispatcher& dispatcher, Tensor in, Tensor& out) const;
const Config& getConfig() const noexcept
{
return _config;
......
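Together with the Dispatcher above, the new overload is used like this. A usage sketch: pt::Model::create and the tensor setup follow the library's README, and the thread count is arbitrary:

#include "pt_dispatcher.h"
#include "pt_model.h"
#include "pt_tensor.h"

pt::Dispatcher dispatcher(4); // four worker threads
auto model = pt::Model::create("example.model");

pt::Tensor in{ 10 };
in.fill(1);

pt::Tensor out;
bool ok = model->predict(dispatcher, std::move(in), out);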
@@ -17,6 +17,8 @@
namespace pt
{
+ class Dispatcher;
class Tensor
{
@@ -63,11 +65,6 @@ public:
return _dims;
}
- const DimsVector& getUnpaddedDims() const noexcept
- {
- return _unpaddedDims;
- }
std::size_t getSize() const noexcept
{
return getSizeImpl(_dims);
@@ -166,14 +163,6 @@ public:
void resize(std::size_t i, std::size_t j, std::size_t k, std::size_t l);
- void resizeWithPadding(std::size_t i);
- void resizeWithPadding(std::size_t i, std::size_t j);
- void resizeWithPadding(std::size_t i, std::size_t j, std::size_t k);
- void resizeWithPadding(std::size_t i, std::size_t j, std::size_t k, std::size_t l);
void setData(DataVector data) noexcept
{
PT_ASSERT(_data.size() == data.size());
@@ -181,15 +170,6 @@ public:
_data = std::move(data);
}
- bool hasPadding() const noexcept
- {
- return ! _unpaddedDims.empty();
- }
- void addPadding(bool copyData = true);
- void removePadding(bool copyData = true);
void fill(Type value) noexcept;
void flatten();
@@ -212,39 +192,39 @@ public:
return output;
}
- void add(const Tensor& other, Tensor& out) const;
+ void add(const Tensor& other, Tensor& out, Dispatcher& dispatcher) const;
- Tensor add(const Tensor& other) const
+ Tensor add(const Tensor& other, Dispatcher& dispatcher) const
{
Tensor output;
- add(other, output);
+ add(other, output, dispatcher);
return output;
}
- void multiply(const Tensor& other, Tensor& out) const;
+ void multiply(const Tensor& other, Tensor& out, Dispatcher& dispatcher) const;
- Tensor multiply(const Tensor& other) const
+ Tensor multiply(const Tensor& other, Dispatcher& dispatcher) const
{
Tensor output;
- multiply(other, output);
+ multiply(other, output, dispatcher);
return output;
}
- void dot(const Tensor& other, Tensor& out) const;
+ void dot(const Tensor& other, Tensor& out, Dispatcher& dispatcher) const;
- Tensor dot(const Tensor& other) const
+ Tensor dot(const Tensor& other, Dispatcher& dispatcher) const
{
Tensor output;
- dot(other, output);
+ dot(other, output, dispatcher);
return output;
}
- void fma(const Tensor& scale, const Tensor& bias, Tensor& out) const;
+ void fma(const Tensor& scale, const Tensor& bias, Tensor& out, Dispatcher& dispatcher) const;
- Tensor fma(const Tensor& scale, const Tensor& bias) const
+ Tensor fma(const Tensor& scale, const Tensor& bias, Dispatcher& dispatcher) const
{
Tensor output;
- fma(scale, bias, output);
+ fma(scale, bias, output, dispatcher);
return output;
}
@@ -256,7 +236,6 @@ public:
protected:
DimsVector _dims;
- DimsVector _unpaddedDims;
DataVector _data;
static std::size_t getSizeImpl(const DimsVector& dims) noexcept
......
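Call sites change mechanically: every arithmetic helper now takes the dispatcher that fans the work out. A short sketch, assuming tensors a, b, scale and bias already exist with matching dims:

pt::Dispatcher dispatcher(2);
pt::Tensor c = a.dot(b, dispatcher);           // was: a.dot(b)
pt::Tensor d = a.fma(scale, bias, dispatcher); // was: a.fma(scale, bias)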
@@ -21,6 +21,9 @@
#define PT_LOOP_UNROLLING_ENABLE 0
#endif
+ // Define max CPU threads:
+ #define PT_MAX_CPU_THREADS 16
// Define libsimdpp arch:
#ifdef __arm__
#define SIMDPP_ARCH_ARM_NEON_FLT_SP
......
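Note that the convolution kernels below allocate std::array<Task, PT_MAX_CPU_THREADS> task buffers, so a Dispatcher built with more worker threads than this constant would index past the array. No guard appears in this excerpt; a defensive clamp at construction time (hypothetical, not in the commit) could look like:

#include <algorithm>
#include <thread>

// Hypothetical guard, not in the commit:
std::size_t workers = std::min<std::size_t>(std::thread::hardware_concurrency(),
                                            PT_MAX_CPU_THREADS);
pt::Dispatcher dispatcher(workers);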
@@ -7,8 +7,8 @@
#include "pt_activation_layer.h"
#include "pt_tensor.h"
#include "pt_parser.h"
#include "pt_layer_data.h"
#include "pt_linear_activation_layer.h"
#include "pt_relu_activation_layer.h"
#include "pt_elu_activation_layer.h"
@@ -96,10 +96,10 @@ std::unique_ptr<ActivationLayer> ActivationLayer::create(std::istream& stream)
return activationLayer;
}
- bool ActivationLayer::apply(const Config& config, Tensor&& in, Tensor& out) const
+ bool ActivationLayer::apply(LayerData& layerData) const
{
- out = std::move(in);
- apply(config, out);
+ layerData.out = std::move(layerData.in);
+ apply(layerData.out);
return true;
}
......
@@ -13,15 +13,17 @@
namespace pt
{
class Tensor;
class ActivationLayer : public Layer
{
public:
static std::unique_ptr<ActivationLayer> create(std::istream& stream);
- virtual void apply(const Config& config, Tensor& out) const = 0;
+ virtual void apply(Tensor& out) const = 0;
- bool apply(const Config& config, Tensor&& in, Tensor& out) const final;
+ bool apply(LayerData& layerData) const final;
protected:
ActivationLayer() = default;
......
@@ -7,6 +7,7 @@
#include "pt_batch_normalization_layer.h"
#include "pt_layer_data.h"
#include "pt_logger.h"
namespace pt
@@ -40,17 +41,19 @@ std::unique_ptr<BatchNormalizationLayer> BatchNormalizationLayer::create(std::is
new BatchNormalizationLayer(std::move(*weights), std::move(*biases)));
}
- bool BatchNormalizationLayer::apply(const Config&, Tensor&& in, Tensor& out) const
+ bool BatchNormalizationLayer::apply(LayerData& layerData) const
{
+ const Tensor& in = layerData.in;
if(in.getDims() != _weights.getDims())
{
PT_LOG_ERROR << "Input and weights tensor dims are different" <<
" (input dims: " << VectorPrinter<std::size_t>{in.getUnpaddedDims()} << ")" <<
" (weights dims: " << VectorPrinter<std::size_t>{_weights.getUnpaddedDims()} << ")" << std::endl;
" (input dims: " << VectorPrinter<std::size_t>{ in.getDims() } << ")" <<
" (weights dims: " << VectorPrinter<std::size_t>{ _weights.getDims() } << ")" << std::endl;
return false;
}
- in.fma(_weights, _biases, out);
+ in.fma(_weights, _biases, layerData.out, layerData.dispatcher);
return true;
}
@@ -58,8 +61,6 @@ BatchNormalizationLayer::BatchNormalizationLayer(Tensor&& weights, Tensor&& bias
_weights(std::move(weights)),
_biases(std::move(biases))
{
- _weights.addPadding();
- _biases.addPadding();
}
}
@@ -20,7 +20,7 @@ class BatchNormalizationLayer : public Layer
public:
static std::unique_ptr<BatchNormalizationLayer> create(std::istream& stream);
- bool apply(const Config& config, Tensor&& in, Tensor& out) const final;
+ bool apply(LayerData& layerData) const final;
protected:
Tensor _weights;
......
@@ -7,6 +7,9 @@
#include "pt_conv_1d_layer.h"
+ #include <array>
+ #include "pt_dispatcher.h"
+ #include "pt_layer_data.h"
#include "pt_multiply_add.h"
#include "pt_logger.h"
@@ -16,38 +19,78 @@ namespace pt
namespace
{
template<class MultiplyAddType>
- void multiplyAddImpl(const Tensor& weights, const Tensor& biases, const Tensor& in,
- Tensor& out) noexcept
+ void multiplyAddImpl(const Tensor& weights, const Tensor& biases, LayerData& layerData)
{
- const auto& ww = weights.getDims();
- const auto& ow = out.getDims();
- auto outInc = int(ow[1]);
- auto ws0 = int(ww[2] * ww[1]);
- auto ws1 = int(ww[2]);
- auto tx = int(ow[0]);
- auto inBegin = in.begin();
- auto outBegin = out.begin();
- auto wBegin = weights.begin();
- auto wEnd = weights.end();
- auto bBegin = biases.begin();
- MultiplyAddType multiplyAdd;
- for(int x = 0; x != tx; ++x)
+ struct Task
{
- auto inIt = inBegin + x * ws1;
- auto outIt = outBegin;
- auto bIt = bBegin;
- outBegin += outInc;
+ const Tensor* weights;
+ const Tensor* biases;
+ LayerData* layerData;
+ int threads;
+ int taskId;
- for(auto w0 = wBegin; w0 != wEnd; w0 += ws0)
+ void operator()() noexcept
{
- *outIt = *bIt + multiplyAdd(&*inIt, &*w0, ws0);
- ++outIt;
- ++bIt;
+ const Tensor& in = layerData->in;
+ Tensor& out = layerData->out;
+ const auto& ww = weights->getDims();
+ const auto& ow = out.getDims();
+ auto outInc = int(ow[1]);
+ auto wInc = int(ww[2] * ww[1]);
+ auto wInc2 = int(ww[2]);
+ auto tx = int(ow[0]);
+ auto inBegin = in.begin();
+ auto outBegin = out.begin();
+ auto wBegin = weights->begin();
+ auto wEnd = weights->end();
+ auto bBegin = biases->begin();
+ MultiplyAddType multiplyAdd;
+ int its = tx;
+ int taskIts = its / threads;
+ int taskBegin = taskIts * taskId;
+ int taskEnd;
+ if(taskId == threads - 1)
+ {
+ taskEnd = its;
+ }
+ else
+ {
+ taskEnd = taskBegin + taskIts;
+ }
+ for(int x = taskBegin; x != taskEnd; ++x)
+ {
+ auto inIt = inBegin + x * wInc2;
+ auto outIt = outBegin + x * outInc;
+ auto bIt = bBegin;
+ for(auto wIt = wBegin; wIt != wEnd; wIt += wInc)
+ {
+ *outIt = *bIt + multiplyAdd(&*inIt, &*wIt, wInc);
+ ++outIt;
+ ++bIt;
+ }
+ }
+ }
+ };
+ std::array<Task, PT_MAX_CPU_THREADS> tasks;
+ Dispatcher& dispatcher = layerData.dispatcher;
+ int threads = int(dispatcher.threads());
+ for(int taskId = 0; taskId != threads; ++taskId)
+ {
+ Task& task = tasks[std::size_t(taskId)];
+ task = Task{ &weights, &biases, &layerData, threads, taskId };
+ dispatcher.add([&task]{ task(); });
+ }
+ dispatcher.join();
}
}
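Task::operator() above splits the tx output positions evenly across the workers: every task covers taskIts = its / threads positions, and the last task also absorbs the remainder. For example, its = 50 on 8 threads gives taskIts = 6, so tasks 0 to 6 cover [0,6), [6,12), ... [36,42) and task 7 covers [42,50). The same index arithmetic as a standalone helper (hypothetical, for illustration only):

#include <utility>

// Returns the [begin, end) range handled by task taskId out of `threads`:
std::pair<int, int> taskRange(int its, int threads, int taskId) noexcept
{
    int taskIts = its / threads;
    int taskBegin = taskIts * taskId;
    int taskEnd = (taskId == threads - 1) ? its : taskBegin + taskIts;
    return { taskBegin, taskEnd };
}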
@@ -81,14 +124,15 @@ std::unique_ptr<Conv1DLayer> Conv1DLayer::create(std::istream& stream)
std::move(activation)));
}
- bool Conv1DLayer::apply(const Config& config, Tensor&& in, Tensor& out) const
+ bool Conv1DLayer::apply(LayerData& layerData) const
{
+ const Tensor& in = layerData.in;
const auto& iw = in.getDims();
if(iw.size() != 2)
{
PT_LOG_ERROR << "Input tensor dims count must be 2" <<
" (input dims: " << VectorPrinter<std::size_t>{in.getUnpaddedDims()} << ")" << std::endl;
" (input dims: " << VectorPrinter<std::size_t>{ iw } << ")" << std::endl;
return false;
}
@@ -97,24 +141,32 @@ bool Conv1DLayer::apply(const Config& config, Tensor&& in, Tensor& out) const
if(iw[1] != ww[2])
{
PT_LOG_ERROR << "Input tensor dims[1] must be the same as weights dims[2]" <<
" (input dims: " << VectorPrinter<std::size_t>{in.getUnpaddedDims()} << ")" <<
" (weights dims: " << VectorPrinter<std::size_t>{_weights.getUnpaddedDims()} << ")" << std::endl;
" (input dims: " << VectorPrinter<std::size_t>{ iw } << ")" <<
" (weights dims: " << VectorPrinter<std::size_t>{ ww } << ")" << std::endl;
return false;
}
auto offset = ww[1] - 1;
- out.resizeWithPadding(iw[0] - offset, ww[0]);
+ Tensor& out = layerData.out;
+ out.resize(iw[0] - offset, ww[0]);
- if(PT_LOOP_UNROLLING_ENABLE && (ww[2] * ww[1]) % (Tensor::VectorSize * 2) == 0)
+ int threads = int(layerData.dispatcher.threads());
+ int threadSize = int(ww[2] * ww[1]) / threads;
+ if(PT_LOOP_UNROLLING_ENABLE && threadSize && threadSize % (Tensor::VectorSize * 2) == 0)
{
+ multiplyAddImpl<Vector2MultiplyAdd>(_weights, _biases, layerData);
}
+ else if(threadSize && threadSize % Tensor::VectorSize == 0)
+ {
- multiplyAddImpl<Vector2MultiplyAdd>(_weights, _biases, in, out);
+ multiplyAddImpl<VectorMultiplyAdd>(_weights, _biases, layerData);
}
else
{
- multiplyAddImpl<VectorMultiplyAdd>(_weights, _biases, in, out);
+ multiplyAddImpl<ScalarMultiplyAdd>(_weights, _biases, layerData);
}
- _activation->apply(config, out);
+ _activation->apply(out);
return true;
}
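The kernel choice above mirrors the old single-threaded check but accounts for the work split: threadSize is the multiply-add span divided across the workers, and a SIMD kernel is only picked when that per-thread share is non-zero and stays a multiple of the vector width (twice the width for the unrolled Vector2 path). Assuming a 128-bit vector of floats (Tensor::VectorSize == 4), ww[2] * ww[1] = 128 on 4 threads gives threadSize = 32, which selects the unrolled path, while 36 on 4 threads gives 9 and falls back to ScalarMultiplyAdd.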
@@ -124,8 +176,6 @@ Conv1DLayer::Conv1DLayer(Tensor&& weights, Tensor&& biases,
_biases(std::move(biases)),
_activation(std::move(activation))
{
- _weights.addPadding();
- _biases.addPadding();
}
}
@@ -20,7 +20,7 @@ class Conv1DLayer : public Layer
public:
static std::unique_ptr<Conv1DLayer> create(std::istream& stream);
- bool apply(const Config& config, Tensor&& in, Tensor& out) const final;
+ bool apply(LayerData& layerData) const final;
protected:
Tensor _weights;
......
@@ -7,6 +7,9 @@
#include "pt_conv_2d_layer.h"
+ #include <array>
+ #include "pt_dispatcher.h"
+ #include "pt_layer_data.h"
#include "pt_multiply_add.h"
#include "pt_logger.h"
@@ -16,50 +19,93 @@ namespace pt
namespace
{
template<class MultiplyAddType>
- void multiplyAddImpl(const Tensor& weights, const Tensor& biases, const Tensor& in,
- Tensor& out) noexcept
+ void multiplyAddImpl(const Tensor& weights, const Tensor& biases, LayerData& layerData)
{
- const auto& iw = in.getDims();
- const auto& ww = weights.getDims();
- const auto& ow = out.getDims();
- auto outInc = int(ow[2]);
- auto ws = int(ww[0] * ww[1] * ww[2] * ww[3]);
- auto ws0 = int(ww[1] * ww[2] * ww[3]);
- auto ws1 = int(ww[2] * ww[3]);
- auto ws2 = int(ww[3]);
- auto is0 = int(ww[3] * iw[1]);
- auto ty = int(ow[0]);
- auto tx = int(ow[1]);
- auto inBegin = in.getData().data();
- auto outBegin = const_cast<Tensor::Type*>(out.getData().data());
- auto wBegin = weights.getData().data();
- auto bBegin = biases.getData().data();
- MultiplyAddType multiplyAdd;
- for(int y = 0; y != ty; ++y)
+ struct Task
{
- for(int x = 0; x != tx; ++x)
{
auto