Commit 691155c7 authored by Gustavo Valiente's avatar Gustavo Valiente

Benchmark

parent df6d8f06
cmake_minimum_required(VERSION 3.4)
project(pocket-tensor-project)
# Define build options:
option(PT_BUILD_ALL "Build all pocket-tensor artefacts" OFF)
option(PT_BUILD_TESTS "Build pocket-tensor tests" OFF)
option(PT_BUILD_BENCHMARK "Build pocket-tensor benchmark" OFF)
# Define C++ version:
if(PT_BUILD_BENCHMARK OR PT_BUILD_ALL)
    # Enable C++14:
    set(CMAKE_CXX_STANDARD 14)
else()
    # Enable C++11:
    set(CMAKE_CXX_STANDARD 11)
endif()
# Detect Clang:
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# Add library subdirectory:
add_subdirectory(lib)
# Add tests subdirectory:
if(PT_BUILD_TESTS OR PT_BUILD_ALL)
    add_subdirectory(tests)
endif()
# Add benchmark subdirectory:
if(PT_BUILD_BENCHMARK OR PT_BUILD_ALL)
    add_subdirectory(benchmark)
endif()
To build and run the unit tests, you need to generate them first:
```
python make_tests.py
mkdir tests_build
cd tests_build
cmake -DPT_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release ..
make
./tests/pocket-tensor-tests
```
The most common layer types used in image recognition and sequence prediction are supported.
## Performance
A benchmark application is included with this library. To build and run it:
```
mkdir benchmark_build
cd benchmark_build
cmake -DPT_BUILD_BENCHMARK=ON -DCMAKE_BUILD_TYPE=Release ..
make
./benchmark/pocket-tensor-benchmark
```
The prediction time of the following models has been measured on a PC with an Intel Core i7-6500U CPU @ 2.50GHz and on a Raspberry Pi 3:
### Mnist
```python
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=(28, 28, 1)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(10, activation='sigmoid'))
```
| Library | PC elapsed time (μs) | RPi3 elapsed time (μs) |
| ------------------ | -------------------: | ---------------------: |
| Keras | 1470 | 23363 |
| arquolo's Kerasify | 3502 | 64238 |
| frugally-deep | 1402 | 29298 |
| pocket-tensor | 1049 | 27329 |
### Imdb
```python
model = Sequential()
model.add(Embedding(20000, 128))
model.add(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
model.add(LSTM(128, return_sequences=False, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
```
| Library | PC elapsed time (μs) | RPi3 elapsed time (μs) |
| ------------------ | -------------------: | ---------------------: |
| Keras | 10160 | 89344 |
| arquolo's Kerasify | 5378 | 79060 |
| frugally-deep | Not supported | Not supported |
| pocket-tensor | 3314 | 67115 |
cmake_minimum_required(VERSION 3.4)
project(frugally-deep-project)
# Add library subdirectory:
add_subdirectory(lib)
The MIT License (MIT)

Copyright (c) 2016 Tobias Hermann

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
![logo](logo/fdeep.png)
[![Build Status](https://travis-ci.org/Dobiasd/frugally-deep.svg?branch=master)][travis]
[![(License MIT 1.0)](https://img.shields.io/badge/license-MIT%201.0-blue.svg)][license]
[travis]: https://travis-ci.org/Dobiasd/frugally-deep
[license]: LICENSE
frugally-deep
=============
**Use Keras models in C++ with ease**
Table of contents
-----------------
* [Introduction](#introduction)
* [Usage](#usage)
* [Performance](#performance)
* [Requirements and Installation](#requirements-and-installation)
* [Internals](#internals)
Introduction
------------
Would you like to build/train a model using Keras/Python? And would you like to run the prediction (forward pass) on your model in C++ without linking your application against TensorFlow? Then frugally-deep is exactly for you.
**frugally-deep**
* **is a small header-only library** written in modern and pure C++.
* is very easy to integrate and use.
* depends only on [FunctionalPlus](https://github.com/Dobiasd/FunctionalPlus), [Eigen](http://eigen.tuxfamily.org/) and [json](https://github.com/nlohmann/json) - also header-only libraries.
* supports inference (`model.predict`) not only for [sequential models](https://keras.io/getting-started/sequential-model-guide/) but also for computational graphs with a more complex topology, created with the [functional API](https://keras.io/getting-started/functional-api-guide/).
* re-implements a (small) subset of TensorFlow, i.e. the operations needed to support prediction.
* results in a much smaller binary size than linking against TensorFlow.
* also works out-of-the-box when compiled into a 32-bit executable.
* utterly ignores even the most powerful GPU in your system and uses only one CPU core. ;-)
* but is quite fast on one CPU core [compared to TensorFlow](#performance).
### Supported layer types
Layer types typically used in image recognition/generation are supported, making many popular model architectures possible (see [Performance section](#performance)).
* `Add`, `Concatenate`, `Subtract`, `Multiply`, `Average`, `Maximum`
* `AveragePooling1D/2D`, `GlobalAveragePooling1D/2D`
* `Conv1D/2D`, `SeparableConv2D`
* `Cropping1D/2D`, `ZeroPadding1D/2D`
* `BatchNormalization`, `Dense`, `Dropout`, `Flatten`
* `MaxPooling1D/2D`, `GlobalMaxPooling1D/2D`
* `ELU`, `LeakyReLU`, `ReLU`, `SeLU`
* `Sigmoid`, `Softmax`, `Softplus`, `Tanh`
* `UpSampling1D/2D`
* `Reshape`
### Also supported
* multiple inputs and outputs
* nested models
* residual connections
* shared layers
* arbitrary complex model architectures / computational graphs
Currently not supported are the following layer types:
`ActivityRegularization`,
`AlphaDropout`,
`AveragePooling3D`,
`Bidirectional`,
`Conv2DTranspose`,
`Conv3D`,
`ConvLSTM2D`,
`CuDNNGRU`,
`CuDNNLSTM`,
`Cropping3D`,
`DepthwiseConv2D`,
`Dot`,
`Embedding`,
`GaussianDropout`,
`GaussianNoise`,
`GRU`,
`GRUCell`,
`Lambda`,
`LocallyConnected1D`,
`LocallyConnected2D`,
`LSTM`,
`LSTMCell`,
`Masking`,
`MaxPooling3D`,
`Permute`,
`PReLU`,
`RepeatVector`,
`RNN`,
`SimpleRNN`,
`SimpleRNNCell`,
`StackedRNNCells`,
`ThresholdedReLU`,
`TimeDistributed`,
`UpSampling3D`,
`any custom layers`
Usage
-----
1) Use Keras/Python to build (`model.compile(...)`), train (`model.fit(...)`) and test (`model.evaluate(...)`) your model as usual. Then save it to a single HDF5 file using `model.save('....h5', include_optimizer=False)`. The `image_data_format` in your model must be `channels_last`, which is the default when using the TensorFlow backend. Models created with a different `image_data_format` and other backends are not supported.
2) Now convert it to the frugally-deep file format with `keras_export/convert_model.py`.
3) Finally load it in C++ (`fdeep::load_model(...)`) and use `model.predict(...)` to invoke a forward pass with your data.
The following minimal example shows the full workflow:
```python
# create_model.py
import numpy as np
from keras.layers import Input, Dense
from keras.models import Model
inputs = Input(shape=(4,))
x = Dense(5, activation='relu')(inputs)
predictions = Dense(3, activation='softmax')(x)
model = Model(inputs=inputs, outputs=predictions)
model.compile(loss='categorical_crossentropy', optimizer='nadam')
model.fit(
    np.asarray([[1,2,3,4], [2,3,4,5]]),
    np.asarray([[1,0,0], [0,0,1]]), epochs=10)
model.save('keras_model.h5', include_optimizer=False)
```
```
python3 keras_export/convert_model.py keras_model.h5 fdeep_model.json
```
```cpp
// main.cpp
#include <fdeep/fdeep.hpp>
#include <iostream>

int main()
{
    const auto model = fdeep::load_model("fdeep_model.json");
    const auto result = model.predict(
        {fdeep::tensor3(fdeep::shape3(4, 1, 1), {1, 2, 3, 4})});
    std::cout << fdeep::show_tensor3s(result) << std::endl;
}
```
When using `convert_model.py` a test case (input and corresponding output values) is generated automatically and saved along with your model. `fdeep::load_model` runs this test to make sure the results of a forward pass in frugally-deep are the same as in Keras.
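Since a failed self-test, like other errors in frugally-deep, surfaces as a `std::runtime_error` (see `raise_error` in `common.hpp` below), loading can be guarded explicitly. A minimal sketch:
```cpp
#include <fdeep/fdeep.hpp>
#include <iostream>
#include <stdexcept>

int main()
{
    try
    {
        // Re-runs the test case stored by convert_model.py and
        // throws if the outputs do not match the Keras results.
        const auto model = fdeep::load_model("fdeep_model.json");
        (void)model; // ready for model.predict(...)
    }
    catch (const std::runtime_error& e)
    {
        std::cerr << "loading failed: " << e.what() << std::endl;
        return 1;
    }
}
```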
In order to convert images to `fdeep::tensor3` the convenience function `tensor3_from_bytes` is provided ([cimg example](https://gist.github.com/Dobiasd/21651861b73042762126e8eea52d9974), [opencv example](https://gist.github.com/Dobiasd/3140cfd9f539b6adb346e0b4a0ce157b), [tensor3_to_cv_mat.cpp](https://gist.github.com/Dobiasd/7ef20a0ad47d3f8dc1654a0ca5d1c77c)).
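A sketch of that image path; the parameter order used here is an assumption for illustration, so see the linked gists for the authoritative usage:
```cpp
#include <fdeep/fdeep.hpp>
#include <cstdint>
#include <vector>

int main()
{
    // Hypothetical 28x28 grayscale image, one byte per pixel, row-major.
    const std::vector<std::uint8_t> pixels(28 * 28, 128);
    // Assumed parameter order: (data, height, width, channels).
    const auto input = fdeep::tensor3_from_bytes(pixels.data(), 28, 28, 1);
    (void)input; // would then be passed to model.predict({input})
}
```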
In case you want to convert an `Eigen::Matrix` to `fdeep::tensor3`, have a look at the following two examples: [copy values](https://gist.github.com/Dobiasd/966334bb867d170b334c8374e635cb9b), [reuse memory](https://gist.github.com/Dobiasd/2852c81adbd57a57e89d2d0385cc4c06)
Performance
-----------
Below you can find the average durations of multiple consecutive forward passes for some popular models, run on a single core of an Intel Core i5-6600 CPU @ 3.30GHz. frugally-deep was compiled (GCC ver. 5.4.0) with `g++ -O3 -mavx` (same as the TensorFlow 1.7.0 binaries). The processes were started with `CUDA_VISIBLE_DEVICES='' taskset --cpu-list 1 ...` to disable the GPU and to allow usage of only one CPU core.
```
| Model | Keras + TensorFlow | frugally-deep |
|-------------------|--------------------|---------------|
| InceptionV3 | 0.40 s | 0.33 s |
| ResNet50 | 0.39 s | 0.21 s |
| VGG16 | 0.37 s | 0.77 s |
| VGG19 | 0.45 s | 0.93 s |
| Xception | 0.84 s | 0.54 s |
| MobileNet | 0.18 s | 0.06 s |
| DenseNet201 | 0.81 s | 0.34 s |
| NASNetLarge | 2.28 s | 2.20 s |
```
Requirements and Installation
-----------------------------
A **C++14**-compatible compiler is needed. These compiler versions or newer are fine: GCC 4.9, Clang 3.7 (libc++ 3.7) and Visual C++ 2015.
Guides for different ways to install frugally-deep can be found in [`INSTALL.md`](INSTALL.md).
Internals
---------
frugally-deep uses `channels_first` (`depth/channels, height, width`) as its `image_data_format` internally. `convert_model.py` takes care of all necessary conversions.
From then on, everything is handled as a float32 tensor with rank 3. Dense layers, for example, take their input flattened to a shape of `(n, 1, 1)`, and this is also the shape you will receive as the output of a final `softmax` layer.
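For the minimal dense model from the usage example above, that means concretely (a sketch reusing `fdeep_model.json`):
```cpp
#include <fdeep/fdeep.hpp>
#include <iostream>

int main()
{
    const auto model = fdeep::load_model("fdeep_model.json");
    // Dense input: 4 values, flattened to shape (4, 1, 1).
    const fdeep::tensor3 input(fdeep::shape3(4, 1, 1), {1, 2, 3, 4});
    const auto results = model.predict({input});
    // The final softmax output arrives as shape (3, 1, 1): one value per class.
    std::cout << fdeep::show_tensor3s(results) << std::endl;
}
```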
In case you would like to use `double` instead of `float` for all calculations, simply do this:
```cpp
#define FDEEP_FLOAT_TYPE double
#include <fdeep/fdeep.hpp>
```
A frugally-deep model is thread-safe, i.e. you can call `model.predict` on the same model instance from different threads simultaneously. This way you may utilize up to as many CPU cores as you have predictions to make. `model::predict_multi` is a convenience function that handles this parallelism for you.
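A minimal sketch of two concurrent forward passes on one shared model (`std::async` is just one way to run them in parallel):
```cpp
#include <fdeep/fdeep.hpp>
#include <future>
#include <iostream>

int main()
{
    const auto model = fdeep::load_model("fdeep_model.json");
    // One forward pass, capturing the shared model by reference:
    const auto run = [&model]
    {
        return model.predict(
            {fdeep::tensor3(fdeep::shape3(4, 1, 1), {1, 2, 3, 4})});
    };
    // Both tasks call predict on the same model instance simultaneously:
    auto a = std::async(std::launch::async, run);
    auto b = std::async(std::launch::async, run);
    std::cout << fdeep::show_tensor3s(a.get()) << std::endl;
    std::cout << fdeep::show_tensor3s(b.get()) << std::endl;
}
```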
Disclaimer
----------
The API of this library still might change in the future. If you have any suggestions, find errors or want to give general feedback/criticism, I'd [love to hear from you](https://github.com/Dobiasd/frugally-deep/issues). Of course, [contributions](https://github.com/Dobiasd/frugally-deep/pulls) are also very welcome.
License
-------
Distributed under the MIT License.
(See accompanying file [`LICENSE`](https://github.com/Dobiasd/frugally-deep/blob/master/LICENSE) or at
[https://opensource.org/licenses/MIT](https://opensource.org/licenses/MIT))
cmake_minimum_required(VERSION 3.4)
project(frugally-deep)
# Add a library with headers only:
add_library(${PROJECT_NAME} INTERFACE)
# Define include directories:
target_include_directories(${PROJECT_NAME}
    INTERFACE ${PROJECT_SOURCE_DIR}/include
)
// Copyright 2016, Tobias Hermann.
// https://github.com/Dobiasd/frugally-deep
// Distributed under the MIT License.
// (See accompanying LICENSE file or at
// https://opensource.org/licenses/MIT)
#pragma once
#include <cstdint>
#include <string>
#include <vector>
namespace fdeep { namespace internal
{
// source: https://stackoverflow.com/a/31322410/1866775
static const std::uint8_t from_base64[] = { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 62, 255, 63,
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255,
255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 63,
255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255};
static const char to_base64[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
inline std::vector<std::uint8_t> Base64_decode(std::string encoded_string)
{
    // Make sure string length is a multiple of 4
    while ((encoded_string.size() % 4) != 0)
    {
        encoded_string.push_back('=');
    }
    const size_t encoded_size = encoded_string.size();
    std::vector<std::uint8_t> ret;
    ret.reserve(3 * encoded_size / 4);
    for (size_t i = 0; i < encoded_size; i += 4)
    {
        // Get values for each group of four base 64 characters
        std::uint8_t b4[4];
        b4[0] = (encoded_string[i+0] <= 'z') ? from_base64[static_cast<std::size_t>(encoded_string[i+0])] : 0xff;
        b4[1] = (encoded_string[i+1] <= 'z') ? from_base64[static_cast<std::size_t>(encoded_string[i+1])] : 0xff;
        b4[2] = (encoded_string[i+2] <= 'z') ? from_base64[static_cast<std::size_t>(encoded_string[i+2])] : 0xff;
        b4[3] = (encoded_string[i+3] <= 'z') ? from_base64[static_cast<std::size_t>(encoded_string[i+3])] : 0xff;
        // Transform into a group of three bytes
        std::uint8_t b3[3];
        b3[0] = static_cast<std::uint8_t>(((b4[0] & 0x3f) << 2) + ((b4[1] & 0x30) >> 4));
        b3[1] = static_cast<std::uint8_t>(((b4[1] & 0x0f) << 4) + ((b4[2] & 0x3c) >> 2));
        b3[2] = static_cast<std::uint8_t>(((b4[2] & 0x03) << 6) + ((b4[3] & 0x3f) >> 0));
        // Add the byte to the return value if it isn't part of an '=' character (indicated by 0xff)
        if (b4[1] != 0xff) ret.push_back(b3[0]);
        if (b4[2] != 0xff) ret.push_back(b3[1]);
        if (b4[3] != 0xff) ret.push_back(b3[2]);
    }
    return ret;
}
} } // namespace fdeep, namespace internal
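// Usage sketch (illustrative, not part of the original file; assumes the
// header above is reachable as "fdeep/base64.hpp" on the include path):
// "TWFu" is the classic base64 encoding of "Man".
#include "fdeep/base64.hpp"
#include <cassert>

int main()
{
    const auto bytes = fdeep::internal::Base64_decode("TWFu");
    assert(bytes.size() == 3);
    assert(bytes[0] == 'M' && bytes[1] == 'a' && bytes[2] == 'n');
}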
// Copyright 2016, Tobias Hermann.
// https://github.com/Dobiasd/frugally-deep
// Distributed under the MIT License.
// (See accompanying LICENSE file or at
// https://opensource.org/licenses/MIT)
#pragma once
#if defined(__GNUC__) || defined(__GNUG__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wctor-dtor-privacy"
#pragma GCC diagnostic ignored "-Wold-style-cast"
#pragma GCC diagnostic ignored "-Wsign-conversion"
#pragma GCC diagnostic ignored "-Weffc++"
#pragma GCC diagnostic ignored "-Wconversion"
#pragma GCC diagnostic ignored "-Wshadow"
#endif
#if defined _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4706)
#pragma warning(disable : 4996)
#endif
#include <Eigen/Core>
#if defined _MSC_VER
#pragma warning(pop)
#endif
#if defined(__GNUC__) || defined(__GNUG__)
#pragma GCC diagnostic pop
#endif
#include <fplus/fplus.hpp>
#include <cmath>
#include <memory>
#include <vector>
#include <string>
#include <stdexcept>
#if defined(__GNUC__) || defined(__GNUG__)
#define FDEEP_FORCE_INLINE __attribute__((always_inline)) inline
#elif defined(_MSC_VER)
#define FDEEP_FORCE_INLINE __forceinline
#else
#define FDEEP_FORCE_INLINE inline
#endif
namespace fdeep { namespace internal
{
inline std::runtime_error error(const std::string& error)
{
    return std::runtime_error(error);
}
inline void raise_error(const std::string& msg)
{
    throw error(msg);
}
inline void assertion(bool cond, const std::string& error)
{
    if (!cond)
    {
        raise_error(error);
    }
}
#ifdef FDEEP_FLOAT_TYPE
typedef FDEEP_FLOAT_TYPE float_type;
#else
typedef float float_type;
#endif
typedef std::vector<float_type> float_vec;
typedef fplus::shared_ref<float_vec> shared_float_vec;
using RowMajorMatrixXf = Eigen::Matrix<float_type, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
} } // namespace fdeep, namespace internal
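// Usage sketch (illustrative, not part of the original file): float_type
// above follows FDEEP_FLOAT_TYPE when that macro is defined before the
// first fdeep include, as described in the README.
#define FDEEP_FLOAT_TYPE double
#include <fdeep/fdeep.hpp>
#include <type_traits>

static_assert(std::is_same<fdeep::internal::float_type, double>::value,
              "float_type follows FDEEP_FLOAT_TYPE");

int main() {}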
// Copyright 2016, Tobias Hermann.
// https://github.com/Dobiasd/frugally-deep
// Distributed under the MIT License.
// (See accompanying LICENSE file or at
// https://opensource.org/licenses/MIT)
#pragma once
#include "fdeep/common.hpp"
#include "fdeep/filter.hpp"
#include <cassert>
#include <cstddef>
#include <vector>
namespace fdeep { namespace internal
{
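// Holds all filters of a layer flattened into one matrix for a GEMM-based
// (im2col) convolution: row f of mat_ contains filter f's weights
// (depth-major, then rows, then columns) followed by its bias; see
// generate_im2col_filter_matrix below.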
struct im2col_filter_matrix
{
    RowMajorMatrixXf mat_;
    shape3 filter_shape_;
    std::size_t filter_count_;
};
inline im2col_filter_matrix generate_im2col_filter_matrix(
    const std::vector<filter>& filters)
{
    assertion(fplus::all_the_same_on(
        fplus_c_mem_fn_t(filter, shape, shape3), filters),
        "all filters must have the same shape");
    const std::size_t fz = filters.front().shape().depth_;
    const std::size_t fy = filters.front().shape().height_;
    const std::size_t fx = filters.front().shape().width_;
    RowMajorMatrixXf b(filters.size(), fz * fy * fx + 1);
    Eigen::Index b_y = 0;
    Eigen::Index b_x = 0;
    for (std::size_t f = 0; f < filters.size(); ++f)
    {
        b_x = 0;
        const filter& filter = filters[f];
        for (std::size_t zf = 0; zf < fz; ++zf)
        {
            for (std::size_t yf = 0; yf < fy; ++yf)
            {
                for (std::size_t xf = 0; xf < fx; ++xf)
                {
                    b(b_y, b_x++) = filter.get(zf, yf, xf);
                }
            }
        }
        b(b_y, b_x++) = filter.get_bias();
        ++b_y;
    }
    return {b, filters.front().shape(), filters.size()};
}
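// Illustrative aside (not part of the original file): the bias lands in the
// last matrix column so that an input patch, flattened the same way
// (depth, then y, then x) and extended with a trailing constant 1, turns the
// whole convolution into a single GEMM. A self-contained sketch of that idea
// with plain Eigen, independent of the filter class used above:
//
//     #include <Eigen/Core>
//     #include <iostream>
//
//     int main()
//     {
//         using Mat = Eigen::Matrix<float, Eigen::Dynamic,
//                                   Eigen::Dynamic, Eigen::RowMajor>;
//         // Two "filters" over a 1x1x3 patch: 3 weights + 1 bias per row.
//         Mat filters(2, 4);
//         filters << 1, 0, 0, 0.5f,  // picks channel 0, bias 0.5
//                    0, 0, 1, 0.0f;  // picks channel 2, no bias
//         // One flattened patch with a trailing 1 for the bias column.
//         const Eigen::Vector4f patch(2.0f, 3.0f, 4.0f, 1.0f);
//         std::cout << filters * patch << std::endl; // prints 2.5 and 4
//     }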
inline im2col_filter_matrix generate_im2col_single_filter_matrix(
    const filter& filter)
{
    return generate_im2col_filter_matrix(filter_vec(1, filter));
}