Program Listing for File facedetectcnn.h
↰ Return to documentation for file (include/libfacedetectcnn/facedetectcnn.h)
// Copyright (c) 2023 PAL Robotics S.L. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Copyright (c) 2018-2021, Shiqi Yu, all rights reserved. shiqi.yu@gmail.com
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * Neither the name of the {copyright_holder} nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
// Copied and adapted from https://github.com/ShiqiYu/libfacedetection
#pragma once
#include "facedetection_export.h"
//#define _ENABLE_AVX512 //Please enable it if X64 CPU
//#define _ENABLE_AVX2 //Please enable it if X64 CPU
//#define _ENABLE_NEON //Please enable it if ARM CPU
// Exported C-style entry point of the face detector.
// NOTE(review): `step` is presumably the number of bytes per image row, and the
// returned int* presumably points at the detection results - confirm both
// against the implementation, which is not part of this header.
FACEDETECTION_EXPORT int * facedetect_cnn(unsigned char * result_buffer, //buffer memory for storing face detection results; its size must be 0x20000 bytes
unsigned char * rgb_image_data, int width, int height, int step); //input image; despite the parameter name it must be a BGR (three channels) image instead of an RGB image
/*
DO NOT EDIT the following code if you don't really understand it.
*/
// The AVX back ends need the x86 intrinsics header.
#if defined(_ENABLE_AVX512) || defined(_ENABLE_AVX2)
#include <immintrin.h>
#endif
#if defined(_ENABLE_NEON)
#include "arm_neon.h"
//NEON does not support a UINT8*INT8 dot product,
//so the input data is converted to the range [0, 127]
//and an INT8*INT8 dot product is used instead
#define _MAX_UINT8_VALUE 127
#else
#define _MAX_UINT8_VALUE 255
#endif
// Alignment granularity per back end. CDataBlob::create() divides this value
// by 8 before using it as a byte count, so the numbers below are in bits.
#if defined(_ENABLE_AVX512)
#define _MALLOC_ALIGN 512
#elif defined(_ENABLE_AVX2)
#define _MALLOC_ALIGN 256
#else
#define _MALLOC_ALIGN 128
#endif
// At most one of the three SIMD switches may be defined; any pair is rejected
// at compile time.
#if defined(_ENABLE_AVX512)&& defined(_ENABLE_NEON)
#error Cannot enable the two of AVX512 and NEON at the same time.
#endif
#if defined(_ENABLE_AVX2)&& defined(_ENABLE_NEON)
#error Cannot enable the two of AVX and NEON at the same time.
#endif
#if defined(_ENABLE_AVX512)&& defined(_ENABLE_AVX2)
#error Cannot enable the two of AVX512 and AVX2 at the same time.
#endif
// omp.h is only pulled in when the compiler defines _OPENMP.
#if defined(_OPENMP)
#include <omp.h>
#endif
#include <string.h>
#include <vector>
#include <iostream>
#include <typeinfo>
// Allocation helpers used by CDataBlob. They are only declared in this header.
// NOTE(review): presumably myAlloc returns memory aligned to _MALLOC_ALIGN -
// confirm against the implementation.
void* myAlloc(size_t size);
void myFree_(void* ptr);
// myFree takes the ADDRESS of a pointer: it hands the pointed-to pointer to
// myFree_() and then sets the caller's pointer to 0.
// Note that the expansion already ends with ';', so `myFree(&p);` expands to
// two statements - wrap it in braces when used under an if/else.
#define myFree(ptr) (myFree_(*(ptr)), *(ptr)=0);
// MIN/MAX are defined only when no earlier header provided them.
// Each argument can be evaluated twice, so avoid arguments with side effects.
#ifndef MIN
# define MIN(a,b) ((a) > (b) ? (b) : (a))
#endif
#ifndef MAX
# define MAX(a,b) ((a) < (b) ? (b) : (a))
#endif
// One entry of the detector output (objectdetect_cnn and detection_output
// return a std::vector of these).
// NOTE(review): field meanings below are inferred from the names - confirm
// against the code that fills the struct.
struct FaceRect_
{
    float score;  // presumably the detection confidence
    int x;        // presumably the left edge of the box
    int y;        // presumably the top edge of the box
    int w;        // presumably the box width
    int h;        // presumably the box height
    int lm[10];   // ten integers; presumably five (x, y) landmark pairs
};
typedef FaceRect_ FaceRect;
// Raw description of one convolution layer, consumed by
// Filters<T>::operator=(ConvInfoStruct&).
struct ConvInfoStruct_
{
    int channels;       // elements copied per filter row by Filters::operator=
    int num_filters;    // number of filters; also the number of bias values copied
    bool is_depthwise;  // set (with is_pointwise clear) for the 3x3 depth-wise layout
    bool is_pointwise;  // set (with is_depthwise clear) for the 1x1 point-wise layout
    bool with_relu;     // copied verbatim into Filters::with_relu
    float* pWeights;    // source weights, read as consecutive runs of `channels` floats
    float* pBiases;     // source biases, `num_filters` floats
};
typedef ConvInfoStruct_ ConvInfoStruct;
// Owning, move-only buffer of T organised as rows x cols positions with
// `channels` elements per position.
//
// Every position occupies `channelStep` bytes: the channel data followed by
// padding up to a multiple of (_MALLOC_ALIGN / 8) bytes. The buffer comes from
// myAlloc() and is released through myFree().
//
// An empty blob has data == nullptr and all dimensions set to 0; this is also
// the state of a moved-from blob and of a blob whose create() failed.
template <typename T>
class CDataBlob
{
public:
    int rows;
    int cols;
    int channels; //in element
    int channelStep; //in byte
    T * data;

public:
    // Builds an empty blob.
    CDataBlob()
        : rows(0), cols(0), channels(0), channelStep(0), data(nullptr)
    {
    }

    // Allocates an r x c x ch blob and clears it (padding bytes included).
    // If the allocation fails the blob stays empty; check isEmpty().
    CDataBlob(int r, int c, int ch)
        : rows(0), cols(0), channels(0), channelStep(0), data(nullptr)
    {
        create(r, c, ch);
        //#warning "confirm later"
        setZero();
    }

    ~CDataBlob()
    {
        setNULL();
    }

    // The blob owns its buffer, so it can be moved but not copied.
    CDataBlob(const CDataBlob<T> &) = delete;
    CDataBlob<T> &operator=(const CDataBlob<T> &) = delete;

    // Takes over the buffer of `other` and leaves `other` empty.
    CDataBlob(CDataBlob<T> &&other) noexcept
        : rows(other.rows),
          cols(other.cols),
          channels(other.channels),
          channelStep(other.channelStep),
          data(other.data)
    {
        other.data = nullptr;
        other.rows = other.cols = other.channels = other.channelStep = 0;
    }

    // Releases the current buffer, then takes over the buffer of `other`.
    // Self-assignment is a no-op (destroying and re-constructing in place
    // would free the buffer and leave the blob empty).
    CDataBlob<T> &operator=(CDataBlob<T> &&other) noexcept
    {
        if (this != &other)
        {
            setNULL();
            data = other.data;
            rows = other.rows;
            cols = other.cols;
            channels = other.channels;
            channelStep = other.channelStep;
            other.data = nullptr;
            other.rows = other.cols = other.channels = other.channelStep = 0;
        }
        return *this;
    }

    // Frees the buffer (if any) and resets the blob to the empty state.
    void setNULL()
    {
        if (data)
        {
            myFree(&data);
        }
        rows = cols = channels = channelStep = 0;
        data = nullptr;
    }

    // Zero-fills the whole buffer, padding included. No-op on an empty blob.
    void setZero()
    {
        // Do the size arithmetic in size_t; an int product can overflow.
        if (data)
            memset(data, 0, static_cast<size_t>(channelStep) * rows * cols);
    }

    inline bool isEmpty() const
    {
        return (rows <= 0 || cols <= 0 || channels == 0 || data == nullptr);
    }

    // Drops the current content and allocates room for r x c positions of
    // ch elements each. The new buffer is NOT cleared (not even the padding
    // bytes); call setZero() when zeroed memory is required.
    // Returns false, reports on std::cerr and leaves the blob empty when a
    // dimension is negative or myAlloc() returns nullptr.
    bool create(int r, int c, int ch)
    {
        setNULL();

        size_t paddedBytes = 0;
        if (r >= 0 && c >= 0 && ch >= 0)
        {
            // Round the bytes per position up to the alignment granularity.
            const size_t alignBytes = _MALLOC_ALIGN / 8;
            const size_t rawBytes = sizeof(T) * static_cast<size_t>(ch);
            const size_t remBytes = rawBytes % alignBytes;
            paddedBytes = (remBytes == 0) ? rawBytes : rawBytes + alignBytes - remBytes;
            data = static_cast<T*>(myAlloc(static_cast<size_t>(r) * c * paddedBytes));
        }

        if (data == nullptr)
        {
            std::cerr << "Failed to alloc memeory for uint8 data blob: "
                      << r << "*"
                      << c << "*"
                      << ch << std::endl;
            return false;
        }

        rows = r;
        cols = c;
        channels = ch;
        channelStep = static_cast<int>(paddedBytes);
        return true;
    }

    // Pointer to the first channel of position (r, c), or nullptr when the
    // blob has no buffer or the position is out of range.
    inline T * ptr(int r, int c)
    {
        if( this->data == nullptr || r < 0 || r >= this->rows || c < 0 || c >= this->cols )
            return nullptr;
        return (this->data + (size_t(r) * this->cols + c) * this->channelStep /sizeof(T));
    }

    inline const T * ptr(int r, int c) const
    {
        if( this->data == nullptr || r < 0 || r >= this->rows || c < 0 || c >= this->cols )
            return nullptr;
        return (this->data + (size_t(r) * this->cols + c) * this->channelStep /sizeof(T));
    }

    // Bounds-checked read; returns T(0) for an empty blob or an invalid index.
    inline const T getElement(int r, int c, int ch) const
    {
        if (ch < 0 || ch >= this->channels)
            return (T)(0);
        const T * p = this->ptr(r, c);
        return p ? p[ch] : (T)(0);
    }

private:
    // Streams one element. Types narrower than 4 bytes are printed as
    // integers so that 8-bit data shows up as numbers, not characters.
    static void printElement(std::ostream &output, const T &value)
    {
        if (sizeof(T) < 4)
            output << (int)(value);
        else
            output << value;
    }

public:
    // Debug dump. Blobs with at most 16 elements are printed completely,
    // channel by channel; larger blobs print the first 16 elements, the last
    // element and the minimum/maximum over all elements.
    friend std::ostream &operator<<(std::ostream &output, const CDataBlob &dataBlob)
    {
        output << "DataBlob Size (channels, rows, cols) = ("
               << dataBlob.channels
               << ", " << dataBlob.rows
               << ", " << dataBlob.cols
               << ")" << std::endl;

        // 64-bit product: rows * cols * channels can overflow an int.
        const long long total =
            static_cast<long long>(dataBlob.rows) * dataBlob.cols * dataBlob.channels;

        if (total <= 16)
        { //print every element only when there are at most 16 of them
            for (int ch = 0; ch < dataBlob.channels; ch++)
            {
                output << "Channel " << ch << ": " << std::endl;
                for (int r = 0; r < dataBlob.rows; r++)
                {
                    output << "(";
                    for (int c = 0; c < dataBlob.cols; c++)
                    {
                        CDataBlob::printElement(output, dataBlob.getElement(r, c, ch));
                        if (c != dataBlob.cols - 1)
                            output << ", ";
                    }
                    output << ")" << std::endl;
                }
            }
        }
        else
        {
            output << "(";
            int printed = 0;
            for (int r = 0; r < dataBlob.rows && printed < 16; ++r) {
                for (int c = 0; c < dataBlob.cols && printed < 16; ++c) {
                    for (int ch = 0; ch < dataBlob.channels && printed < 16; ++ch) {
                        CDataBlob::printElement(output, dataBlob.getElement(r, c, ch));
                        output << ", ";
                        ++printed;
                    }
                }
            }
            output << "..., ";
            CDataBlob::printElement(
                output,
                dataBlob.getElement(dataBlob.rows - 1, dataBlob.cols - 1, dataBlob.channels - 1));
            output << ")" << std::endl;

            // Seed min/max from the data itself; fixed seeds such as +-500
            // give wrong answers for values outside that range.
            bool seeded = false;
            float max_it = 0.f;
            float min_it = 0.f;
            for (int r = 0; r < dataBlob.rows; ++r) {
                for (int c = 0; c < dataBlob.cols; ++c) {
                    for (int ch = 0; ch < dataBlob.channels; ++ch) {
                        const float value = static_cast<float>(dataBlob.getElement(r, c, ch));
                        if (value != value)
                            continue; // NaN never orders; leave it out of min/max
                        if (!seeded)
                        {
                            max_it = min_it = value;
                            seeded = true;
                        }
                        else
                        {
                            if (value > max_it) max_it = value;
                            if (value < min_it) min_it = value;
                        }
                    }
                }
            }
            output << "max_it: " << max_it << " min_it: " << min_it << std::endl;
        }
        return output;
    }
};
// Parameters of one convolution layer: layout flags plus weights and biases
// stored in CDataBlob form.
template <typename T>
class Filters{
public:
    int channels;
    int num_filters;
    bool is_depthwise;
    bool is_pointwise;
    bool with_relu;
    CDataBlob<T> weights;
    CDataBlob<T> biases;

    Filters()
        : channels(0),
          num_filters(0),
          is_depthwise(false),
          is_pointwise(false),
          with_relu(true)
    {
    }

    // Loads the layer described by `convinfo`.
    //
    // Only T == float is supported, and only two layouts:
    //   * 1x1 point-wise: weights become a 1 x num_filters x channels blob;
    //   * 3x3 depth-wise: weights become a 1 x 9 x channels blob.
    // Biases always become a 1 x 1 x num_filters blob.
    //
    // On any error (wrong type, null source pointers, unsupported layout,
    // failed allocation) a message is written to std::cerr and *this is
    // returned without copying any parameter data; callers can detect this
    // through weights.isEmpty() / biases.isEmpty().
    Filters & operator=(ConvInfoStruct & convinfo)
    {
        if (typeid(float) != typeid(T))
        {
            std::cerr << "The data type must be float in this version." << std::endl;
            return *this;
        }
        if (typeid(float*) != typeid(convinfo.pWeights) ||
            typeid(float*) != typeid(convinfo.pBiases))
        {
            std::cerr << "The data type of the filter parameters must be float in this version." << std::endl;
            return *this;
        }
        // memcpy from a null source would crash; refuse up front.
        if (convinfo.pWeights == nullptr || convinfo.pBiases == nullptr)
        {
            std::cerr << "The filter parameters (pWeights/pBiases) must not be null." << std::endl;
            return *this;
        }

        this->channels = convinfo.channels;
        this->num_filters = convinfo.num_filters;
        this->is_depthwise = convinfo.is_depthwise;
        this->is_pointwise = convinfo.is_pointwise;
        this->with_relu = convinfo.with_relu;

        bool weightsReady = false;
        if(!this->is_depthwise && this->is_pointwise) //1x1 point wise
        {
            weightsReady = this->weights.create(1, num_filters, channels);
        }
        else if(this->is_depthwise && !this->is_pointwise) //3x3 depth wise
        {
            weightsReady = this->weights.create(1, 9, channels);
        }
        else
        {
            std::cerr << "Unsupported filter type. Only 1x1 point-wise and 3x3 depth-wise are supported." << std::endl;
            return *this;
        }

        // CDataBlob::create() already reported the failure; copying into a
        // blob without a buffer would write through a null pointer.
        if (!weightsReady)
            return *this;
        if (!this->biases.create(1, 1, num_filters))
            return *this;

        //the format of convinfo.pWeights/biases must meet the format in this->weights/biases
        const size_t rowBytes = static_cast<size_t>(channels) * sizeof(T);
        for(int fidx = 0; fidx < this->weights.cols; fidx++)
        {
            T * dst = this->weights.ptr(0, fidx);
            if (dst == nullptr)
                continue;
            // size_t offset: channels * fidx can overflow an int.
            memcpy(dst,
                   convinfo.pWeights + static_cast<size_t>(channels) * fidx,
                   rowBytes);
        }

        T * biasDst = this->biases.ptr(0, 0);
        if (biasDst != nullptr)
            memcpy(biasDst, convinfo.pBiases, sizeof(T) * this->num_filters);
        return *this;
    }
};
// ---- Free functions of the detector. Only the declarations live in this
// ---- header; the definitions are elsewhere.

// Runs detection on a raw image buffer and returns the detections by value.
// NOTE(review): the second parameter is spelled `with`; presumably it is the
// image width - confirm against the definition.
std::vector<FaceRect> objectdetect_cnn(const unsigned char* rgbImageData, int with, int height, int step);
// Builds a float blob from a raw image buffer; padDivisor defaults to 32.
// NOTE(review): the name suggests the image is repacked so that a 3x3/stride-2/
// pad-1 convolution becomes a 1x1 one - confirm against the definition.
CDataBlob<float> setDataFrom3x3S2P1to1x1S1P0FromImage(const unsigned char* inputData, int imgWidth, int imgHeight, int imgChannels, int imgWidthStep, int padDivisor=32);
// Layer primitives: each takes its input blob(s) and filters by const
// reference and returns a new blob by value. do_relu defaults to true.
CDataBlob<float> convolution(const CDataBlob<float>& inputData, const Filters<float>& filters, bool do_relu = true);
CDataBlob<float> convolutionDP(const CDataBlob<float>& inputData,
const Filters<float>& filtersP, const Filters<float>& filtersD, bool do_relu = true);
CDataBlob<float> convolution4layerUnit(const CDataBlob<float>& inputData,
const Filters<float>& filtersP1, const Filters<float>& filtersD1,
const Filters<float>& filtersP2, const Filters<float>& filtersD2, bool do_relu = true);
CDataBlob<float> maxpooling2x2S2(const CDataBlob<float>& inputData);
CDataBlob<float> elementAdd(const CDataBlob<float>& inputData1, const CDataBlob<float>& inputData2);
CDataBlob<float> upsampleX2(const CDataBlob<float>& inputData);
// offset defaults to 0.0f.
CDataBlob<float> meshgrid(int feature_width, int feature_height, int stride, float offset=0.0f);
// The two decoders take their first blob by non-const reference, so they can
// modify it (presumably decoding in place - confirm against the definitions).
// Note that the first parameter of kps_decode is also named bbox_pred.
// TODO implement in SIMD
void bbox_decode(CDataBlob<float>& bbox_pred, const CDataBlob<float>& priors, int stride);
void kps_decode(CDataBlob<float>& bbox_pred, const CDataBlob<float>& priors, int stride);
// Function templates declared here without a definition in this header.
template<typename T>
CDataBlob<T> blob2vector(const CDataBlob<T> &inputData);
template<typename T>
CDataBlob<T> concat3(const CDataBlob<T>& inputData1, const CDataBlob<T>& inputData2, const CDataBlob<T>& inputData3);
// Takes the blob by non-const reference (presumably applied in place).
// TODO implement in SIMD
void sigmoid(CDataBlob<float>& inputData);
// Turns the four network output blobs into a list of FaceRect.
// NOTE(review): the threshold and top-k parameters presumably drive score
// filtering and non-maximum suppression - confirm against the definition.
std::vector<FaceRect> detection_output(const CDataBlob<float>& cls,
const CDataBlob<float>& reg,
const CDataBlob<float>& kps,
const CDataBlob<float>& obj,
float overlap_threshold, float confidence_threshold, int top_k, int keep_top_k);