10月 31

Caffe Blob源码阅读

Published at Oct 31, 2016 • caffe

本文仅用于整理阅读Caffe源码过程中的记录,所写内容只确保本人看懂

Blob是Caffe中处理和传递实际数据的数据封装包,并且在CPU与GPU之间具有同步处理能力

Blob类成员

Blob是连续存储的N维数组,在 blob.hpp 中定义了Blob的成员变量

protected:  
  shared_ptr<SyncedMemory> data_;   // forward-pass values (see Update(): data_ -= diff_)
  shared_ptr<SyncedMemory> diff_;   // backward-pass gradients
  shared_ptr<SyncedMemory> shape_data_; // per-axis extents as a raw int array (GPU-accessible copy of shape_)
  vector<int> shape_;  // per-axis extents on the host side
  int count_; // number of elements currently in use (product of all extents)
  int capacity_; // number of elements the currently allocated memory can hold

data_ 存储前向传播的数据,diff_ 存储反向传播的梯度;shape_data_ 和 shape_ 记录了每一维度数据的大小

一个Blob最多可以表示32维的数组(由 const int kMaxBlobAxes = 32; 限定)

Blob类相关操作

1.构造函数

// Two construction flavors: the legacy 4-D (num, channels, height, width)
// form and the general N-D shape-vector form.
explicit Blob(const int num, const int channels, const int height,  
      const int width);  
explicit Blob(const vector<int>& shape);

blob.cpp 中,构造函数会调用 Reshape 完成初始化操作,为 data_ 和 diff_ 分配共享内存对象SyncedMemory

// 4-D (num, channels, height, width) constructor; all memory allocation is
// delegated to Reshape().
template <typename Dtype>  
Blob<Dtype>::Blob(const int num, const int channels, const int height,  
    const int width)  
  // capacity_ must be initialized before calling Reshape  
  : capacity_(0) {  
  Reshape(num, channels, height, width);  
}  

// Arbitrary N-D constructor; delegates allocation to Reshape().
template <typename Dtype>  
Blob<Dtype>::Blob(const vector<int>& shape)  
  // capacity_ must be initialized before calling Reshape  
  : capacity_(0) {  
  Reshape(shape);  
}

2.Reshape:修改Blob的大小(尺寸)

// All Reshape overloads funnel into Reshape(const vector<int>&).
void Reshape(const int num, const int channels, const int height,  
      const int width);  
void Reshape(const vector<int>& shape);  
void Reshape(const BlobShape& shape);  
void ReshapeLike(const Blob& other);

所有重载的Reshape函数最终都调用 void Blob&lt;Dtype&gt;::Reshape(const vector&lt;int&gt;&amp; shape)。特别地,当内存空间不足时(count_ > capacity_),会为Blob重新分配内存

// Resize the blob to `shape`, reallocating data_/diff_ only when the new
// element count exceeds the current capacity (the backing memory never
// shrinks here).
template <typename Dtype>  
void Blob<Dtype>::Reshape(const vector<int>& shape) {  
  CHECK_LE(shape.size(), kMaxBlobAxes);  // at most kMaxBlobAxes (32) axes
  count_ = 1;  
  shape_.resize(shape.size());  
  // Grow the GPU-visible copy of the shape if the axis count increased.
  if (!shape_data_ || shape_data_->size() < shape.size() * sizeof(int)) {  
    shape_data_.reset(new SyncedMemory(shape.size() * sizeof(int)));  
  }  
  int* shape_data = static_cast<int*>(shape_data_->mutable_cpu_data());  
  for (int i = 0; i < shape.size(); ++i) {  
    CHECK_GE(shape[i], 0);  
    if (count_ != 0) {  
      // Guard against int overflow of the running element count.
      CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX";  
    }  
    count_ *= shape[i];  // total element count = product of all extents
    shape_[i] = shape[i];  
    shape_data[i] = shape[i];  
  }  
  if (count_ > capacity_) {  
    // Lazy reallocation: only grow; a shrinking reshape reuses old memory.
    capacity_ = count_;  
    data_.reset(new SyncedMemory(capacity_ * sizeof(Dtype)));  
    diff_.reset(new SyncedMemory(capacity_ * sizeof(Dtype)));  
  }  
}

3.offset用于给定坐标,获取Blob相应坐标数据。例如,对于批量图像数据来说,blob常规的维数为图像数量 N * 通道数 K * 图像高度 H * 图像宽度 W。Blob按行为主存储。所以4维blob,坐标为(n,k,h,w)的值在blob中的物理位置为((n * K + k)* H + h)* W + w

// Map a 4-D (n, c, h, w) coordinate to its row-major linear index:
// ((n * K + c) * H + h) * W + w.
// NOTE(review): these bounds checks use CHECK_LE, which permits an index
// equal to its extent (e.g. n == num()), and the CHECK_GE calls test the
// extents rather than c/h/w. This matches upstream Caffe, but CHECK_LT on
// the indices themselves would be the strict range check.
inline int offset(const int n, const int c = 0, const int h = 0,  
      const int w = 0) const {  
    CHECK_GE(n, 0);  
    CHECK_LE(n, num());  
    CHECK_GE(channels(), 0);  
    CHECK_LE(c, channels());  
    CHECK_GE(height(), 0);  
    CHECK_LE(h, height());  
    CHECK_GE(width(), 0);  
    CHECK_LE(w, width());  
    return ((n * channels() + c) * height() + h) * width() + w;  
  }  

  // General N-D version: Horner-style accumulation of the row-major index.
  // Trailing axes may be omitted from `indices`; they contribute offset 0
  // (equivalent to indexing them at 0).
  inline int offset(const vector<int>& indices) const {  
    CHECK_LE(indices.size(), num_axes());  
    int offset = 0;  
    for (int i = 0; i < num_axes(); ++i) {  
      offset *= shape(i);  // shift the accumulated index by this axis' extent
      if (indices.size() > i) {  
        CHECK_GE(indices[i], 0);  
        CHECK_LT(indices[i], shape(i));  
        offset += indices[i];  
      }  
    }  
    return offset;  
  }

4.Blob的Update:参数更新操作

完成梯度下降过程中的参数更新:

$$data_{k+1} = data_k - diff$$

// In-place gradient-descent step: data_ <- data_ - diff_
// (axpy with alpha = -1). Runs on whichever device currently holds the data.
template <typename Dtype>  
void Blob<Dtype>::Update() {  
  // We will perform update based on where the data is located.  
  switch (data_->head()) {  
  case SyncedMemory::HEAD_AT_CPU:  
    // perform computation on CPU  
    caffe_axpy<Dtype>(count_, Dtype(-1),  
        static_cast<const Dtype*>(diff_->cpu_data()),  
        static_cast<Dtype*>(data_->mutable_cpu_data()));  
    break;  
  case SyncedMemory::HEAD_AT_GPU:  
  case SyncedMemory::SYNCED:  
#ifndef CPU_ONLY  
    // perform computation on GPU  
    caffe_gpu_axpy<Dtype>(count_, Dtype(-1),  
        static_cast<const Dtype*>(diff_->gpu_data()),  
        static_cast<Dtype*>(data_->mutable_gpu_data()));  
#else  
    NO_GPU;  
#endif  
    break;  
  default:  
    // UNINITIALIZED data reaching Update() is a programming error.
    LOG(FATAL) << "Syncedmem not initialized.";  
  }  
}

SyncedMemory类

Blob中的data 和 diff 既可以存储在CPU上,也可以存储在GPU上。Blob不用去担心数据的分配以及GPU/CPU同步问题,而交由SyncedMemory去管理

SyncedMemory完成了数据在CPU和GPU上的同步,以及内存的分配和释放操作

在Blob中访问数据:

  • 一种是以 const 方式只读,不改变数值
  • 一种mutable可改变数值的动态方式

最终都是交由 SyncedMemory 的数据访问函数 cpu_data() 和 mutable_cpu_data()

// Read-only CPU pointer to the data; SyncedMemory performs any needed
// GPU->CPU synchronization inside cpu_data().
template <typename Dtype>  
const Dtype* Blob<Dtype>::cpu_data() const {  
  CHECK(data_);  
  // static_cast instead of a C-style cast, consistent with the
  // mutable_cpu_data()/mutable_gpu_data() accessors below.
  return static_cast<const Dtype*>(data_->cpu_data());  
}  

// Read-only GPU pointer to the data; SyncedMemory performs any needed
// CPU->GPU synchronization inside gpu_data().
template <typename Dtype>  
const Dtype* Blob<Dtype>::gpu_data() const {  
  CHECK(data_);  
  // static_cast instead of a C-style cast, consistent with the
  // mutable_cpu_data()/mutable_gpu_data() accessors below.
  return static_cast<const Dtype*>(data_->gpu_data());  
}  

// Writable CPU pointer; presumably marks the CPU copy as the freshest
// (see the sync example below, where a later gpu read triggers a copy).
template <typename Dtype>  
Dtype* Blob<Dtype>::mutable_cpu_data() {  
  CHECK(data_);  
  return static_cast<Dtype*>(data_->mutable_cpu_data());  
}  

// Writable GPU pointer; presumably marks the GPU copy as the freshest
// (see the sync example below, where a later cpu read triggers a copy).
template <typename Dtype>  
Dtype* Blob<Dtype>::mutable_gpu_data() {  
  CHECK(data_);  
  return static_cast<Dtype*>(data_->mutable_gpu_data());  
}

在SyncedMemory中,head_ 表示数据同步状态

// Synchronization state: which side currently holds the freshest data.
enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };

并且通过 to_cpu 和 to_gpu 进行数据同步

// Ensure cpu_ptr_ holds an up-to-date copy of the data, allocating host
// memory and/or copying from the GPU as dictated by the head_ state.
inline void SyncedMemory::to_cpu() {  
  switch (head_) {  
  case UNINITIALIZED:  
    // First touch: allocate zero-filled host memory.
    CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);  
    caffe_memset(size_, 0, cpu_ptr_);  
    head_ = HEAD_AT_CPU;  
    own_cpu_data_ = true;  
    break;  
  case HEAD_AT_GPU:  
#ifndef CPU_ONLY  
    // Lazily allocate the host buffer on first CPU access.
    if (cpu_ptr_ == NULL) {  
      CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);  
      own_cpu_data_ = true;  
    }  
    // Device -> host copy; both sides are now in sync.
    caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);  
    head_ = SYNCED;  
#else  
    NO_GPU;  
#endif  
    break;  
  case HEAD_AT_CPU:  
  case SYNCED:  
    // CPU copy already current: nothing to do.
    break;  
  }  
}  

// Mirror of to_cpu(): ensure gpu_ptr_ holds an up-to-date copy, allocating
// device memory and/or copying from the CPU as dictated by head_.
inline void SyncedMemory::to_gpu() {  
#ifndef CPU_ONLY  
  switch (head_) {  
  case UNINITIALIZED:  
    // First touch: allocate zero-filled device memory on the current device.
    CUDA_CHECK(cudaGetDevice(&gpu_device_));  
    CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));  
    caffe_gpu_memset(size_, 0, gpu_ptr_);  
    head_ = HEAD_AT_GPU;  
    own_gpu_data_ = true;  
    break;  
  case HEAD_AT_CPU:  
    // Lazily allocate the device buffer on first GPU access.
    if (gpu_ptr_ == NULL) {  
      CUDA_CHECK(cudaGetDevice(&gpu_device_));  
      CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));  
      own_gpu_data_ = true;  
    }  
    // Host -> device copy; both sides are now in sync.
    caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);  
    head_ = SYNCED;  
    break;  
  case HEAD_AT_GPU:  
  case SYNCED:  
    // GPU copy already current: nothing to do.
    break;  
  }  
#else  
  NO_GPU;  
#endif  
}

官方有个CPU/GPU数据同步的例子:

// Walk-through of SyncedMemory's lazy synchronization: const accessors copy
// only when the other side holds newer data; mutable accessors mark their
// own side as the freshest, forcing a copy on the next cross-side read.
// Assuming that data are on the CPU initially, and we have a blob.  
const Dtype* foo;  
Dtype* bar;  
foo = blob.gpu_data(); // data copied cpu->gpu.  
foo = blob.cpu_data(); // no data copied since both have up-to-date contents.  
bar = blob.mutable_gpu_data(); // no data copied.  
// ... some operations ...  
bar = blob.mutable_gpu_data(); // no data copied when we are still on GPU.  
foo = blob.cpu_data(); // data copied gpu->cpu, since the gpu side has modified the data  
foo = blob.gpu_data(); // no data copied since both have up-to-date contents  
bar = blob.mutable_cpu_data(); // still no data copied.  
bar = blob.mutable_gpu_data(); // data copied cpu->gpu.  
bar = blob.mutable_cpu_data(); // data copied gpu->cpu.

Blob的序列化

Blob通过Google Protocol Buffers进行相应的序列化操作,相应的函数有

// Protocol Buffers (de)serialization entry points; ToProto is explicitly
// specialized per element type.
void Blob<Dtype>::FromProto(const BlobProto& proto, bool reshape)  
void Blob<double>::ToProto(BlobProto* proto, bool write_diff)  
void Blob<float>::ToProto(BlobProto* proto, bool write_diff)

总结

Blob详细描述了信息是如何在Layer和Net中存储和交换的,是理解Layer和Net的基础


References