img

在使用Caffe训练网络的时候,往往会在train的prototxt里对图像有一些预处理操作、例如做resize、对训练数据减去均值等,在实际推理的时候我们还需把输入图像resize到网络输入的大小,必要的时候还需要做图像通道的transpose。在用PyCaffe进行推理的时候,这些操作需要使用caffe的transformer来实现,本文记述prototxt的预处理键值

通过查看Transformer对象的源代码可以知道有哪些方法:

class Transformer:
"""
Transform input for feeding into a Net.
Note: this is mostly for illustrative purposes and it is likely better
to define your own input preprocessing routine for your needs.
Parameters
----------
net : a Net for which the input should be prepared
"""
def __init__(self, inputs):
    self.inputs = inputs #  inputs存的是一个name和shape的字典,比如 {'data':(1,3,256,340)}
    self.transpose = {}  # Transformer中的这些参数全部是字典dict
    self.channel_swap = {}
    self.raw_scale = {}
    self.mean = {}
    self.input_scale = {}
def __check_input(self, in_):
    if in_ not in self.inputs:
        raise Exception('{} is not one of the net inputs: {}'.format(
            in_, self.inputs))
def preprocess(self, in_, data):
    """
    Format input for Caffe:
    - convert to single
    - resize to input dimensions (preserving number of channels)
    - transpose dimensions to K x H x W
    - reorder channels (for instance color to BGR)
    - scale raw input (e.g. from [0, 1] to [0, 255] for ImageNet models)
    - subtract mean
    - scale feature
    Parameters
    ----------
    in_ : name of input blob to preprocess for
    data : (H' x W' x K) ndarray  !!!!输入data必须是单个image,后面有限制
    Returns
    -------
    caffe_in : (K x H x W) ndarray for input to a Net
    """
    self.__check_input(in_)
    caffe_in = data.astype(np.float32, copy=False)
    transpose = self.transpose.get(in_)
    channel_swap = self.channel_swap.get(in_)
    raw_scale = self.raw_scale.get(in_)
    mean = self.mean.get(in_)
    input_scale = self.input_scale.get(in_)
    in_dims = self.inputs[in_][2:] # in_dims即inputs设定的shape的(H,W)
    # 检查输入data的(H,W)是否与in_dims一样,注意这里[:2]限制了输入data只能是单个iamge
    if caffe_in.shape[:2] != in_dims: 
        caffe_in = resize_image(caffe_in, in_dims)
    if transpose is not None:
        caffe_in = caffe_in.transpose(transpose)
    if channel_swap is not None:  # 如果没有transpose第一维不是通道,这里会报错吧。。。
        caffe_in = caffe_in[channel_swap, :, :]
    if raw_scale is not None:
        caffe_in *= raw_scale
    if mean is not None:
        caffe_in -= mean
    if input_scale is not None:
        caffe_in *= input_scale
    return caffe_in
def deprocess(self, in_, data):
    """
    Invert Caffe formatting; see preprocess().
    """
    self.__check_input(in_)
    decaf_in = data.copy().squeeze()
    transpose = self.transpose.get(in_)
    channel_swap = self.channel_swap.get(in_)
    raw_scale = self.raw_scale.get(in_)
    mean = self.mean.get(in_)
    input_scale = self.input_scale.get(in_)
    if input_scale is not None:
        decaf_in /= input_scale
    if mean is not None:
        decaf_in += mean
    if raw_scale is not None:
        decaf_in /= raw_scale
    if channel_swap is not None:
        decaf_in = decaf_in[np.argsort(channel_swap), :, :]
    if transpose is not None:
        decaf_in = decaf_in.transpose(np.argsort(transpose))
    return decaf_in
def set_transpose(self, in_, order):
    """
    Set the input channel order for e.g. RGB to BGR conversion
    as needed for the reference ImageNet model.
    Parameters
    ----------
    in_ : which input to assign this channel order
    order : the order to transpose the dimensions
    """
    self.__check_input(in_)
    if len(order) != len(self.inputs[in_]) - 1:  #注意,这里只比较设定shape的后3维
        raise Exception('Transpose order needs to have the same number of '
                        'dimensions as the input.')
    self.transpose[in_] = order
def set_channel_swap(self, in_, order):
    """
    Set the input channel order for e.g. RGB to BGR conversion
    as needed for the reference ImageNet model.
    N.B. this assumes the channels are the first dimension AFTER transpose.
    Parameters
    ----------
    in_ : which input to assign this channel order
    order : the order to take the channels.
        (2,1,0) maps RGB to BGR for example.
    """
    self.__check_input(in_)
    if len(order) != self.inputs[in_][1]:
        raise Exception('Channel swap needs to have the same number of '
                        'dimensions as the input channels.')
    self.channel_swap[in_] = order
def set_raw_scale(self, in_, scale):
    """
    Set the scale of raw features s.t. the input blob = input * scale.
    While Python represents images in [0, 1], certain Caffe models
    like CaffeNet and AlexNet represent images in [0, 255] so the raw_scale
    of these models must be 255.
    Parameters
    ----------
    in_ : which input to assign this scale factor
    scale : scale coefficient
    """
    self.__check_input(in_)
    self.raw_scale[in_] = scale
def set_mean(self, in_, mean):
    """
    Set the mean to subtract for centering the data.
    Parameters
    ----------
    in_ : which input to assign this mean.
    mean : mean ndarray (input dimensional or broadcastable)
    """
    self.__check_input(in_)
    ms = mean.shape
    if mean.ndim == 1:
        # broadcast channels
        if ms[0] != self.inputs[in_][1]:
            raise ValueError('Mean channels incompatible with input.')
        mean = mean[:, np.newaxis, np.newaxis]
    else:
        # elementwise mean
        if len(ms) == 2:
            ms = (1,) + ms
        if len(ms) != 3:
            raise ValueError('Mean shape invalid')
        if ms != self.inputs[in_][1:]:
            raise ValueError('Mean shape incompatible with input shape.')
    self.mean[in_] = mean
def set_input_scale(self, in_, scale):
    """
    Set the scale of preprocessed inputs s.t. the blob = blob * scale.
    N.B. input_scale is done AFTER mean subtraction and other preprocessing
    while raw_scale is done BEFORE.
    Parameters
    ----------
    in_ : which input to assign this scale factor
    scale : scale coefficient
    """
    self.__check_input(in_)
    self.input_scale[in_] = scale

在Python中有很典型的用法:


import numpy as np
import sys,os
# 设置当前的工作环境在caffe下
caffe_root = '/home/xxx/caffe/' 
# 我们也把caffe/python也添加到当前环境
sys.path.insert(0, caffe_root + 'python')
import caffe
os.chdir(caffe_root)#更换工作目录
# 设置网络结构
net_file=caffe_root + 'models/bvlc_reference_caffenet/deploy.prototxt'
# 添加训练之后的参数
caffe_model=caffe_root + 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'
# 均值文件
mean_file=caffe_root + 'python/caffe/imagenet/ilsvrc_2012_mean.npy'
# 这里对任何一个程序都是通用的,就是处理图片
# 把上面添加的两个变量都作为参数构造一个Net
net = caffe.Net(net_file,caffe_model,caffe.TEST)
# 得到data的形状,这里的图片是默认matplotlib底层加载的
transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
# matplotlib加载的image是像素[0-1],图片的数据格式[weight,high,channels],RGB
# caffe加载的图片需要的是[0-255]像素,数据格式[channels,weight,high],BGR,那么就需要转换 
# channel 放到前面
transformer.set_transpose('data', (2,0,1))
transformer.set_mean('data', np.load(mean_file).mean(1).mean(1))
# 图片像素放大到[0-255]
transformer.set_raw_scale('data', 255) 
# RGB-->BGR 转换
transformer.set_channel_swap('data', (2,1,0))
# 这里才是加载图片
im=caffe.io.load_image(caffe_root+'examples/images/cat.jpg')
# 用上面的transformer.preprocess来处理刚刚加载图片
net.blobs['data'].data[...] = transformer.preprocess('data',im)
#注意,网络开始向前传播啦
out = net.forward()
# 最终的结果: 当前这个图片的属于哪个物体的概率(列表表示)
output_prob = output['prob'][0]
# 找出最大的那个概率
print('predicted class is:', output_prob.argmax())
# 也可以找出前五名的概率
top_inds = output_prob.argsort()[::-1][:5]  
print('probabilities and labels:')
for i in top_inds:
    print(i,output_prob[i], labels[i])

Resize

注意到,Transformer并没有提供resize的接口,其实在:

transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})

这里会自动指定输入图像的shape,之后调用preprocess:

net.blobs['data'].data[...] = transformer.preprocess('data',im)

会自动做resize。

手动指定mean

一般caffe的数据集会提供mean的binary文件,但有的时候一些train的脚本不会使用到这个文件,举个例子,这是一个针对imagenet的train.prototxt:

layer {
    name: "data"
    type: "Data"
    top: "data"
    top: "label"
    include {
        phase: TRAIN
    }
    transform_param {
        mirror: true
        crop_size: 224
        mean_value: 104
        mean_value: 117
        mean_value: 123
    }
    data_param {
    source: "../ilsvrc12_lmdb_shrt_256/ilsvrc12_train_lmdb"
    batch_size: 128
    backend: LMDB
    }
}

这时候该怎么办呢,这部分含义是,对R、G、B通道减去104、117、123,所以需要:

mean = np.ones([3, 224, 224], dtype=np.float)
mean[0,:,:] = 104
mean[1,:,:] = 117
mean[2,:,:] = 123
transformer.set_mean('data', mean)            # subtract the dataset-mean value in each channel    

图像的保存

这里,我推荐大家使用opencv、因为PIL.Image这个库我使用了有很多不方便的地方。

saveim = transformed_image.transpose((1, 2, 0))
cv.imwrite(filepath, saveim)
image = cv.imread(image)
image = image.transpose((2,0,1))