Module 3_mxrcnn.lib.mx-rcnn.symdata.image
import numpy as np
import cv2
def get_image(roi_rec, short, max_size, mean, std):
    """
    read, resize, transform image, return im_tensor, im_info, gt_boxes
    roi_rec should have keys: ["image", "boxes", "gt_classes", "flipped"]
    0 --- x (width, second dim of im)
    |
    y (height, first dim of im)
    """
    im = imdecode(roi_rec['image'])
    if roi_rec["flipped"]:
        im = im[:, ::-1, :]
    im, im_scale = resize(im, short, max_size)
    height, width = im.shape[:2]
    im_info = np.array([height, width, im_scale], dtype=np.float32)
    im_tensor = transform(im, mean, std)
    # gt boxes: (x1, y1, x2, y2, cls)
    if roi_rec['gt_classes'].size > 0:
        gt_inds = np.where(roi_rec['gt_classes'] != 0)[0]
        gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32)
        gt_boxes[:, 0:4] = roi_rec['boxes'][gt_inds, :]
        gt_boxes[:, 4] = roi_rec['gt_classes'][gt_inds]
        # scale gt_boxes
        gt_boxes[:, 0:4] *= im_scale
    else:
        gt_boxes = np.empty((0, 5), dtype=np.float32)
    return im_tensor, im_info, gt_boxes
def imdecode(image_path):
    """Return BGR image read by opencv"""
    import os
    assert os.path.exists(image_path), image_path + ' not found'
    im = cv2.imread(image_path)
    return im
def resize(im, short, max_size):
    """
    only resize input image to target size and return scale
    :param im: BGR image input by opencv
    :param short: one dimensional size (the short side)
    :param max_size: one dimensional max size (the long side)
    :return: resized image (numpy array) and scale (float)
    """
    im_shape = im.shape
    im_size_min = np.min(im_shape[0:2])
    im_size_max = np.max(im_shape[0:2])
    im_scale = float(short) / float(im_size_min)
    # prevent bigger axis from being more than max_size:
    if np.round(im_scale * im_size_max) > max_size:
        im_scale = float(max_size) / float(im_size_max)
    im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR)
    return im, im_scale
def transform(im, mean, std):
    """
    transform into mxnet tensor:
    reorder channels BGR -> RGB, subtract pixel mean and divide by pixel std
    :param im: [height, width, channel] in BGR
    :param mean: [RGB pixel mean]
    :param std: [RGB pixel std var]
    :return: [channel, height, width] in RGB
    """
    im_tensor = np.zeros((3, im.shape[0], im.shape[1]))
    for i in range(3):
        im_tensor[i, :, :] = (im[:, :, 2 - i] - mean[i]) / std[i]
    return im_tensor
def transform_inverse(im_tensor, mean, std):
    """
    transform from mxnet im_tensor to ordinary RGB image
    im_tensor is limited to one image
    :param im_tensor: [channel, height, width]
    :param mean: [RGB pixel mean]
    :param std: [RGB pixel std var]
    :return: im [height, width, channel(RGB)]
    """
    assert im_tensor.shape[0] == 3
    im = im_tensor.transpose((1, 2, 0))
    im = im * std + mean
    im = im.astype(np.uint8)
    return im
def tensor_vstack(tensor_list, pad=0):
    """
    vertically stack tensors by adding a new axis
    expand dims if only 1 tensor
    :param tensor_list: list of tensor to be stacked vertically
    :param pad: label to pad with
    :return: tensor with max shape
    """
    if len(tensor_list) == 1:
        return tensor_list[0][np.newaxis, :]
    ndim = len(tensor_list[0].shape)
    dimensions = [len(tensor_list)]  # first dim is batch size
    for dim in range(ndim):
        dimensions.append(max([tensor.shape[dim] for tensor in tensor_list]))
    dtype = tensor_list[0].dtype
    if pad == 0:
        all_tensor = np.zeros(tuple(dimensions), dtype=dtype)
    elif pad == 1:
        all_tensor = np.ones(tuple(dimensions), dtype=dtype)
    else:
        all_tensor = np.full(tuple(dimensions), pad, dtype=dtype)
    if ndim == 1:
        for ind, tensor in enumerate(tensor_list):
            all_tensor[ind, :tensor.shape[0]] = tensor
    elif ndim == 2:
        for ind, tensor in enumerate(tensor_list):
            all_tensor[ind, :tensor.shape[0], :tensor.shape[1]] = tensor
    elif ndim == 3:
        for ind, tensor in enumerate(tensor_list):
            all_tensor[ind, :tensor.shape[0], :tensor.shape[1], :tensor.shape[2]] = tensor
    else:
        raise Exception('Sorry, unimplemented.')
    return all_tensor
Functions
def get_image(roi_rec, short, max_size, mean, std)
Read, resize, and transform an image; returns im_tensor, im_info, gt_boxes. roi_rec should have the keys "image", "boxes", "gt_classes", and "flipped". The coordinate origin is the top-left corner: x runs along the width (second dim of im), y along the height (first dim).
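A minimal usage sketch, not taken from the repository; the image path, box, class index, and the mean/std triples below are illustrative assumptions:

import numpy as np

roi_rec = {
    'image': 'data/demo/000001.jpg',    # hypothetical path; imdecode asserts the file exists
    'boxes': np.array([[48, 240, 195, 371]], dtype=np.float32),  # x1, y1, x2, y2
    'gt_classes': np.array([12], dtype=np.int32),                # 0 would mean background
    'flipped': False,
}
im_tensor, im_info, gt_boxes = get_image(
    roi_rec, short=600, max_size=1000,
    mean=(123.68, 116.28, 103.53), std=(58.395, 57.12, 57.375))
print(im_tensor.shape)  # (3, resized_height, resized_width)
print(im_info)          # [resized_height, resized_width, im_scale]
print(gt_boxes)         # one row: scaled x1, y1, x2, y2, class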
def imdecode(image_path)
Return a BGR image read from image_path with OpenCV; asserts that the file exists.
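A minimal sketch; the path is a hypothetical example file:

im = imdecode('data/demo/000001.jpg')   # hypothetical path
print(im.shape, im.dtype)               # (height, width, 3) in BGR order, uint8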
def resize(im, short, max_size)
Resize the image so that its short side becomes short while capping the long side at max_size; return the resized image (numpy array) and the scale factor (float) applied to both axes.
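A worked sketch of the scaling rule, using dummy images of assumed sizes:

import numpy as np

im = np.zeros((375, 500, 3), dtype=np.uint8)        # height=375, width=500
resized, scale = resize(im, short=600, max_size=1000)
print(scale)          # 600 / 375 = 1.6
print(resized.shape)  # (600, 800, 3): long side stays below max_size

wide = np.zeros((375, 1250, 3), dtype=np.uint8)
resized, scale = resize(wide, short=600, max_size=1000)
print(scale)          # capped at 1000 / 1250 = 0.8
print(resized.shape)  # (300, 1000, 3): long side pinned to max_size, short side ends up below 600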
def tensor_vstack(tensor_list, pad=0)
Stack a list of tensors along a new batch axis, padding each tensor out to the per-dimension maximum shape with pad; a single tensor is simply given a batch dimension. Supports 1-, 2-, and 3-dimensional inputs.
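A minimal sketch stacking two label arrays of different lengths (shapes and pad value are illustrative):

import numpy as np

a = np.arange(6, dtype=np.float32).reshape(2, 3)
b = np.arange(3, dtype=np.float32).reshape(1, 3)
batch = tensor_vstack([a, b], pad=-1)
print(batch.shape)  # (2, 2, 3): batch of 2, padded to the max shape
print(batch[1])     # [[0. 1. 2.], [-1. -1. -1.]]: second row is padding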
def transform(im, mean, std)
Transform a [height, width, channel] BGR image into a channel-first tensor: reorder the channels to RGB, subtract the per-channel pixel mean, and divide by the per-channel pixel std. Returns an array of shape [channel, height, width].
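A minimal sketch; the dummy image and the mean/std triples are illustrative assumptions:

import numpy as np

im = np.full((4, 5, 3), 128, dtype=np.uint8)        # dummy BGR image
im_tensor = transform(im, mean=(123.68, 116.28, 103.53), std=(58.395, 57.12, 57.375))
print(im_tensor.shape)  # (3, 4, 5): channel-first, channels reordered BGR -> RGB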
def transform_inverse(im_tensor, mean, std)
Transform a single [channel, height, width] im_tensor back into an ordinary [height, width, channel] RGB uint8 image by multiplying by std and adding mean.
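A minimal round-trip sketch with the same illustrative mean/std as above:

import numpy as np

mean = np.array([123.68, 116.28, 103.53])
std = np.array([58.395, 57.12, 57.375])
im = np.random.randint(0, 256, size=(4, 5, 3), dtype=np.uint8)  # dummy BGR image
rgb = transform_inverse(transform(im, mean, std), mean, std)
print(rgb.shape)                                   # (4, 5, 3), RGB order
print(np.allclose(rgb, im[:, :, ::-1], atol=1))    # True up to uint8 rounding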