Skip to content
2 changes: 2 additions & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@
# Written by Joao Carreira, Pulkit Agrawal and Katerina Fragkiadaki
# --------------------------------------------------------
from . import config
import _init_paths

151 changes: 139 additions & 12 deletions src/pose_video_demo.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python2
# vim: set shiftwidth=1
"""
/************************************************************************
Copyright (c) 2016, Stefan Helmert
Expand All @@ -7,17 +8,79 @@

************************************************************************/
"""
import _init_paths
import cv2
import test_demo as td
import scipy.misc as scm
import numpy as np
import csv
import time, os, sys
import argparse
try:
import _init_py_faster_rcnn_paths
import detectcore
except:
print('No person detector found! - Person detection not useable. Please specify the coordinates where humans appear that should be analyzed.')
import collections
import copy
#try:
#except:
# print('py-faster-rcnn not available - no automatic human detection')
class rectangle_c(object):
    """Rectangle described by a centre point and an extent per axis.

    All four attributes are plain, freely assignable values. Calling the
    constructor without arguments yields the all-zero rectangle, exactly as
    before; the optional arguments merely allow building a populated
    instance in a single expression.

    Attributes:
        x_center: x-coordinate of the rectangle centre.
        y_center: y-coordinate of the rectangle centre.
        x_range: extent of the rectangle along x.
        y_range: extent of the rectangle along y.
    """

    def __init__(self, x_center=0, y_center=0, x_range=0, y_range=0):
        self.x_center = x_center
        self.y_center = y_center
        self.x_range = x_range
        self.y_range = y_range

    def __repr__(self):
        # Readable representation; helps when tracked rectangles are printed.
        return 'rectangle_c(x_center={!r}, y_center={!r}, x_range={!r}, y_range={!r})'.format(
            self.x_center, self.y_center, self.x_range, self.y_range)

def posevideo(input_video_name, output_video_name=None, output_csv_name=None, isGPU=True, deviceId=0, bodyPt=[600, 400]):
def humdet(frame, threshold=0.5):
    """Run the person detector on ``frame`` and return the accepted boxes.

    The module-level detector network ``gnet`` is handed to
    ``detectcore.detect_object`` together with the frame. From the returned
    per-class detections only the entry for the class ``"person"`` is used.
    Each detection row is read as four box coordinates (indices 0 and 2 on
    the x axis, 1 and 3 on the y axis) followed by a score in the last
    column.

    Args:
        frame: image passed unchanged to the detector.
        threshold: minimum score (inclusive) a detection needs to be kept.

    Returns:
        A list of ``rectangle_c`` objects, one per kept detection, holding
        the box centre and the box width/height. The list is empty when
        nothing passes the threshold or when the detector reports no
        ``"person"`` class at all.
    """
    # ``gnet`` is only read here, so no ``global`` declaration is required.
    cls_vec, dets_vec = detectcore.detect_object(gnet, frame)
    try:
        dets = dets_vec[cls_vec.index("person")]
    except ValueError:
        # No "person" class in the detector output: nothing to track.
        return []

    detection_vec = []
    # Boolean mask on the score column selects the confident detections.
    for bbox in dets[dets[:, -1] >= threshold, :4]:
        detection = rectangle_c()
        detection.x_center = (bbox[0] + bbox[2])/2
        detection.y_center = (bbox[1] + bbox[3])/2
        detection.x_range = (bbox[2] - bbox[0])
        detection.y_range = (bbox[3] - bbox[1])
        detection_vec.append(detection)
    return detection_vec



def sameorder(objs, objs_old):
    """Arrange ``objs`` so that each keeps the list slot of its nearest predecessor.

    The result starts as a deep copy of ``objs_old`` and is padded with
    empty ``rectangle_c`` instances until it can hold every element of
    ``objs``. Each new object is then greedily written into the slot of the
    closest (Euclidean distance between centres) old object whose slot has
    not been claimed yet. An object for which no unclaimed old slot is left
    falls back to the slot equal to its own position in ``objs``.

    Args:
        objs: current objects; each needs ``x_center`` and ``y_center``.
        objs_old: objects from the previous step, defining the slot order.

    Returns:
        A new list (inputs are not modified). Slots that no current object
        claimed still hold the copied old object, so an empty ``objs``
        simply returns a copy of ``objs_old``.
    """
    objs_new = copy.deepcopy(objs_old)
    # Pad so that every current object can receive a slot of its own.
    for _ in range(len(objs) - len(objs_new)):
        objs_new.append(rectangle_c())

    objs_set = np.zeros(len(objs_new))
    # Slots claimed in one pass stay claimed; the loop repeats until as many
    # slots are claimed as there are current objects (or all slots are).
    while np.sum(objs_set) < len(objs) and np.sum(objs_set) < len(objs_set):
        for i, obj in enumerate(objs):
            # No artificial distance cap: any finite distance is a candidate.
            dist_min = float('inf')
            idx = i
            for j, obj_old in enumerate(objs_old):
                if objs_set[j]:
                    continue
                dist = np.hypot(obj_old.x_center - obj.x_center,
                                obj_old.y_center - obj.y_center)
                if dist < dist_min:
                    dist_min = dist
                    idx = j
            objs_new[idx] = copy.deepcopy(obj)
            objs_set[idx] = 1
    return objs_new

def posevideo(input_video_name, output_video_name=None, output_csv_name=None, isGPU=True, deviceId=0, bodyPt=[600, 400], iterations=4, fixedScale=False, scaleIdx=0, fbfactor=0.0, thresh=0.9, detinterv=10, bodyPts= [600, 400], maxhumans=4):
""" processing the video """
# Find OpenCV version
global gnet
(major_ver, minor_ver, subminor_ver) = (cv2.__version__).split('.')

ief = td.PoseIEF(isGPU=isGPU, deviceId=deviceId)
Expand All @@ -27,29 +90,70 @@ def posevideo(input_video_name, output_video_name=None, output_csv_name=None, is
if(output_csv_name is not None and '' != output_csv_name):
pose_csv_file = open(output_csv_name, 'w')
pose_csv = csv.writer(pose_csv_file)
pose_csv.writerows([['x_rft', 'y_rft', 'x_rkn', 'y_rkn', 'x_rhp', 'y_rhp', 'x_lhp', 'y_lhp', 'x_lkn', 'y_lkn', 'x_lft', 'y_lft', 'x_plv', 'y_plv', 'x_trx', 'y_trx', 'x_un', 'y_un', 'x_hd', 'y_hd', 'x_rhn', 'y_rhn', 'x_rlb', 'y_rlb', 'x_rsh', 'y_rsh', 'x_lsh', 'y_lsh', 'x_llb', 'y_llb', 'x_lhn', 'y_lhn', 'x_hum', 'y_hum']])
pose_csv.writerows([['no_frm', 'no_prs', 'x_rft', 'y_rft', 'x_rkn', 'y_rkn', 'x_rhp', 'y_rhp', 'x_lhp', 'y_lhp', 'x_lkn', 'y_lkn', 'x_lft', 'y_lft', 'x_plv', 'y_plv', 'x_trx', 'y_trx', 'x_un', 'y_un', 'x_hd', 'y_hd', 'x_rhn', 'y_rhn', 'x_rlb', 'y_rlb', 'x_rsh', 'y_rsh', 'x_lsh', 'y_lsh', 'x_llb', 'y_llb', 'x_lhn', 'y_lhn', 'x_hum', 'y_hum']])
cnt = 0

humans_old = []
currPoses = []
read_bodyPts = True
while(True):
ret, frame = cap.read()
if ret is False:
return
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
if(output_video_name is not None and '' != output_video_name):
if(False == outv.isOpened()):
if(major_ver<3):
if int(major_ver) < 3:
fps = cap.get(cv2.cv.CV_CAP_PROP_FPS)
outv.open(output_video_name, cv2.cv.CV_FOURCC('A', 'P', '4', '1'), fps, (np.size(frame, 1), np.size(frame, 0)), True) #, frame.shape, True)
else:
fps = cap.get(cv2.CAP_PROP_FPS)
outv.open(output_video_name, cv2.VideoWriter_fourcc('A', 'P', '4', '1'), fps, (np.size(frame, 1), np.size(frame, 0)), True) #, frame.shape, True)
pose,_ = ief.predict(frame, bodyPt)
poses = []
if 0 < detinterv:
if 0 == cnt % detinterv:
humans = humdet(frame, thresh)
else:
humans = []
if read_bodyPts:
human = rectangle_c()
for i, val in enumerate(bodyPts):
if 0 == i % 2:
human.x_center = val
else:
human.y_center = val
humans.append(copy.deepcopy(human))
read_bodyPts = False

humans = sameorder(humans, humans_old)
humans = humans[0:maxhumans]
humans_old = humans
cnt += 1
print('Frame number: '+str(cnt))
if(output_csv_name is not None and '' != output_csv_name):
pose_arr = np.append(pose,[])
pose_csv.writerows([pose_arr])
for i, human in enumerate(humans):
bodyPt = [human.x_center, human.y_center]
try:
currPose = currPoses[i]
pose, currPose = ief.predict(frame, bodyPt, False, iterations, fixedScale, scaleIdx, True, currPose, fbfactor)
except:
pose, currPose = ief.predict(frame, bodyPt, False, iterations, fixedScale, scaleIdx)
humans_old[i].x_center = pose[7][0]
humans_old[i].y_center = pose[7][1]
try:
currPoses[i] = currPose
except:
currPoses.append(currPose)
poses.append(pose)
if(output_csv_name is not None and '' != output_csv_name):
pose_arr = np.append([cnt ,i], pose)
pose_csv.writerows([pose_arr])
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
if(output_video_name is not None and '' != output_video_name):
frame = td.vis.plot_pose_stickmodel_cv2mat(frame, pose.squeeze().transpose((1,0)))
for i, pose in enumerate(poses):
frame = td.vis.plot_pose_stickmodel_cv2mat(frame, pose.squeeze().transpose((1,0)), 2, False)
cv2.putText(frame, str(i), (int(humans[i].x_center), int(humans[i].y_center)), cv2.FONT_HERSHEY_PLAIN, 1, (255, 0, 0))
cv2.imshow('stickmodel', frame)
cv2.waitKey(1)
outv.write(frame)
if(output_video_name is not None and '' != output_video_name):
outv.close()
Expand All @@ -62,21 +166,44 @@ def parse_args():
parser.add_argument('--isGPU', dest='isGPU', help='Boolean value that specifies if a GPU should be used for detection - isGPU=False means the network runs on CPU', default=True, type=bool)
parser.add_argument('--deviceId', dest='deviceId', help='Natural value that specifies the number of the GPU which should be used. It starts with 0.', default='0', type=int)
parser.add_argument('--input_video', dest='input_video_name', help='The name of the video which should be analyzed.', default='video/demo.avi', type=str)
default_output_name = (parser.parse_args().input_video_name).rsplit('.', 1)[0]
parser.add_argument('--output_video', dest='output_video_name', help='The name of the video to be newly created containing the stick model.', default=default_output_name+'_PoseIEF.avi', type=str)
parser.add_argument('--output_csv', dest='output_csv_name', help='The name of the csv file to be newly created containing the joint postions.', default=default_output_name+'_PoseIEF.csv', type=str)
parser.add_argument('--output_video', dest='output_video_name', help='The name of the video to be newly created containing the stick model.', default='?', type=str)
parser.add_argument('--output_csv', dest='output_csv_name', help='The name of the csv file to be newly created containing the joint postions.', default='?', type=str)
parser.add_argument('--x_bodyPt', dest='x_bodyPt', help='Natural value that represents the x-coordinate of the pointer telling which human should be analyzed.', default=600, type=int)
parser.add_argument('--y_bodyPt', dest='y_bodyPt', help='Natural value that represents the y-coordinate of the pointer telling which human should be analyzed.', default=400, type=int)
parser.add_argument('--iterations', dest='iterations', help='Natural value that specifies how much IEF iterations per image should be done.', default=4, type=int)
parser.add_argument('--scaleIdx', dest='scaleIdx', help='Natural value that specifies the IEF scaleIdx if fixed scale flag is set.', default=0, type=int)
parser.add_argument('--fbfactor', dest='fbfactor', help='Fractional value between 0.0 and 1.0 that specifies the weight of the last pose of the last image for the current image.', default=0.0, type=float)
parser.add_argument('--fixedScale', dest='fixedScale', help='Boolean value that deactivates the autoscale network netScale. The scale index scaleIdx has to be specified manualy (default=0)', default=False, type=bool)
parser.add_argument('--bodyPts', dest='bodyPts', help='Natural value coordinates of human appearance/starting points for pose detection. Order: x_person_1 y_person_1 x_person_2 ...', default=[600, 400], type=int, nargs='+')
# arguments for human detection
parser.add_argument('--thresh', dest='thresh', help='Fractional value between 0.0 and 1.0 that specifies the selectivity of human detection.', default=0.9, type=float)
parser.add_argument('--detinterv', dest='detinterv', help='Natural value that specifies every how much images human detection is done.', default=10, type=int)
parser.add_argument('--net', dest='net', help='The name of network used for human detection.', default='vgg16', type=str)
parser.add_argument('--maxhumans', dest='maxhumans', help='Natural value that specifies maximum number of tracked humans.', default=4, type=int)


if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)

args = parser.parse_args()
default_output_name = args.input_video_name.rsplit('.', 1)[0]
if '?' == args.output_video_name:
args.output_video_name = default_output_name+'_PoseIEF.avi'
if '?' == args.output_csv_name:
args.output_csv_name = default_output_name+'_PoseIEF.csv'

return args

if __name__ == '__main__':
    # Script entry point: parse the command line, set up the optional person
    # detector and process the video once.
    args = parse_args()
    print('Called with args:')
    print(args)

    if 'detectcore' in globals():
        # Settings record handed to the detector initialisation.
        frargs = argparse.Namespace(demo_net=args.net,
                                    cpu_mode=not args.isGPU,
                                    gpu_id=args.deviceId)
        gnet = detectcore.init(frargs)
        detinterv = args.detinterv
    else:
        # The guarded detector import at the top of the file failed, so
        # there is nothing to initialise. A detection interval of 0 makes
        # posevideo skip humdet and start from the given bodyPts instead.
        gnet = None
        detinterv = 0

    posevideo(args.input_video_name, args.output_video_name, args.output_csv_name,
              isGPU=args.isGPU, deviceId=args.deviceId,
              bodyPt=[args.x_bodyPt, args.y_bodyPt],
              iterations=args.iterations, fixedScale=args.fixedScale,
              scaleIdx=args.scaleIdx, fbfactor=args.fbfactor,
              thresh=args.thresh, detinterv=detinterv,
              bodyPts=args.bodyPts, maxhumans=args.maxhumans)

127 changes: 87 additions & 40 deletions src/test_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
import scipy.io as sio
import scipy.misc as scm
import pdb
import time
import cv2

LIST_SCALES = cfg.SCALE_LAMBDA

Expand Down Expand Up @@ -51,6 +53,7 @@ def get_pose_net(isGPU=True, deviceId=0):
metaData = pickle.load(open(metaFile, 'r'))
return net, metaData


##
# Predicting Poses
class PoseIEF(object):
Expand All @@ -68,73 +71,117 @@ def __init__(self, netScale=None, netPose=None, metaPose=None, cropSz=256, poseI
self.cropSz_ = cropSz
self.poseImSz_ = poseImSz

##
#Predict pose
def predict(self, imName='./test_images/mpii-test-079555750.jpg',
bodyPt=(249,249), returnIm=False):
'''
imName : image file name for which the pose needs to be predicted
bodyPt : A point on the body of the person (torso) for whom the pose
is to be predicted
returnIm: If True, return the image also
'''
cropSz, poseImSz = self.cropSz_, self.poseImSz_
#Read the image
if(isinstance(imName, str)):
im = scm.imread(imName)
else:
im = imName


def calc_scaleIdx_from_bbox(self, width, height, scales=None):
    """Pick the first candidate scale that is smaller than a bounding-box scale.

    Two ratios are formed, crop size divided by ``height`` and crop size
    divided by ``width``. The candidates are walked in their given order
    and the first one lying strictly below either ratio is returned.

    Args:
        width: bounding-box width; must be non-zero.
        height: bounding-box height; must be non-zero.
        scales: candidate scales to search; defaults to ``LIST_SCALES``.

    Returns:
        The matching scale value, or ``None`` when no candidate qualifies.

    Raises:
        ZeroDivisionError: if ``width`` or ``height`` is zero.
    """
    # NOTE(review): despite the name, the scale *value* is returned, not its
    # index into the list -- confirm against the intended callers.
    if scales is None:
        scales = LIST_SCALES
    # float() forces true division; with integer inputs Python 2 would
    # otherwise truncate the ratios (e.g. 256 / 300 == 0).
    cropSz = float(self.cropSz_)
    hscale = cropSz / height
    wscale = cropSz / width
    for s in scales:
        if s < hscale or s < wscale:
            return s
    return None

def proc_fixedScale(self, im, cropSz, poseImSz, bodyPt, scaleIdx):
    """Crop ``im`` around ``bodyPt`` at one fixed, caller-chosen scale.

    The scale is looked up as ``LIST_SCALES[scaleIdx]`` and a copy of the
    image is cropped with ``imu.centered_crop``. From that crop the central
    ``poseImSz`` x ``poseImSz`` window is cut out.

    Args:
        im: input image; a deep copy is passed to the crop helper.
        cropSz: side length requested from ``imu.centered_crop``.
        poseImSz: side length of the central window that is returned.
        bodyPt: point the crop is centred on.
        scaleIdx: index into ``LIST_SCALES``.

    Returns:
        Tuple ``(imScale, xSt, ySt, oPos, oScale, scaleIdx)``: the window
        reshaped to ``(1, poseImSz, poseImSz, 3)``, the x/y offset of that
        window inside the crop, the crop position and crop scale reported
        by ``imu.centered_crop`` (each reshaped to ``(1, 2)``), and the
        scale index that was passed in.
    """
    scale = LIST_SCALES[scaleIdx]
    imScale, scs, crpPos = imu.centered_crop(cropSz, copy.deepcopy(im), bodyPt, scale, returnScale=True)
    oScale = np.array(scs).reshape(1, 2)
    oPos = np.array(crpPos).reshape(1, 2)
    # Floor division keeps the offset an integer usable as a slice bound.
    xSt = ySt = (cropSz - poseImSz) // 2
    xEn, yEn = xSt + poseImSz, ySt + poseImSz
    imScale = imScale[ySt:yEn, xSt:xEn, :].reshape((1, poseImSz, poseImSz, 3))
    return imScale, xSt, ySt, oPos, oScale, scaleIdx



def proc_netScale(self, im, cropSz, poseImSz, bodyPt):
    """Crop ``im`` at every candidate scale and let the scale net pick one.

    A crop centred on ``bodyPt`` is produced for each entry of
    ``LIST_SCALES``. All crops are fed to ``self.netScale_`` in one forward
    pass; the index of the largest value in its ``'fc-op'`` output selects
    the crop that is used. From that crop the central ``poseImSz`` x
    ``poseImSz`` window is cut out. Timing and debug information is
    printed along the way.

    Args:
        im: input image; a deep copy is passed to the crop helper each time.
        cropSz: side length requested from ``imu.centered_crop``.
        poseImSz: side length of the central window that is returned.
        bodyPt: point the crops are centred on.

    Returns:
        Tuple ``(imScale, xSt, ySt, oPos, oScale, scaleIdx)``: the window
        reshaped to ``(1, poseImSz, poseImSz, 3)``, the x/y offset of that
        window inside the crop, the crop position and crop scale reported
        for the selected crop, and the selected scale index.
    """
    # Crop the image at different scales
    t = time.time()
    numScales = len(LIST_SCALES)
    imData = np.zeros((numScales, cropSz, cropSz, 3))
    scData = np.zeros((numScales, 2))
    posData = np.zeros((numScales, 2))
    for i, s in enumerate(LIST_SCALES):
        imData[i], scs, crpPos = imu.centered_crop(cropSz, copy.deepcopy(im), bodyPt, s, returnScale=True)
        scData[i] = np.array(scs).reshape(1, 2)
        posData[i] = np.array(crpPos).reshape(1, 2)
    # format() is applied to the string, not to print's return value, so the
    # line works both as a print statement and as a print function call.
    print('crop time: {:.3f}s'.format(time.time() - t))

    # Use the scale net to find the best scale (a single forward pass)
    t = time.time()
    scaleOp = self.netScale_.forward(blobs=['fc-op'], data=imData)
    print('netScale time: {:.3f}s'.format(time.time() - t))
    scaleIdx = scaleOp['fc-op'].squeeze().argmax()
    # Scale to use to return the image in the original space
    oScale = scData[scaleIdx]
    # Original location of the cropped image
    oPos = posData[scaleIdx]

    # Prepare image for pose prediction
    imScale = imData[scaleIdx]
    print(scaleIdx)
    print(len(imData))
    # Floor division keeps the offset an integer usable as a slice bound.
    xSt = ySt = (cropSz - poseImSz) // 2
    xEn, yEn = xSt + poseImSz, ySt + poseImSz
    imScale = imScale[ySt:yEn, xSt:xEn, :].reshape((1, poseImSz, poseImSz, 3))
    return imScale, xSt, ySt, oPos, oScale, scaleIdx

#Seed pose
currPose = np.zeros((1,17,2,1)).astype(np.float32)
for i in range(16):
currPose[0,i,0] = copy.deepcopy(self.seedPose_[0,i] - xSt)
currPose[0,i,1] = copy.deepcopy(self.seedPose_[1,i] - ySt)
#The marking point is the center of the image
currPose[0, 16, 0] = poseImSz / 2
currPose[0, 16, 1] = poseImSz / 2

def proc_netPose(self, imScale, currPose):
    """Run one pose-refinement step and apply the predicted corrections.

    ``self.netPose_`` is evaluated on the image together with a copy of the
    current pose. Its ``'cls3_fc'`` output is read as 32 values: entries
    0..15 are the x corrections and entries 16..31 the y corrections of the
    first 16 key points. Each correction is multiplied by
    ``self.mxStepSz_`` and added to the matching coordinate. The time spent
    in the forward pass is printed.

    Args:
        imScale: image blob handed to the network as ``image``.
        currPose: pose array indexed as ``[0, keypoint, axis]``; it is
            updated in place. Key points beyond the first 16 are untouched.

    Returns:
        The same ``currPose`` array after the update.
    """
    t = time.time()
    # Dummy labels: the 'label' input is fed with zeros.
    labels = np.zeros((1, 16, 2, 1)).astype(np.float32)
    poseOp = self.netPose_.forward(blobs=['cls3_fc'], image=imScale,
                                   kp_pos=copy.deepcopy(currPose), label=labels)
    # format() is applied to the string, not to print's return value, so the
    # line works both as a print statement and as a print function call.
    print('netPose time: {:.3f}s'.format(time.time() - t))
    kPred = copy.deepcopy(poseOp['cls3_fc'].squeeze())
    for i in range(16):
        dx, dy = kPred[i], kPred[16 + i]
        currPose[0, i, 0] = currPose[0, i, 0] + self.mxStepSz_ * dx
        currPose[0, i, 1] = currPose[0, i, 1] + self.mxStepSz_ * dy
    return currPose
##
#Predict pose
def predict(self, imName='./test_images/mpii-test-079555750.jpg', bodyPt=(249,249), returnIm=False, noIterations=4, fixedScale=False, scaleIdx=None, initialPose=False, currPose=None, loopfactor=1.0):
'''
imName : image file name for which the pose needs to be predicted
bodyPt : A point on the body of the person (torso) for whom the pose
is to be predicted
returnIm: If True, return the image also
'''
tt = time.time()
cropSz, poseImSz = self.cropSz_, self.poseImSz_
#Read the image
if(isinstance(imName, str)):
im = scm.imread(imName)
else:
im = imName

if True == fixedScale:
imScale, xSt, ySt, oPos, oScale, scaleIdx = self.proc_fixedScale(im, cropSz, poseImSz, bodyPt, scaleIdx)
else:
imScale, xSt, ySt, oPos, oScale, scaleIdx = self.proc_netScale(im, cropSz, poseImSz, bodyPt)

#Seed pose
currPose_ = np.zeros((1,17,2,1)).astype(np.float32)
for i in range(16):
currPose_[0,i,0] = copy.deepcopy(self.seedPose_[0,i] - xSt)
currPose_[0,i,1] = copy.deepcopy(self.seedPose_[1,i] - ySt)
#The marking point is the center of the image
currPose_[0, 16, 0] = poseImSz / 2
currPose_[0, 16, 1] = poseImSz / 2

if False == initialPose:
currPose = currPose_
#cv2.imshow('imScale', imScale[0])
#cv2.waitKey(1)
currPose = np.add(np.multiply(currPose, loopfactor), np.multiply(currPose_, 1.0-loopfactor))
#Predict Pose
for step in range(4):
poseOp = self.netPose_.forward(blobs=['cls3_fc'], image=imScale,
kp_pos=copy.deepcopy(currPose), label=labels)
kPred = copy.deepcopy(poseOp['cls3_fc'].squeeze())
for i in range(16):
dx, dy = kPred[i], kPred[16 + i]
currPose[0,i,0] = currPose[0,i,0] + self.mxStepSz_ * dx
currPose[0,i,1] = currPose[0,i,1] + self.mxStepSz_ * dy

for step in range(noIterations):
currPose = self.proc_netPose(imScale, currPose)
#Convert the pose in the original image coordinated
origPose = (currPose.squeeze() + np.array([xSt, ySt]).reshape(1,2)) * oScale + oPos


print('predict time: {:.3f}s').format(time.time() - tt)
if returnIm:
#return origPose, copy.deepcopy(currPose), imScale[0]
return origPose, im
Expand Down
Loading