# Patch overview (documentation only: this text precedes the first
# "diff --git" header and is not part of any hunk).
#
# Target code is Python 2 (see the `#!/usr/bin/env python2` context line).
#
# Files touched:
#   src/__init__.py
#     - adds `import _init_paths` after `from . import config`.
#   src/pose_video_demo.py
#     - adds a vim modeline and `import _init_paths`, `collections`, `copy`.
#     - imports `_init_py_faster_rcnn_paths` and `detectcore` inside a
#       try/except that only prints a message on failure.
#     - adds class `rectangle_c` (x_center, y_center, x_range, y_range).
#     - adds `humdet(frame, threshold=0.5)`: runs `detectcore.detect_object`
#       on the global `gnet` and returns one `rectangle_c` per "person"
#       detection whose last column (used as the score) is >= threshold.
#     - adds `sameorder(objs, objs_old)`: pairs entries of `objs` with slots
#       of `objs_old` using the Euclidean distance between centres.
#     - `posevideo` gains `iterations`, `fixedScale`, `scaleIdx`, `fbfactor`,
#       `thresh`, `detinterv`, `bodyPts`, `maxhumans`; it now predicts one
#       pose per tracked human, prefixes CSV rows with `no_frm`/`no_prs`,
#       compares `int(major_ver)` instead of the raw string, and shows each
#       frame with `cv2.imshow`.
#     - `parse_args` gains `--iterations`, `--scaleIdx`, `--fbfactor`,
#       `--fixedScale`, `--bodyPts`, `--thresh`, `--detinterv`, `--net`,
#       `--maxhumans`; output names default to '?' and are derived from the
#       input video name after parsing.
#     - `__main__` builds `frargs` and stores `detectcore.init(frargs)` in
#       the global `gnet` before calling `posevideo`.
#   src/test_demo.py
#     - adds `import time`, `import cv2`.
#     - splits `PoseIEF.predict` into `proc_fixedScale`, `proc_netScale`
#       and `proc_netPose`, adds `calc_scaleIdx_from_bbox`, and adds timing
#       prints.
#     - `predict` gains `noIterations`, `fixedScale`, `scaleIdx`,
#       `initialPose`, `currPose`, `loopfactor`.
#   src/utils/visualization.py
#     - `plot_pose_stickmodel_cv2mat` gains `isRGB=True`; the RGB->BGR
#       conversion now runs only when it is set.
#
# NOTE(review): `--fixedScale` (like the existing `--isGPU`) uses
#   `type=bool`; argparse passes the raw string to `bool()`, so any
#   non-empty value, including "False", parses as True.
# NOTE(review): `calc_scaleIdx_from_bbox` returns the scale value `s`, not
#   the loop index `i`, although its name says "Idx" - confirm the intent.
# NOTE(review): the detector import is wrapped in a bare `except:`, yet
#   `__main__` and `humdet` use `detectcore` without a guard; a failed
#   import would presumably surface later as a NameError - verify.
# NOTE(review): in this copy of the patch the line breaks and indentation
#   inside the hunks have been collapsed; restore them from the original
#   commit before trying to apply it.
diff --git a/src/__init__.py b/src/__init__.py index ac5e7c8..6a049da 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -5,3 +5,5 @@ # Written by Joao Carreira, Pulkit Agrawal and Katerina Fragkiadki # -------------------------------------------------------- from . import config +import _init_paths + diff --git a/src/pose_video_demo.py b/src/pose_video_demo.py index 7b82d96..a2a39f0 100755 --- a/src/pose_video_demo.py +++ b/src/pose_video_demo.py @@ -1,4 +1,5 @@ #!/usr/bin/env python2 +# vim: set shiftwidth=1 """ /************************************************************************ Copyright (c) 2016, Stefan Helmert @@ -7,6 +8,7 @@ ************************************************************************/ """ +import _init_paths import cv2 import test_demo as td import scipy.misc as scm @@ -14,10 +16,71 @@ import csv import time, os, sys import argparse +try: + import _init_py_faster_rcnn_paths + import detectcore +except: + print('No person detector found! - Person detection not useable. 
Please specify the coordinates where humans appear that should be analyzed.') +import collections +import copy +#try: +#except: +# print('py-faster-rcnn not available - no automatic human detection') +class rectangle_c: + def __init__(self): + self.x_center = 0 + self.y_center = 0 + self.x_range = 0 + self.y_range = 0 -def posevideo(input_video_name, output_video_name=None, output_csv_name=None, isGPU=True, deviceId=0, bodyPt=[600, 400]): +def humdet(frame, threshold=0.5): + global gnet + cls_vec, dets_vec = detectcore.detect_object(gnet, frame) + person = dets_vec[cls_vec.index("person")] + human_vec = [] + score_vec = [] + detection_vec = [] + det_world_vec = [] + dets = person + thresh = threshold + inds = np.where(dets[:, -1] >= thresh)[0] + for i in inds: + bbox = dets[i, :4] + score = dets[i, -1] + score_vec.append(score) + detection = rectangle_c() + detection.x_center = (bbox[0] + bbox[2])/2 + detection.y_center = (bbox[1] + bbox[3])/2 + detection.x_range = (bbox[2] - bbox[0]) + detection.y_range = (bbox[3] - bbox[1]) + detection_vec.append(detection) + return detection_vec + + + +def sameorder(objs, objs_old): + objs_new = copy.deepcopy(objs_old) + while len(objs_new) < len(objs): + objs_new.append(rectangle_c()) + objs_set = np.zeros(len(objs_new)) + while np.sum(objs_set) < len(objs) and np.sum(objs_set) < len(objs_set): + for i, obj in enumerate(objs): + dist_min = 10000000 + idx = i + for j, obj_old in enumerate(objs_old): + if 0 == objs_set[j]: + dist = np.sqrt(np.power(obj_old.x_center - obj.x_center, 2) + np.power(obj_old.y_center - obj.y_center, 2)) + if dist < dist_min: + dist_min = dist + idx = j + objs_new[idx] = copy.deepcopy(obj) + objs_set[idx] = 1 + return objs_new + +def posevideo(input_video_name, output_video_name=None, output_csv_name=None, isGPU=True, deviceId=0, bodyPt=[600, 400], iterations=4, fixedScale=False, scaleIdx=0, fbfactor=0.0, thresh=0.9, detinterv=10, bodyPts= [600, 400], maxhumans=4): """ processing the video """ # Find 
OpenCV version + global gnet (major_ver, minor_ver, subminor_ver) = (cv2.__version__).split('.') ief = td.PoseIEF(isGPU=isGPU, deviceId=deviceId) @@ -27,8 +90,12 @@ def posevideo(input_video_name, output_video_name=None, output_csv_name=None, is if(output_csv_name is not None and '' != output_csv_name): pose_csv_file = open(output_csv_name, 'w') pose_csv = csv.writer(pose_csv_file) - pose_csv.writerows([['x_rft', 'y_rft', 'x_rkn', 'y_rkn', 'x_rhp', 'y_rhp', 'x_lhp', 'y_lhp', 'x_lkn', 'y_lkn', 'x_lft', 'y_lft', 'x_plv', 'y_plv', 'x_trx', 'y_trx', 'x_un', 'y_un', 'x_hd', 'y_hd', 'x_rhn', 'y_rhn', 'x_rlb', 'y_rlb', 'x_rsh', 'y_rsh', 'x_lsh', 'y_lsh', 'x_llb', 'y_llb', 'x_lhn', 'y_lhn', 'x_hum', 'y_hum']]) + pose_csv.writerows([['no_frm', 'no_prs', 'x_rft', 'y_rft', 'x_rkn', 'y_rkn', 'x_rhp', 'y_rhp', 'x_lhp', 'y_lhp', 'x_lkn', 'y_lkn', 'x_lft', 'y_lft', 'x_plv', 'y_plv', 'x_trx', 'y_trx', 'x_un', 'y_un', 'x_hd', 'y_hd', 'x_rhn', 'y_rhn', 'x_rlb', 'y_rlb', 'x_rsh', 'y_rsh', 'x_lsh', 'y_lsh', 'x_llb', 'y_llb', 'x_lhn', 'y_lhn', 'x_hum', 'y_hum']]) cnt = 0 + + humans_old = [] + currPoses = [] + read_bodyPts = True while(True): ret, frame = cap.read() if ret is False: @@ -36,20 +103,57 @@ def posevideo(input_video_name, output_video_name=None, output_csv_name=None, is frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) if(output_video_name is not None and '' != output_video_name): if(False == outv.isOpened()): - if(major_ver<3): + if int(major_ver) < 3: fps = cap.get(cv2.cv.CV_CAP_PROP_FPS) outv.open(output_video_name, cv2.cv.CV_FOURCC('A', 'P', '4', '1'), fps, (np.size(frame, 1), np.size(frame, 0)), True) #, frame.shape, True) else: fps = cap.get(cv2.CAP_PROP_FPS) outv.open(output_video_name, cv2.VideoWriter_fourcc('A', 'P', '4', '1'), fps, (np.size(frame, 1), np.size(frame, 0)), True) #, frame.shape, True) - pose,_ = ief.predict(frame, bodyPt) + poses = [] + if 0 < detinterv: + if 0 == cnt % detinterv: + humans = humdet(frame, thresh) + else: + humans = [] + if read_bodyPts: 
+ human = rectangle_c() + for i, val in enumerate(bodyPts): + if 0 == i % 2: + human.x_center = val + else: + human.y_center = val + humans.append(copy.deepcopy(human)) + read_bodyPts = False + + humans = sameorder(humans, humans_old) + humans = humans[0:maxhumans] + humans_old = humans cnt += 1 print('Frame number: '+str(cnt)) - if(output_csv_name is not None and '' != output_csv_name): - pose_arr = np.append(pose,[]) - pose_csv.writerows([pose_arr]) + for i, human in enumerate(humans): + bodyPt = [human.x_center, human.y_center] + try: + currPose = currPoses[i] + pose, currPose = ief.predict(frame, bodyPt, False, iterations, fixedScale, scaleIdx, True, currPose, fbfactor) + except: + pose, currPose = ief.predict(frame, bodyPt, False, iterations, fixedScale, scaleIdx) + humans_old[i].x_center = pose[7][0] + humans_old[i].y_center = pose[7][1] + try: + currPoses[i] = currPose + except: + currPoses.append(currPose) + poses.append(pose) + if(output_csv_name is not None and '' != output_csv_name): + pose_arr = np.append([cnt ,i], pose) + pose_csv.writerows([pose_arr]) + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) if(output_video_name is not None and '' != output_video_name): - frame = td.vis.plot_pose_stickmodel_cv2mat(frame, pose.squeeze().transpose((1,0))) + for i, pose in enumerate(poses): + frame = td.vis.plot_pose_stickmodel_cv2mat(frame, pose.squeeze().transpose((1,0)), 2, False) + cv2.putText(frame, str(i), (int(humans[i].x_center), int(humans[i].y_center)), cv2.FONT_HERSHEY_PLAIN, 1, (255, 0, 0)) + cv2.imshow('stickmodel', frame) + cv2.waitKey(1) outv.write(frame) if(output_video_name is not None and '' != output_video_name): outv.close() @@ -62,21 +166,44 @@ def parse_args(): parser.add_argument('--isGPU', dest='isGPU', help='Boolean value that specifies if a GPU should be used for detection - isGPU=False means the network runs on CPU', default=True, type=bool) parser.add_argument('--deviceId', dest='deviceId', help='Natural value that specifies the number 
of the GPU which should be used. It starts with 0.', default='0', type=int) parser.add_argument('--input_video', dest='input_video_name', help='The name of the video which should be analyzed.', default='video/demo.avi', type=str) - default_output_name = (parser.parse_args().input_video_name).rsplit('.', 1)[0] - parser.add_argument('--output_video', dest='output_video_name', help='The name of the video to be newly created containing the stick model.', default=default_output_name+'_PoseIEF.avi', type=str) - parser.add_argument('--output_csv', dest='output_csv_name', help='The name of the csv file to be newly created containing the joint postions.', default=default_output_name+'_PoseIEF.csv', type=str) + parser.add_argument('--output_video', dest='output_video_name', help='The name of the video to be newly created containing the stick model.', default='?', type=str) + parser.add_argument('--output_csv', dest='output_csv_name', help='The name of the csv file to be newly created containing the joint postions.', default='?', type=str) parser.add_argument('--x_bodyPt', dest='x_bodyPt', help='Natural value that represents the x-coordinate of the pointer telling which human should be analyzed.', default=600, type=int) parser.add_argument('--y_bodyPt', dest='y_bodyPt', help='Natural value that represents the y-coordinate of the pointer telling which human should be analyzed.', default=400, type=int) + parser.add_argument('--iterations', dest='iterations', help='Natural value that specifies how much IEF iterations per image should be done.', default=4, type=int) + parser.add_argument('--scaleIdx', dest='scaleIdx', help='Natural value that specifies the IEF scaleIdx if fixed scale flag is set.', default=0, type=int) + parser.add_argument('--fbfactor', dest='fbfactor', help='Fractional value between 0.0 and 1.0 that specifies the weight of the last pose of the last image for the current image.', default=0.0, type=float) + parser.add_argument('--fixedScale', dest='fixedScale', 
help='Boolean value that deactivates the autoscale network netScale. The scale index scaleIdx has to be specified manualy (default=0)', default=False, type=bool) + parser.add_argument('--bodyPts', dest='bodyPts', help='Natural value coordinates of human appearance/starting points for pose detection. Order: x_person_1 y_person_1 x_person_2 ...', default=[600, 400], type=int, nargs='+') + # arguments for human detection + parser.add_argument('--thresh', dest='thresh', help='Fractional value between 0.0 and 1.0 that specifies the selectivity of human detection.', default=0.9, type=float) + parser.add_argument('--detinterv', dest='detinterv', help='Natural value that specifies every how much images human detection is done.', default=10, type=int) + parser.add_argument('--net', dest='net', help='The name of network used for human detection.', default='vgg16', type=str) + parser.add_argument('--maxhumans', dest='maxhumans', help='Natural value that specifies maximum number of tracked humans.', default=4, type=int) + + if len(sys.argv) == 1: parser.print_help() sys.exit(1) args = parser.parse_args() + default_output_name = args.input_video_name.rsplit('.', 1)[0] + if '?' == args.output_video_name: + args.output_video_name = default_output_name+'_PoseIEF.avi' + if '?' 
== args.output_csv_name: + args.output_csv_name = default_output_name+'_PoseIEF.csv' + return args if __name__ == '__main__': + global gnet args = parse_args() print('Called with args:') print(args) - posevideo(args.input_video_name, args.output_video_name, args.output_csv_name, isGPU=args.isGPU, deviceId=args.deviceId, bodyPt=[args.x_bodyPt, args.y_bodyPt]) + frargs = collections.namedtuple('args', 'demo_net cpu_mode gpu_id') + frargs.cpu_mode = not args.isGPU + frargs.demo_net = args.net + frargs.gpu_id = args.deviceId + gnet = detectcore.init(frargs) + posevideo(args.input_video_name, args.output_video_name, args.output_csv_name, isGPU=args.isGPU, deviceId=args.deviceId, bodyPt=[args.x_bodyPt, args.y_bodyPt], iterations=args.iterations, fixedScale=args.fixedScale, scaleIdx=args.scaleIdx, fbfactor=args.fbfactor, thresh=args.thresh, detinterv=args.detinterv, bodyPts=args.bodyPts, maxhumans=args.maxhumans) diff --git a/src/test_demo.py b/src/test_demo.py index 2de1539..0b23a02 100644 --- a/src/test_demo.py +++ b/src/test_demo.py @@ -22,6 +22,8 @@ import scipy.io as sio import scipy.misc as scm import pdb +import time +import cv2 LIST_SCALES = cfg.SCALE_LAMBDA @@ -51,6 +53,7 @@ def get_pose_net(isGPU=True, deviceId=0): metaData = pickle.load(open(metaFile, 'r')) return net, metaData + ## # Predicting Poses class PoseIEF(object): @@ -68,73 +71,117 @@ def __init__(self, netScale=None, netPose=None, metaPose=None, cropSz=256, poseI self.cropSz_ = cropSz self.poseImSz_ = poseImSz - ## - #Predict pose - def predict(self, imName='./test_images/mpii-test-079555750.jpg', - bodyPt=(249,249), returnIm=False): - ''' - imName : image file name for which the pose needs to be predicted - bodyPt : A point on the body of the person (torso) for whom the pose - is to be predicted - returnIm: If True, return the image also - ''' - cropSz, poseImSz = self.cropSz_, self.poseImSz_ - #Read the image - if(isinstance(imName, str)): - im = scm.imread(imName) - else: - im = imName - + + def 
calc_scaleIdx_from_bbox(self, width, height): + cropSz = self.cropSz_ + hscale = cropSz / height + wscale = cropSz / width + for i,s in enumerate(LIST_SCALES): + if s < hscale or s < wscale: + return s + + def proc_fixedScale(self, im, cropSz, poseImSz, bodyPt, scaleIdx): + imScale = np.zeros((cropSz, cropSz, 3)) + oScale = np.zeros((2)) + oPos = np.zeros((2)) + scale = LIST_SCALES[scaleIdx] + imScale, scs, crpPos = imu.centered_crop(cropSz, copy.deepcopy(im), bodyPt, scale, returnScale=True) + oScale = np.array(scs).reshape(1,2) + oPos = np.array(crpPos).reshape(1,2) + xSt, ySt = (cropSz - poseImSz)/2, (cropSz - poseImSz)/2 + xEn, yEn = xSt + poseImSz, ySt + poseImSz + imScale = imScale[ySt:yEn, xSt:xEn,:].reshape((1,poseImSz,poseImSz,3)) + return imScale, xSt, ySt, oPos, oScale, scaleIdx + + + + def proc_netScale(self, im, cropSz, poseImSz, bodyPt): #Crop the image at different scales + t = time.time() imData = np.zeros((len(LIST_SCALES), cropSz, cropSz, 3)) scData = np.zeros((len(LIST_SCALES), 2)) posData = np.zeros((len(LIST_SCALES), 2)) for i,s in enumerate(LIST_SCALES): - imData[i], scs, crpPos = imu.centered_crop(cropSz, copy.deepcopy(im), bodyPt, s, - returnScale=True) + imData[i], scs, crpPos = imu.centered_crop(cropSz, copy.deepcopy(im), bodyPt, s, returnScale=True) scData[i] = np.array(scs).reshape(1,2) posData[i] = np.array(crpPos).reshape(1,2) + print('crop time: {:.3f}s').format(time.time() - t) #Use the scale net to find the best scale - scaleOp = self.netScale_.forward(blobs=['fc-op'], data=imData) + t = time.time() + scaleOp = self.netScale_.forward(blobs=['fc-op'], data=imData) + print('netScale time: {:.3f}s').format(time.time() - t) scaleIdx = scaleOp['fc-op'].squeeze().argmax() scale = LIST_SCALES[scaleIdx] #Scale to use to return the image in the original space oScale = scData[scaleIdx] #Original location of the cropped image oPos = posData[scaleIdx] - #Prepare image for pose prediction imScale = imData[scaleIdx] + print(scaleIdx) + 
print(len(imData)) xSt, ySt = (cropSz - poseImSz)/2, (cropSz - poseImSz)/2 xEn, yEn = xSt + poseImSz, ySt + poseImSz imScale = imScale[ySt:yEn, xSt:xEn,:].reshape((1,poseImSz,poseImSz,3)) + return imScale, xSt, ySt, oPos, oScale, scaleIdx - #Seed pose - currPose = np.zeros((1,17,2,1)).astype(np.float32) - for i in range(16): - currPose[0,i,0] = copy.deepcopy(self.seedPose_[0,i] - xSt) - currPose[0,i,1] = copy.deepcopy(self.seedPose_[1,i] - ySt) - #The marking point is the center of the image - currPose[0, 16, 0] = poseImSz / 2 - currPose[0, 16, 1] = poseImSz / 2 - + def proc_netPose(self, imScale, currPose): + t = time.time() #Dummy labels labels = np.zeros((1,16,2,1)).astype(np.float32) + poseOp = self.netPose_.forward(blobs=['cls3_fc'], image=imScale, kp_pos=copy.deepcopy(currPose), label=labels) + print('netPose time: {:.3f}s').format(time.time() - t) + kPred = copy.deepcopy(poseOp['cls3_fc'].squeeze()) + for i in range(16): + dx, dy = kPred[i], kPred[16 + i] + #print(dx, dy) + currPose[0,i,0] = currPose[0,i,0] + self.mxStepSz_ * dx + currPose[0,i,1] = currPose[0,i,1] + self.mxStepSz_ * dy + return currPose + ## + #Predict pose + def predict(self, imName='./test_images/mpii-test-079555750.jpg', bodyPt=(249,249), returnIm=False, noIterations=4, fixedScale=False, scaleIdx=None, initialPose=False, currPose=None, loopfactor=1.0): + ''' + imName : image file name for which the pose needs to be predicted + bodyPt : A point on the body of the person (torso) for whom the pose + is to be predicted + returnIm: If True, return the image also + ''' + tt = time.time() + cropSz, poseImSz = self.cropSz_, self.poseImSz_ + #Read the image + if(isinstance(imName, str)): + im = scm.imread(imName) + else: + im = imName + + if True == fixedScale: + imScale, xSt, ySt, oPos, oScale, scaleIdx = self.proc_fixedScale(im, cropSz, poseImSz, bodyPt, scaleIdx) + else: + imScale, xSt, ySt, oPos, oScale, scaleIdx = self.proc_netScale(im, cropSz, poseImSz, bodyPt) + + #Seed pose + currPose_ = 
np.zeros((1,17,2,1)).astype(np.float32) + for i in range(16): + currPose_[0,i,0] = copy.deepcopy(self.seedPose_[0,i] - xSt) + currPose_[0,i,1] = copy.deepcopy(self.seedPose_[1,i] - ySt) + #The marking point is the center of the image + currPose_[0, 16, 0] = poseImSz / 2 + currPose_[0, 16, 1] = poseImSz / 2 + if False == initialPose: + currPose = currPose_ + #cv2.imshow('imScale', imScale[0]) + #cv2.waitKey(1) + currPose = np.add(np.multiply(currPose, loopfactor), np.multiply(currPose_, 1.0-loopfactor)) #Predict Pose - for step in range(4): - poseOp = self.netPose_.forward(blobs=['cls3_fc'], image=imScale, - kp_pos=copy.deepcopy(currPose), label=labels) - kPred = copy.deepcopy(poseOp['cls3_fc'].squeeze()) - for i in range(16): - dx, dy = kPred[i], kPred[16 + i] - currPose[0,i,0] = currPose[0,i,0] + self.mxStepSz_ * dx - currPose[0,i,1] = currPose[0,i,1] + self.mxStepSz_ * dy - + for step in range(noIterations): + currPose = self.proc_netPose(imScale, currPose) #Convert the pose in the original image coordinated origPose = (currPose.squeeze() + np.array([xSt, ySt]).reshape(1,2)) * oScale + oPos - + + print('predict time: {:.3f}s').format(time.time() - tt) if returnIm: #return origPose, copy.deepcopy(currPose), imScale[0] return origPose, im diff --git a/src/utils/visualization.py b/src/utils/visualization.py index efa2f9d..2adb7dc 100644 --- a/src/utils/visualization.py +++ b/src/utils/visualization.py @@ -13,15 +13,15 @@ print('opencv not available - function plot_pose_stickmodel_cv2mat() will not work') -def plot_pose_stickmodel_cv2mat(im, kpts, lw=3): +def plot_pose_stickmodel_cv2mat(im, kpts, lw=3, isRGB=True): ''' im : image kpts: key points 2 x N, where N is the number of keypoints (x,y) format lw : line width ''' - - im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) + if isRGB: + im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) #Plot the keypoints - this works for MPII style keypoints #Right leg