Perform inference on a single video, or on all videos with a given extension (e.g., mp4) in a folder. Returns a dataset with predicted 2D keypoints and bounding boxes. Currently the keypoints follow the COCO format.

parse_args[source]

parse_args()

def parse_args():
    """Parse the command-line arguments for end-to-end video inference.

    Returns the parsed ``argparse.Namespace``; prints usage and exits with
    status 1 when no arguments are supplied at all.
    """
    parser = argparse.ArgumentParser(description='End-to-end inference')

    # Model configuration file (detectron2 model-zoo path).
    parser.add_argument(
        '--cfg', dest='cfg', type=str, default=None,
        help='cfg model file (/path/to/model_config.yaml)',
    )
    # Where the resulting .npz files are written.
    parser.add_argument(
        '--output-dir', dest='output_dir', type=str, default='/tmp/infer_simple',
        help='directory for visualization pdfs (default: /tmp/infer_simple)',
    )
    # Extension used to discover videos when a folder is given.
    parser.add_argument(
        '--image-ext', dest='image_ext', type=str, default='mp4',
        help='image file name extension (default: mp4)',
    )
    # Positional: a single video file or a folder of videos.
    parser.add_argument(
        'im_or_folder', default=None, help='image or folder of images'
    )

    # With no arguments at all, show usage instead of an argparse error.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    return parser.parse_args()

get_resolution[source]

get_resolution(filename)

Returns the width and height for a given video file.

read_video[source]

read_video(filename)

Loads a given video file and returns it as a data generator.

def read_video(filename):
    """Loads a given video file and returns it as a data generator.

    Decodes ``filename`` with ffmpeg and yields one frame at a time as a
    ``(h, w, 3)`` uint8 array in BGR channel order (``-pix_fmt bgr24``).

    Fix: the subprocess pipe is now closed and the ffmpeg process reaped in
    a ``finally`` block, so no file descriptor / zombie process is leaked
    even when the consumer abandons the generator early.
    """
    w, h = get_resolution(filename)

    command = [
        'ffmpeg',
        '-hide_banner',
        '-i', filename,
        '-f', 'image2pipe',
        '-pix_fmt', 'bgr24',
        '-vsync', '0',
        '-vcodec', 'rawvideo', '-'
    ]

    pipe = sp.Popen(command, stdout=sp.PIPE, bufsize=-1)
    try:
        while True:
            # One raw BGR frame is exactly w*h*3 bytes.
            data = pipe.stdout.read(w * h * 3)
            if not data:
                break
            yield np.frombuffer(data, dtype='uint8').reshape((h, w, 3))
    finally:
        # Closing stdout makes ffmpeg exit (broken pipe) if we stopped
        # early; wait() reaps the child so it does not linger as a zombie.
        pipe.stdout.close()
        pipe.wait()

main[source]

main(args)

Runs inference on the video files and saves the resulting dataset in .npz format. Predicts the bounding boxes and the COCO keypoints.

def main(args):
    """
    Runs inference on the video files and saves the dataset in .npz file format.
    Predicts the bounding boxes and the COCO keypoints for every frame.

    For each input video, writes ``<output_dir>/<video_basename>.npz``
    containing per-frame boxes, segments (always None), keypoints, and the
    video resolution metadata.
    """
    # Create a detectron2 config and a detectron2 DefaultPredictor to run inference on video.
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(args.cfg))
    # Fix: the config key is SCORE_THRESH_TEST; the previous misspelling
    # (SCORE_TRESH_TEST) meant the 0.7 threshold was never actually applied.
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7  # Set threshold for this model.
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(args.cfg)
    predictor = DefaultPredictor(cfg)

    # Load the video folder in which we should predict.
    if os.path.isdir(args.im_or_folder):
        im_list = glob.iglob(args.im_or_folder + '/*.' + args.image_ext)
    else:
        im_list = [args.im_or_folder]

    # Fix: make sure the output directory exists before writing results.
    os.makedirs(args.output_dir, exist_ok=True)

    for video_name in im_list:
        out_name = os.path.join(args.output_dir, os.path.basename(video_name))
        print("Processing {}".format(video_name))

        # Initialize results:
        boundary_boxes = []
        segments = []  # Mimics Detectron1 format; every entry is None.
        keypoints = []
        metadata = None  # Filled from the last decoded frame's shape.
        for frame_i, im in enumerate(read_video(video_name)):
            t = time.time()
            outputs = predictor(im)['instances'].to('cpu')
            print("Frame {} processed in {:.3f}s".format(frame_i, time.time()-t))

            # Checks if image is "empty or not".
            has_bbox = False
            if outputs.has('pred_boxes'):
                bbox_tensor = outputs.pred_boxes.tensor.numpy()
                if len(bbox_tensor) > 0:
                    has_bbox = True
                    # Append the detection score as a 5th column: (x1, y1, x2, y2, score).
                    scores = outputs.scores.numpy()[:, None]
                    bbox_tensor = np.concatenate((bbox_tensor, scores), axis=1)

            if has_bbox:
                kps = outputs.pred_keypoints.numpy()
                kps_xy = kps[:, :, :2]
                kps_prob = kps[:, :, 2:3]
                kps_logit = np.zeros_like(kps_prob)  # Dummy variable.
                # Rearrange to (num_people, 4, num_keypoints): x, y, logit, prob.
                kps = np.concatenate((kps_xy, kps_logit, kps_prob), axis=2)
                kps = kps.transpose(0, 2, 1)
            else:
                kps = []
                bbox_tensor = []

            # Mimic Detectron1 format: class 0 (background) is empty.
            cls_boxes = [[], bbox_tensor]
            cls_keyps = [[], kps]

            boundary_boxes.append(cls_boxes)
            segments.append(None)
            keypoints.append(cls_keyps)

            # Video resolution (width, height) from the current frame.
            metadata = {
                'w': im.shape[1],
                'h': im.shape[0],
            }

        # Fix: guard against videos that decode to zero frames — previously
        # `im` would be unbound here and raise a NameError.
        if metadata is None:
            print("No frames decoded for {}; skipping.".format(video_name))
            continue

        np.savez_compressed(
            out_name, boxes=boundary_boxes, segments=segments,
            keypoints=keypoints, metadata=metadata
        )