Models on Hugging Face were all wrong #164

@lucasjinreal

Description

[{'keypoints': array([[ 568.8, -410.61, 1.2564],
[ 608.13, -406.3, 1.5268],
[ 556.61, -432.35, 1.4458],
[ 570.44, -421.37, 1.4584],
[ 562.61, -417.5, 1.7987],
[ 412.97, -404.38, 1.8151],
[ 606.61, -436.17, 1.7308],
[ 601.06, -407.91, 2.178],
[ 422.61, -411.21, 2.2401],
[ 554.05, -419.9, 1.692],
[ 583.79, -412.86, 1.5924],
[ 575.5, -439.32, 1.3324],
[ 592.95, -433.89, 1.4292],
[ 596.16, -414.88, 1.5972],
[ 600.63, -448.23, 2.638],
[ 568.6, -417.18, 1.8483],
[ 558.62, -445.62, 2.0585]],

The vitpose-plus-huge COCO-WholeBody result is totally wrong: every returned keypoint has a negative y coordinate.
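To isolate the problem, here is a minimal standalone sketch based on the usage shown on the usyd-community/vitpose-plus-huge model card ("person.jpg" and the box values are placeholders); if this also yields negative coordinates, the checkpoint rather than the wrapper below would be at fault:

# minimal repro sketch; "person.jpg" and the box values are placeholders
import numpy as np
import torch
from PIL import Image
from transformers import AutoProcessor, VitPoseForPoseEstimation

device = "cuda" if torch.cuda.is_available() else "cpu"
image = Image.open("person.jpg")

processor = AutoProcessor.from_pretrained("usyd-community/vitpose-plus-huge")
model = VitPoseForPoseEstimation.from_pretrained(
    "usyd-community/vitpose-plus-huge"
).to(device)

# one (N, 4) array of boxes per image, in COCO (x, y, w, h) format
person_boxes = np.array([[100.0, 50.0, 200.0, 400.0]], dtype=np.float32)
boxes = [person_boxes]
inputs = processor(image, boxes=boxes, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model(**inputs, dataset_index=torch.tensor([0], device=device))

pose_results = processor.post_process_pose_estimation(outputs, boxes=boxes)[0]
print(pose_results[0]["keypoints"])  # should be positive pixel coordinates inside the box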

import logging

import numpy as np
import torch
import torch.nn as nn
from PIL import Image
from transformers import AutoProcessor, VitPoseForPoseEstimation

logger = logging.getLogger(__name__)


class ViTPoseModel(object):

    def __init__(self, device: str | torch.device, model_path=None):
        self.device = torch.device(device)
        self.model_name = "ViTPose+-G (multi-task train, COCO)"
        if model_path is None:
            model_path = "usyd-community/vitpose-plus-huge"
        logger.info(f"loading vitpose model from {model_path}, device={self.device}")
        self.pose_processor = AutoProcessor.from_pretrained(model_path)
        self.pose_model = VitPoseForPoseEstimation.from_pretrained(
            model_path, device_map="cpu"
        ).to(self.device)
        self.model = self._load_model(self.model_name)

    def _load_all_models_once(self) -> None:
        # _load_model requires a name; pass the current one
        self._load_model(self.model_name)

    def _load_model(self, name: str) -> nn.Module:
        return self.pose_model

    def set_model(self, name: str) -> None:
        if name == self.model_name:
            return
        self.model_name = name
        self.model = self._load_model(name)

    def predict_pose_and_visualize(
        self,
        image: np.ndarray,
        det_results: list[np.ndarray],
        box_score_threshold: float,
        kpt_score_threshold: float,
        vis_dot_radius: int,
        vis_line_thickness: int,
    ) -> tuple[list[dict[str, np.ndarray]], np.ndarray]:

        out = self.predict_pose(image, det_results, box_score_threshold)
        vis = self.visualize_pose_results(
            image, out, kpt_score_threshold, vis_dot_radius, vis_line_thickness
        )
        return out, vis

    def predict_pose(
        self,
        image: np.ndarray,
        det_results: list[np.ndarray],
        box_score_threshold: float = 0.5,
    ) -> list[dict[str, np.ndarray]]:

        pil_img = Image.fromarray(image)

        # copy so the caller's detections are not mutated in place, then
        # convert boxes from (x0, y0, x1, y1) to the (x, y, w, h) format
        # the VitPose processor expects
        boxes = np.array(det_results[0], dtype=np.float32, copy=True)
        boxes[:, 2] = boxes[:, 2] - boxes[:, 0]
        boxes[:, 3] = boxes[:, 3] - boxes[:, 1]

        
        # debug: save each cropped box to verify the detector output
        # for i, box in enumerate(boxes):
        #     x, y, w, h = box[:4]  # take only the first 4 elements
        #     crop = pil_img.crop((int(x), int(y), int(x + w), int(y + h)))
        #     crop.save(f"results/crop_{i}.png")

        inputs = self.pose_processor(pil_img, boxes=[boxes], return_tensors="pt").to(
            self.device
        )
        # print(f'inputs: {inputs}')
        # ViTPose++ has 6 MoE expert heads, one per training dataset:
        # COCO 0, AiC 1, MPII 2, AP-10K 3, APT-36K 4, COCO-WholeBody 5
        # dataset_index = torch.tensor([0]).to(self.device)
        dataset_index = torch.tensor([5]).to(self.device)

        with torch.no_grad():
            outputs = self.pose_model(**inputs, dataset_index=dataset_index)

        pose = self.pose_processor.post_process_pose_estimation(
            outputs, boxes=[boxes], threshold=box_score_threshold
        )[0]

        final = []
        for p in pose:
            keypoints = p["keypoints"].cpu().numpy()  # shape: (N, 2) or (N, 3)
            scores = p["scores"].cpu().numpy()  # shape: (N,)
            # concatenate scores as last column
            keypoints_with_scores = np.concatenate([keypoints, scores[:, None]], axis=1)

            final.append(
                {
                    "keypoints": keypoints_with_scores,
                    "keypoint_scores": p["scores"].cpu().numpy(),
                    "bbox": p["bbox"].cpu().numpy() if "bbox" in p else None,
                }
            )
        print(f"final keypoints: {final}")
        return final
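
A quick sanity check on the wrapper (again with a placeholder image path and a hypothetical xyxy detector box): every keypoint predict_pose returns should land inside the box it came from, so negative y values like the ones pasted above would drive this to roughly 0%:

# usage sketch; "person.jpg" and the xyxy box are hypothetical placeholders
import numpy as np
from PIL import Image

pose = ViTPoseModel(device="cuda")
image = np.asarray(Image.open("person.jpg").convert("RGB"))
det_results = [np.array([[100.0, 50.0, 300.0, 450.0]], dtype=np.float32)]  # xyxy

for person in pose.predict_pose(image, det_results):
    kpts = person["keypoints"]  # columns 0 and 1 are x, y
    x0, y0, x1, y1 = det_results[0][0]
    inside = (
        (kpts[:, 0] >= x0) & (kpts[:, 0] <= x1)
        & (kpts[:, 1] >= y0) & (kpts[:, 1] <= y1)
    )
    print(f"{inside.mean():.0%} of keypoints fall inside the detector box")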
