The result from vitpose-plus-huge with the COCO-WholeBody expert (`dataset_index = 5`) is totally wrong: every y-coordinate is negative, so all keypoints land outside the image. Output (truncated):

```
[{'keypoints': array([[ 568.8 , -410.61,    1.2564],
       [ 608.13, -406.3 ,    1.5268],
       [ 556.61, -432.35,    1.4458],
       [ 570.44, -421.37,    1.4584],
       [ 562.61, -417.5 ,    1.7987],
       [ 412.97, -404.38,    1.8151],
       [ 606.61, -436.17,    1.7308],
       [ 601.06, -407.91,    2.178 ],
       [ 422.61, -411.21,    2.2401],
       [ 554.05, -419.9 ,    1.692 ],
       [ 583.79, -412.86,    1.5924],
       [ 575.5 , -439.32,    1.3324],
       [ 592.95, -433.89,    1.4292],
       [ 596.16, -414.88,    1.5972],
       [ 600.63, -448.23,    2.638 ],
       [ 568.6 , -417.18,    1.8483],
       [ 558.62, -445.62,    2.0585]],
```
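COCO-WholeBody defines 133 keypoints per person (17 body + 6 foot + 68 face + 42 hand), yet the dump above has only 17 rows. A quick sanity check looks like this (a sketch: `out` is the list returned by `predict_pose` below, and I have not verified whether this checkpoint actually ships a WholeBody decoder head):

```python
kpts = out[0]["keypoints"]
print(kpts.shape)        # a WholeBody head should give 133 keypoints, not 17
print(kpts[:, 1].min())  # every y value here is negative, i.e. off-image
```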
```python
import logging

import numpy as np
import torch
import torch.nn as nn
from PIL import Image
from transformers import AutoProcessor, VitPoseForPoseEstimation

logger = logging.getLogger(__name__)


class ViTPoseModel(object):
    def __init__(self, device: str | torch.device, model_path=None):
        self.device = torch.device(device)
        self.model_name = "ViTPose+-G (multi-task train, COCO)"
        if model_path is None:
            model_path = "usyd-community/vitpose-plus-huge"
        logger.info(f"loading vitpose model from {model_path}, device={self.device}")
        self.pose_processor = AutoProcessor.from_pretrained(model_path)
        self.pose_model = VitPoseForPoseEstimation.from_pretrained(
            model_path, device_map="cpu"
        ).to(self.device)
        self.model = self._load_model(self.model_name)

    def _load_all_models_once(self) -> None:
        self._load_model(self.model_name)

    def _load_model(self, name: str) -> nn.Module:
        # A single checkpoint backs every model name, so just return it.
        return self.pose_model

    def set_model(self, name: str) -> None:
        if name == self.model_name:
            return
        self.model_name = name
        self.model = self._load_model(name)
    def predict_pose_and_visualize(
        self,
        image: np.ndarray,
        det_results: list[np.ndarray],
        box_score_threshold: float,
        kpt_score_threshold: float,
        vis_dot_radius: int,
        vis_line_thickness: int,
    ) -> tuple[list[dict[str, np.ndarray]], np.ndarray]:
        out = self.predict_pose(image, det_results, box_score_threshold)
        vis = self.visualize_pose_results(
            image, out, kpt_score_threshold, vis_dot_radius, vis_line_thickness
        )
        return out, vis
    def predict_pose(
        self,
        image: np.ndarray,
        det_results: list[np.ndarray],
        box_score_threshold: float = 0.5,
    ) -> list[dict[str, np.ndarray]]:
        pil_img = Image.fromarray(image)
        # The detector returns xyxy boxes; the VitPose processor expects
        # COCO-style xywh, so convert width/height in place.
        boxes = np.asarray(det_results[0], dtype=np.float32)
        boxes[:, 2] = boxes[:, 2] - boxes[:, 0]
        boxes[:, 3] = boxes[:, 3] - boxes[:, 1]
        # for i, box in enumerate(boxes):
        #     x, y, w, h = box[:4]  # take only the first 4 elements
        #     crop = pil_img.crop((int(x), int(y), int(x + w), int(y + h)))
        #     crop.save(f"results/crop_{i}.png")
        inputs = self.pose_processor(pil_img, boxes=[boxes], return_tensors="pt").to(
            self.device
        )
        # print(f'inputs: {inputs}')
        # ViTPose++ has 6 MoE expert heads, one per training dataset:
        # COCO (0), AiC (1), MPII (2), AP-10K (3), APT-36K (4), COCO-WholeBody (5).
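        # An explicit named mapping (a sketch, not in the original code) would
        # make the expert choice self-documenting:
        #   EXPERT_INDEX = {"coco": 0, "aic": 1, "mpii": 2, "ap10k": 3,
        #                   "apt36k": 4, "wholebody": 5}
        #   dataset_index = torch.tensor([EXPERT_INDEX["wholebody"]]).to(self.device)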
        # dataset_index = torch.tensor([0]).to(self.device)
        dataset_index = torch.tensor([5]).to(self.device)
        with torch.no_grad():
            outputs = self.pose_model(**inputs, dataset_index=dataset_index)
        pose = self.pose_processor.post_process_pose_estimation(
            outputs, boxes=[boxes], threshold=box_score_threshold
        )[0]
        final = []
        for p in pose:
            keypoints = p["keypoints"].cpu().numpy()  # shape: (N, 2) or (N, 3)
            scores = p["scores"].cpu().numpy()  # shape: (N,)
            # append the per-keypoint scores as the last column
            keypoints_with_scores = np.concatenate([keypoints, scores[:, None]], axis=1)
            final.append(
                {
                    "keypoints": keypoints_with_scores,
                    "keypoint_scores": p["scores"].cpu().numpy(),
                    "bbox": p["bbox"].cpu().numpy() if "bbox" in p else None,
                }
            )
        print(f"final keypoints: {final}")
        return final
```
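For reference, this is roughly how I drive it (a minimal sketch: the image path and the single hard-coded box are placeholders, not part of the class above):

```python
import numpy as np
from PIL import Image

model = ViTPoseModel(device="cuda")

# Load a test image as an RGB array, since predict_pose wraps it in a PIL image.
image = np.asarray(Image.open("test.jpg").convert("RGB"))

# One xyxy person box (placeholder coordinates); predict_pose converts it to xywh.
det_results = [np.array([[100.0, 50.0, 300.0, 480.0]], dtype=np.float32)]

out = model.predict_pose(image, det_results, box_score_threshold=0.3)
print(out[0]["keypoints"].shape)  # I get 17 rows (see the dump above), not 133
```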