The result from vitpose-plus-huge with the COCO-WholeBody expert (`dataset_index = 5`) is totally wrong: every y-coordinate is negative, so all keypoints land outside the image. Output (truncated):

```
[{'keypoints': array([[ 568.8 , -410.61,    1.2564],
       [ 608.13, -406.3 ,    1.5268],
       [ 556.61, -432.35,    1.4458],
       [ 570.44, -421.37,    1.4584],
       [ 562.61, -417.5 ,    1.7987],
       [ 412.97, -404.38,    1.8151],
       [ 606.61, -436.17,    1.7308],
       [ 601.06, -407.91,    2.178 ],
       [ 422.61, -411.21,    2.2401],
       [ 554.05, -419.9 ,    1.692 ],
       [ 583.79, -412.86,    1.5924],
       [ 575.5 , -439.32,    1.3324],
       [ 592.95, -433.89,    1.4292],
       [ 596.16, -414.88,    1.5972],
       [ 600.63, -448.23,    2.638 ],
       [ 568.6 , -417.18,    1.8483],
       [ 558.62, -445.62,    2.0585]],
```
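COCO-WholeBody defines 133 keypoints per person (17 body + 6 foot + 68 face + 42 hand), yet the dump above has only 17 rows. A quick sanity check looks like this (a sketch: `out` is the list returned by `predict_pose` below, and I have not verified whether this checkpoint actually ships a WholeBody decoder head):

```python
kpts = out[0]["keypoints"]
print(kpts.shape)        # a WholeBody head should give 133 keypoints, not 17
print(kpts[:, 1].min())  # every y value here is negative, i.e. off-image
```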
```python
import logging

import numpy as np
import torch
import torch.nn as nn
from PIL import Image
from transformers import AutoProcessor, VitPoseForPoseEstimation

logger = logging.getLogger(__name__)


class ViTPoseModel(object):
    def __init__(self, device: str | torch.device, model_path=None):
        self.device = torch.device(device)
        self.model_name = "ViTPose+-G (multi-task train, COCO)"
        if model_path is None:
            model_path = "usyd-community/vitpose-plus-huge"
        logger.info(f"loading vitpose model from {model_path}, device={self.device}")
        self.pose_processor = AutoProcessor.from_pretrained(model_path)
        self.pose_model = VitPoseForPoseEstimation.from_pretrained(
            model_path, device_map="cpu"
        ).to(self.device)
        self.model = self._load_model(self.model_name)

    def _load_all_models_once(self) -> None:
        self._load_model(self.model_name)

    def _load_model(self, name: str) -> nn.Module:
        # A single checkpoint backs every model name, so just return it.
        return self.pose_model

    def set_model(self, name: str) -> None:
        if name == self.model_name:
            return
        self.model_name = name
        self.model = self._load_model(name)
    def predict_pose_and_visualize(
        self,
        image: np.ndarray,
        det_results: list[np.ndarray],
        box_score_threshold: float,
        kpt_score_threshold: float,
        vis_dot_radius: int,
        vis_line_thickness: int,
    ) -> tuple[list[dict[str, np.ndarray]], np.ndarray]:
        out = self.predict_pose(image, det_results, box_score_threshold)
        vis = self.visualize_pose_results(
            image, out, kpt_score_threshold, vis_dot_radius, vis_line_thickness
        )
        return out, vis
    def predict_pose(
        self,
        image: np.ndarray,
        det_results: list[np.ndarray],
        box_score_threshold: float = 0.5,
    ) -> list[dict[str, np.ndarray]]:
        pil_img = Image.fromarray(image)
        # The detector returns xyxy boxes; the VitPose processor expects
        # COCO-style xywh, so convert width/height in place.
        boxes = np.asarray(det_results[0], dtype=np.float32)
        boxes[:, 2] = boxes[:, 2] - boxes[:, 0]
        boxes[:, 3] = boxes[:, 3] - boxes[:, 1]
        # for i, box in enumerate(boxes):
        #     x, y, w, h = box[:4]  # take only the first 4 elements
        #     crop = pil_img.crop((int(x), int(y), int(x + w), int(y + h)))
        #     crop.save(f"results/crop_{i}.png")
        inputs = self.pose_processor(pil_img, boxes=[boxes], return_tensors="pt").to(
            self.device
        )
        # print(f'inputs: {inputs}')
        # ViTPose++ has 6 MoE expert heads, one per training dataset:
        # COCO (0), AiC (1), MPII (2), AP-10K (3), APT-36K (4), COCO-WholeBody (5).
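        # An explicit named mapping (a sketch, not in the original code) would
        # make the expert choice self-documenting:
        #   EXPERT_INDEX = {"coco": 0, "aic": 1, "mpii": 2, "ap10k": 3,
        #                   "apt36k": 4, "wholebody": 5}
        #   dataset_index = torch.tensor([EXPERT_INDEX["wholebody"]]).to(self.device)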
        # dataset_index = torch.tensor([0]).to(self.device)
        dataset_index = torch.tensor([5]).to(self.device)
        with torch.no_grad():
            outputs = self.pose_model(**inputs, dataset_index=dataset_index)
        pose = self.pose_processor.post_process_pose_estimation(
            outputs, boxes=[boxes], threshold=box_score_threshold
        )[0]
        final = []
        for p in pose:
            keypoints = p["keypoints"].cpu().numpy()  # shape: (N, 2) or (N, 3)
            scores = p["scores"].cpu().numpy()  # shape: (N,)
            # append the per-keypoint scores as the last column
            keypoints_with_scores = np.concatenate([keypoints, scores[:, None]], axis=1)
            final.append(
                {
                    "keypoints": keypoints_with_scores,
                    "keypoint_scores": p["scores"].cpu().numpy(),
                    "bbox": p["bbox"].cpu().numpy() if "bbox" in p else None,
                }
            )
        print(f"final keypoints: {final}")
        return final
```
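For reference, this is roughly how I drive it (a minimal sketch: the image path and the single hard-coded box are placeholders, not part of the class above):

```python
import numpy as np
from PIL import Image

model = ViTPoseModel(device="cuda")

# Load a test image as an RGB array, since predict_pose wraps it in a PIL image.
image = np.asarray(Image.open("test.jpg").convert("RGB"))

# One xyxy person box (placeholder coordinates); predict_pose converts it to xywh.
det_results = [np.array([[100.0, 50.0, 300.0, 480.0]], dtype=np.float32)]

out = model.predict_pose(image, det_results, box_score_threshold=0.3)
print(out[0]["keypoints"].shape)  # I get 17 rows (see the dump above), not 133
```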