#442 · Deep Learning · Easy
Solve on deep-ml.com
Calculate the number of visual tokens produced by a Vision-Language Model (VLM) given an input image resolution and the patch size used by the vision encoder. Optionally account for any spatial downsampling applied after the vision encoder.
def vlm_visual_token_count(
    image_height: int,
    image_width: int,
    patch_size: int,
    downsample_factor: int = 1
) -> dict:
    """Count the patches and visual tokens produced for an image.

    The image is split into a grid of ``patch_size x patch_size`` patches,
    and the grid is then optionally reduced along each spatial axis by
    ``downsample_factor``.

    Args:
        image_height: Height of the input image.
        image_width: Width of the input image.
        patch_size: Side length of one square patch.
        downsample_factor: Reduction applied to each axis of the patch grid
            after the vision encoder. The default of 1 means no downsampling.

    Returns:
        A dict with the keys:
            ``"patches_h"``: patches along the height.
            ``"patches_w"``: patches along the width.
            ``"total_patches"``: ``patches_h * patches_w``.
            ``"visual_tokens"``: tokens remaining after downsampling.

    Raises:
        ZeroDivisionError: If ``patch_size`` or ``downsample_factor`` is 0.

    Notes:
        - Patches per dimension = ``dimension // patch_size``; any remainder
          that does not fill a whole patch is dropped.
        - Downsampling reduces the token count by ``downsample_factor^2``
          when the patch grid divides evenly; otherwise each axis is floored
          independently.
    """
    # Floor division: only whole patches count along each axis.
    patches_h = image_height // patch_size
    patches_w = image_width // patch_size
    total_patches = patches_h * patches_w

    # Downsampling acts on the patch grid per axis, not on the flat count,
    # so each axis is floored separately before multiplying.
    effective_h = patches_h // downsample_factor
    effective_w = patches_w // downsample_factor
    visual_tokens = effective_h * effective_w

    return {
        "patches_h": patches_h,
        "patches_w": patches_w,
        "total_patches": total_patches,
        "visual_tokens": visual_tokens,
    }