[{"data":1,"prerenderedAt":412},["ShallowReactive",2],{"navigation":3,"\u002Fproviders\u002Fqwen":99,"\u002Fproviders\u002Fqwen-surround":407},[4,23,44,78],{"title":5,"path":6,"stem":7,"children":8,"icon":22},"Getting Started","\u002Fgetting-started","1.getting-started\u002F1.index",[9,12,17],{"title":10,"path":6,"stem":7,"icon":11},"Introduction","i-lucide-house",{"title":13,"path":14,"stem":15,"icon":16},"Installation","\u002Fgetting-started\u002Finstallation","1.getting-started\u002F2.installation","i-lucide-download",{"title":18,"path":19,"stem":20,"icon":21},"MCP Setup","\u002Fgetting-started\u002Fmcp-setup","1.getting-started\u002F3.mcp-setup","i-lucide-plug","i-lucide-rocket",{"title":24,"icon":25,"path":26,"stem":27,"children":28,"page":43},"CLI","i-lucide-terminal","\u002Fcli","2.cli",[29,33,38],{"title":30,"path":31,"stem":32,"icon":25},"Usage","\u002Fcli\u002Fusage","2.cli\u002F1.usage",{"title":34,"path":35,"stem":36,"icon":37},"Options","\u002Fcli\u002Foptions","2.cli\u002F2.options","i-lucide-sliders-horizontal",{"title":39,"path":40,"stem":41,"icon":42},"Batch & JSON","\u002Fcli\u002Fbatch-json","2.cli\u002F3.batch-json","i-lucide-package",false,{"title":45,"icon":46,"path":47,"stem":48,"children":49,"page":43},"Providers","i-lucide-cpu","\u002Fproviders","3.providers",[50,55,60,65,70,74],{"title":51,"path":52,"stem":53,"icon":54},"Claude (Area-Based)","\u002Fproviders\u002Fclaude","3.providers\u002F1.claude","i-lucide-square",{"title":56,"path":57,"stem":58,"icon":59},"GPT-4o & GPT-5 (Tiling)","\u002Fproviders\u002Fgpt","3.providers\u002F2.gpt","i-lucide-grid-2x2",{"title":61,"path":62,"stem":63,"icon":64},"Gemini (Large Tiles)","\u002Fproviders\u002Fgemini","3.providers\u002F3.gemini","i-lucide-grid-3x3",{"title":66,"path":67,"stem":68,"icon":69},"Llama Vision (Tiles)","\u002Fproviders\u002Fllama","3.providers\u002F4.llama","i-simple-icons-meta",{"title":71,"path":72,"stem":73,"icon":64},"Qwen-VL (Patch Grid)","\u002Fproviders\u002Fqwen","3.providers\u002F5.qwen",{"title":75,"path":76,"stem":77,"icon":59},"DeepSeek-VL (Open Weights)","\u002Fproviders\u002Fdeepseek","3.providers\u002F6.deepseek",{"title":79,"icon":80,"path":81,"stem":82,"children":83,"page":43},"Guides","i-lucide-book-open","\u002Fguides","4.guides",[84,89,94],{"title":85,"path":86,"stem":87,"icon":88},"Python Bindings","\u002Fguides\u002Fpython-bindings","4.guides\u002F1.python-bindings","i-lucide-file-code",{"title":90,"path":91,"stem":92,"icon":93},"Sandbox (Think in Code)","\u002Fguides\u002Fsandbox","4.guides\u002F2.sandbox","i-lucide-flask-conical",{"title":95,"path":96,"stem":97,"icon":98},"Crawler Integration","\u002Fguides\u002Fcrawler-integration","4.guides\u002F3.crawler-integration","i-lucide-globe",{"id":100,"title":71,"body":101,"description":400,"extension":401,"links":402,"meta":403,"navigation":404,"path":72,"seo":405,"stem":73,"__hash__":406},"docs\u002F3.providers\u002F5.qwen.md",{"type":102,"value":103,"toc":393},"minimark",[104,125,130,153,163,181,185,188,207,217,244,263,267,274,337,340,344,389],[105,106,108],"callout",{"color":107,"icon":93},"warning",[109,110,111,115,116,120,121,124],"p",{},[112,113,114],"strong",{},"Experimental connector."," The 28px effective grid and the patch-count formula are stable (",[117,118,119],"code",{},"qwen_vl_utils.smart_resize","). The token clamp defaults shown here are the library defaults; a DashScope endpoint may cap lower via a per-request ",[117,122,123],{},"max_pixels",". Verify against your deployment.",[126,127,129],"h2",{"id":128},"how-qwen-vl-bills-images","How Qwen-VL bills images",[109,131,132,133,136,137,140,141,144,145,148,149,152],{},"Alibaba Qwen2-VL \u002F Qwen2.5-VL use a ",[112,134,135],{},"14px ViT patch"," with a ",[112,138,139],{},"2×2 spatial merge",", giving a ",[112,142,143],{},"28px effective grid"," (",[117,146,147],{},"image_factor = 14 × 2 = 28","). ",[117,150,151],{},"smart_resize"," rounds each side to a multiple of 28; the image token count is the number of merged patches:",[154,155,160],"pre",{"className":156,"code":158,"language":159},[157],"language-text","tokens = (W\u002F28) · (H\u002F28)\n","text",[117,161,158],{"__ignoreMap":162},"",[109,164,165,166,169,170,173,174,177,178,180],{},"bounded by ",[117,167,168],{},"[IMAGE_MIN_TOKEN_NUM, IMAGE_MAX_TOKEN_NUM] = [4, 16384]"," tokens (the ",[117,171,172],{},"qwen_vl_utils"," defaults: ",[117,175,176],{},"max_pixels = 16384 × 28²","). Production DashScope tiers often set a lower ",[117,179,123],{}," per request.",[126,182,184],{"id":183},"does-snapping-help","Does snapping help?",[109,186,187],{},"The patch is small (28px), so spill-over per step is modest — but two effects still pay off:",[189,190,191,198],"ul",{},[192,193,194,197],"li",{},[112,195,196],{},"Snapping to the 28px grid"," removes fractional rows\u002Fcolumns that round up.",[192,199,200,203,204,206],{},[112,201,202],{},"Padding strip"," lowers the pixel area, which directly lowers the patch count until the ",[117,205,123],{}," clamp is hit.",[109,208,209,210,212,213,216],{},"For high-resolution inputs the image is first scaled under ",[117,211,123],{},", so the win is often capped — Qwen's value is more about ",[112,214,215],{},"file size + context budget"," than dramatic token cuts.",[154,218,223],{"className":219,"code":220,"filename":221,"language":222,"meta":162,"style":162},"language-bash shiki shiki-themes material-theme-lighter material-theme material-theme-palenight","vision-squeezer image.png --model qwen\n","Terminal","bash",[117,224,225],{"__ignoreMap":162},[226,227,230,234,238,241],"span",{"class":228,"line":229},"line",1,[226,231,233],{"class":232},"sBMFI","vision-squeezer",[226,235,237],{"class":236},"sfazB"," image.png",[226,239,240],{"class":236}," --model",[226,242,243],{"class":236}," qwen\n",[109,245,246,247,250,251,254,255,258,259,262],{},"CLI aliases: ",[117,248,249],{},"qwen",", ",[117,252,253],{},"qwen-vl",". MCP ",[117,256,257],{},"target_model",": ",[117,260,261],{},"\"qwen\"",".",[126,264,266],{"id":265},"token-savings","Token savings",[109,268,269,270,273],{},"The 28px patch is small, so the lever here is ",[112,271,272],{},"area"," (padding strip), not boundary snapping.",[275,276,277,296],"table",{},[278,279,280],"thead",{},[281,282,283,287,290,293],"tr",{},[284,285,286],"th",{},"Scenario",[284,288,289],{},"Before",[284,291,292],{},"After",[284,294,295],{},"Saved",[297,298,299,320],"tbody",{},[281,300,301,305,310,315],{},[302,303,304],"td",{},"1024×1024 → strip solid border to 896×896",[302,306,307],{},[112,308,309],{},"1,369 tok",[302,311,312],{},[112,313,314],{},"1,024 tok",[302,316,317],{},[112,318,319],{},"−25%",[281,321,322,325,329,334],{},[302,323,324],{},"1024×1024 → 28px grid snap only (1008×1008)",[302,326,327],{},[112,328,309],{},[302,330,331],{},[112,332,333],{},"1,296 tok",[302,335,336],{},"−5%",[109,338,339],{},"Cropping wasted area pays off; pure grid snapping is incremental. For very large inputs the count is also capped at 16,384 tokens, so savings there come from staying under that ceiling.",[126,341,343],{"id":342},"source","Source",[109,345,346,347,349,350,357,358,361,362,364,365,368,369,372,373,376,377,250,382,385,386,262],{},"Grid and clamp taken from the official ",[117,348,119],{}," reference implementation (",[351,352,356],"a",{"href":353,"rel":354},"https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen2.5-VL",[355],"nofollow","QwenLM\u002FQwen2.5-VL",") — ",[117,359,360],{},"image_factor = patch_size(14) × spatial_merge(2) = 28",", bounds ",[117,363,168],{},". The same 28px grid carries across ",[112,366,367],{},"Qwen2-VL, Qwen2.5-VL, and Qwen3-VL"," (identical ",[117,370,371],{},"vision_process.py","). Background: ",[112,374,375],{},"Qwen2.5-VL"," technical report (",[351,378,381],{"href":379,"rel":380},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.13923",[355],"arXiv:2502.13923",[112,383,384],{},"Feb 2025","). Verified ",[112,387,388],{},"2026-06-11",[390,391,392],"style",{},"html pre.shiki code .sBMFI, html code.shiki .sBMFI{--shiki-light:#E2931D;--shiki-default:#FFCB6B;--shiki-dark:#FFCB6B}html pre.shiki code .sfazB, html code.shiki .sfazB{--shiki-light:#91B859;--shiki-default:#C3E88D;--shiki-dark:#C3E88D}html .light .shiki span {color: var(--shiki-light);background: var(--shiki-light-bg);font-style: var(--shiki-light-font-style);font-weight: var(--shiki-light-font-weight);text-decoration: var(--shiki-light-text-decoration);}html.light .shiki span {color: var(--shiki-light);background: var(--shiki-light-bg);font-style: var(--shiki-light-font-style);font-weight: var(--shiki-light-font-weight);text-decoration: var(--shiki-light-text-decoration);}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .dark .shiki span {color: var(--shiki-dark);background: var(--shiki-dark-bg);font-style: var(--shiki-dark-font-style);font-weight: var(--shiki-dark-font-weight);text-decoration: var(--shiki-dark-text-decoration);}html.dark .shiki span {color: var(--shiki-dark);background: var(--shiki-dark-bg);font-style: var(--shiki-dark-font-style);font-weight: var(--shiki-dark-font-weight);text-decoration: var(--shiki-dark-text-decoration);}",{"title":162,"searchDepth":229,"depth":394,"links":395},2,[396,397,398,399],{"id":128,"depth":394,"text":129},{"id":183,"depth":394,"text":184},{"id":265,"depth":394,"text":266},{"id":342,"depth":394,"text":343},"How Alibaba Qwen2-VL \u002F Qwen2.5-VL tokenizes images on a 28px grid and how VisionSqueezer snaps to it.","md",null,{},{"icon":64},{"title":71,"description":400},"zyaqTtZll0mYXNT3LkcrKunLFq16CCTQh9gI1JlZFEw",[408,410],{"title":66,"path":67,"stem":68,"description":409,"icon":69,"children":-1},"How Meta Llama 3.2\u002F4 Vision tiles images and how VisionSqueezer snaps to the 560px canvas.",{"title":75,"path":76,"stem":77,"description":411,"icon":59,"children":-1},"How DeepSeek-VL2 tiles images at 384px and why the win is local-inference context, not API billing.",1782053692263]