[{"data":1,"prerenderedAt":418},["ShallowReactive",2],{"navigation":3,"\u002Fproviders\u002Fllama":99,"\u002Fproviders\u002Fllama-surround":413},[4,23,44,78],{"title":5,"path":6,"stem":7,"children":8,"icon":22},"Getting Started","\u002Fgetting-started","1.getting-started\u002F1.index",[9,12,17],{"title":10,"path":6,"stem":7,"icon":11},"Introduction","i-lucide-house",{"title":13,"path":14,"stem":15,"icon":16},"Installation","\u002Fgetting-started\u002Finstallation","1.getting-started\u002F2.installation","i-lucide-download",{"title":18,"path":19,"stem":20,"icon":21},"MCP Setup","\u002Fgetting-started\u002Fmcp-setup","1.getting-started\u002F3.mcp-setup","i-lucide-plug","i-lucide-rocket",{"title":24,"icon":25,"path":26,"stem":27,"children":28,"page":43},"CLI","i-lucide-terminal","\u002Fcli","2.cli",[29,33,38],{"title":30,"path":31,"stem":32,"icon":25},"Usage","\u002Fcli\u002Fusage","2.cli\u002F1.usage",{"title":34,"path":35,"stem":36,"icon":37},"Options","\u002Fcli\u002Foptions","2.cli\u002F2.options","i-lucide-sliders-horizontal",{"title":39,"path":40,"stem":41,"icon":42},"Batch & JSON","\u002Fcli\u002Fbatch-json","2.cli\u002F3.batch-json","i-lucide-package",false,{"title":45,"icon":46,"path":47,"stem":48,"children":49,"page":43},"Providers","i-lucide-cpu","\u002Fproviders","3.providers",[50,55,60,65,70,74],{"title":51,"path":52,"stem":53,"icon":54},"Claude (Area-Based)","\u002Fproviders\u002Fclaude","3.providers\u002F1.claude","i-lucide-square",{"title":56,"path":57,"stem":58,"icon":59},"GPT-4o & GPT-5 (Tiling)","\u002Fproviders\u002Fgpt","3.providers\u002F2.gpt","i-lucide-grid-2x2",{"title":61,"path":62,"stem":63,"icon":64},"Gemini (Large Tiles)","\u002Fproviders\u002Fgemini","3.providers\u002F3.gemini","i-lucide-grid-3x3",{"title":66,"path":67,"stem":68,"icon":69},"Llama Vision (Tiles)","\u002Fproviders\u002Fllama","3.providers\u002F4.llama","i-simple-icons-meta",{"title":71,"path":72,"stem":73,"icon":64},"Qwen-VL (Patch 
Grid)","\u002Fproviders\u002Fqwen","3.providers\u002F5.qwen",{"title":75,"path":76,"stem":77,"icon":59},"DeepSeek-VL (Open Weights)","\u002Fproviders\u002Fdeepseek","3.providers\u002F6.deepseek",{"title":79,"icon":80,"path":81,"stem":82,"children":83,"page":43},"Guides","i-lucide-book-open","\u002Fguides","4.guides",[84,89,94],{"title":85,"path":86,"stem":87,"icon":88},"Python Bindings","\u002Fguides\u002Fpython-bindings","4.guides\u002F1.python-bindings","i-lucide-file-code",{"title":90,"path":91,"stem":92,"icon":93},"Sandbox (Think in Code)","\u002Fguides\u002Fsandbox","4.guides\u002F2.sandbox","i-lucide-flask-conical",{"title":95,"path":96,"stem":97,"icon":98},"Crawler Integration","\u002Fguides\u002Fcrawler-integration","4.guides\u002F3.crawler-integration","i-lucide-globe",{"id":100,"title":66,"body":101,"description":406,"extension":407,"links":408,"meta":409,"navigation":410,"path":67,"seo":411,"stem":68,"__hash__":412},"docs\u002F3.providers\u002F4.llama.md",{"type":102,"value":103,"toc":398},"minimark",[104,126,131,138,159,170,180,184,187,231,237,241,248,275,293,297,300,342,345,349,394],[105,106,108],"callout",{"color":107,"icon":93},"warning",[109,110,111,115,116,120,121,125],"p",{},[112,113,114],"strong",{},"Experimental connector."," The 560px tile grid, 14px patch and 4-tile cap are architectural constants (transformers ",[117,118,119],"code",{},"MllamaVisionConfig","). The per-tile token cost (~1601) can vary by host (Together, Amazon Bedrock, Fireworks, Groq) — verify against your provider's billing. 
The ",[122,123,124],"em",{},"tile count"," is what drives savings and is stable.",[127,128,130],"h2",{"id":129},"how-llama-vision-bills-images","How Llama Vision bills images",[109,132,133,134,137],{},"Meta Llama 3.2 \u002F 3.3 Vision (Mllama) tile images on a ",[112,135,136],{},"560×560"," grid:",[139,140,141,149],"ol",{},[142,143,144,145,148],"li",{},"The image is fit into an ",[112,146,147],{},"aspect-ratio canvas"," built from 560px tiles.",[142,150,151,152,158],{},"The canvas is capped at ",[112,153,154,157],{},[117,155,156],{},"max_num_tiles"," = 4"," (e.g. 2×2 \u002F 1×4 \u002F 4×1).",[109,160,161,162,165,166,169],{},"There is ",[112,163,164],{},"no separate global-thumbnail tile"," — the canvas is the full representation. With a 14px ViT patch, each 560px tile is 40×40 = 1600 patches (+1 CLS) ≈ ",[112,167,168],{},"1601 tokens\u002Ftile",".",[171,172,177],"pre",{"className":173,"code":175,"language":176},[174],"language-text","tiles  = min(⌈W\u002F560⌉ · ⌈H\u002F560⌉, 4)\ntokens ≈ tiles × 1601\n","text",[117,178,175],{"__ignoreMap":179},"",[127,181,183],{"id":182},"the-spill-over-trap","The spill-over trap",[109,185,186],{},"Like Gemini, the tiles are large — crossing a 560px boundary by a few pixels adds an entire tile:",[188,189,190,206],"table",{},[191,192,193],"thead",{},[194,195,196,200,203],"tr",{},[197,198,199],"th",{},"Image",[197,201,202],{},"Tiles",[197,204,205],{},"Tokens",[207,208,209,220],"tbody",{},[194,210,211,214,217],{},[212,213,136],"td",{},[212,215,216],{},"1",[212,218,219],{},"~1,601",[194,221,222,225,228],{},[212,223,224],{},"561×560",[212,226,227],{},"2",[212,229,230],{},"~3,202",[109,232,233,234,169],{},"1 extra pixel of width ≈ ",[112,235,236],{},"1,601 extra tokens",[127,238,240],{"id":239},"optimization-strategy","Optimization strategy",[109,242,243,244,247],{},"Snap each side ",[112,245,246],{},"down"," to the 560px grid within the 2×2 (1120px) max canvas, eliminating spill-over 
tiles.",[171,249,254],{"className":250,"code":251,"filename":252,"language":253,"meta":179,"style":179},"language-bash shiki shiki-themes material-theme-lighter material-theme material-theme-palenight","vision-squeezer image.png --model llama\n","Terminal","bash",[117,255,256],{"__ignoreMap":179},[257,258,261,265,269,272],"span",{"class":259,"line":260},"line",1,[257,262,264],{"class":263},"sBMFI","vision-squeezer",[257,266,268],{"class":267},"sfazB"," image.png",[257,270,271],{"class":267}," --model",[257,273,274],{"class":267}," llama\n",[109,276,277,278,281,282,285,286,289,290,169],{},"CLI aliases: ",[117,279,280],{},"llama",", ",[117,283,284],{},"llama-vision",". MCP ",[117,287,288],{},"target_model",": ",[117,291,292],{},"\"llama\"",[127,294,296],{"id":295},"token-savings","Token savings",[109,298,299],{},"Each 560px tile is ~1,601 tokens, so removing a single tile row or column is a big proportional win.",[188,301,302,318],{},[191,303,304],{},[194,305,306,309,312,315],{},[197,307,308],{},"Scenario",[197,310,311],{},"Before",[197,313,314],{},"After",[197,316,317],{},"Saved",[207,319,320],{},[194,321,322,325,331,337],{},[212,323,324],{},"2400×1670 screenshot → trim padding to 2400×1200",[212,326,327,328],{},"4 tiles · ",[112,329,330],{},"6,404 tok",[212,332,333,334],{},"2 tiles · ",[112,335,336],{},"3,202 tok",[212,338,339],{},[112,340,341],{},"−50%",[109,343,344],{},"Dropping the image from a 2×2 canvas to a 2×1 canvas halves the bill. 
Savings depend entirely on how much padding pushes you into an extra tile — images already snug inside their tiles save little.",[127,346,348],{"id":347},"source","Source",[109,350,351,352,355,356,360,361,281,364,281,367,370,371,374,375,378,379,382,383,386,387,390,391,169],{},"Grid constants from the ",[117,353,354],{},"transformers"," ",[112,357,358],{},[117,359,119],{}," (",[117,362,363],{},"image_size 560",[117,365,366],{},"patch_size 14",[117,368,369],{},"max_num_tiles 4",") — the architecture behind Meta's Llama 3.2 Vision (",[112,372,373],{},"released Sep 2024",") and Llama 3.3. The ~1601 tokens\u002Ftile is the model's own footprint ",[117,376,377],{},"(560\u002F14)² + 1","; hosted APIs (Together, Bedrock, Fireworks, Groq) may bill a different per-tile rate, so treat absolute tokens as indicative. ",[112,380,381],{},"Llama 4"," uses a different native-multimodal vision encoder (",[117,384,385],{},"Llama4ForConditionalGeneration",", not Mllama) and is ",[112,388,389],{},"not"," modeled by this connector. 
Verified ",[112,392,393],"2026-06-11",[395,396,397],"style",{},"html pre.shiki code .sBMFI, html code.shiki .sBMFI{--shiki-light:#E2931D;--shiki-default:#FFCB6B;--shiki-dark:#FFCB6B}html pre.shiki code .sfazB, html code.shiki .sfazB{--shiki-light:#91B859;--shiki-default:#C3E88D;--shiki-dark:#C3E88D}html .light .shiki span {color: var(--shiki-light);background: var(--shiki-light-bg);font-style: var(--shiki-light-font-style);font-weight: var(--shiki-light-font-weight);text-decoration: var(--shiki-light-text-decoration);}html.light .shiki span {color: var(--shiki-light);background: var(--shiki-light-bg);font-style: var(--shiki-light-font-style);font-weight: var(--shiki-light-font-weight);text-decoration: var(--shiki-light-text-decoration);}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .dark .shiki span {color: var(--shiki-dark);background: var(--shiki-dark-bg);font-style: var(--shiki-dark-font-style);font-weight: var(--shiki-dark-font-weight);text-decoration: var(--shiki-dark-text-decoration);}html.dark .shiki span {color: var(--shiki-dark);background: var(--shiki-dark-bg);font-style: var(--shiki-dark-font-style);font-weight: var(--shiki-dark-font-weight);text-decoration: var(--shiki-dark-text-decoration);}",{"title":179,"searchDepth":260,"depth":399,"links":400},2,[401,402,403,404,405],{"id":129,"depth":399,"text":130},{"id":182,"depth":399,"text":183},{"id":239,"depth":399,"text":240},{"id":295,"depth":399,"text":296},{"id":347,"depth":399,"text":348},"How Meta Llama 3.2\u002F3.3 Vision tiles images and how VisionSqueezer snaps to the 560px 
canvas.","md",null,{},{"icon":69},{"title":66,"description":406},"bJZ6zzG4m1IkVSBkVVY3G63P5KmlM24KMxqzxXU56OU",[414,416],{"title":61,"path":62,"stem":63,"description":415,"icon":64,"children":-1},"How Google Gemini tiles images and why snapping to 768px boundaries halves the cost.",{"title":71,"path":72,"stem":73,"description":417,"icon":64,"children":-1},"How Alibaba Qwen2-VL \u002F Qwen2.5-VL tokenizes images on a 28px grid and how VisionSqueezer snaps to it.",1782053692263]