[{"data":1,"prerenderedAt":131},["ShallowReactive",2],{"term-t\u002Ftokenizer":3,"related-t\u002Ftokenizer":115},{"id":4,"title":5,"acronym":6,"body":7,"category":95,"description":96,"difficulty":97,"extension":98,"letter":99,"meta":100,"navigation":101,"path":102,"related":103,"seo":109,"sitemap":110,"stem":113,"subcategory":6,"__hash__":114},"terms\u002Fterms\u002Ft\u002Ftokenizer.md","Tokenizer",null,{"type":8,"value":9,"toc":89},"minimark",[10,15,19,23,26,30,78,82,85],[11,12,14],"h2",{"id":13},"eli5-the-vibe-check","ELI5 — The Vibe Check",[16,17,18],"p",{},"A tokenizer chops text into pieces that the AI model can understand — but not in ways humans would expect. 'Hello' might be one token, but 'unbelievable' might be three: 'un', 'believ', 'able'. Spaces count. Emojis are expensive. And 'ChatGPT' is somehow 3 tokens but 'the' is just 1. It's like the model's weird dictionary where common words are cheap and rare words are pricey.",[11,20,22],{"id":21},"real-talk","Real Talk",[16,24,25],{},"A tokenizer converts text into a sequence of discrete tokens (subword units) that a language model can process. Common algorithms include BPE (Byte Pair Encoding), WordPiece, and SentencePiece. The tokenizer's vocabulary and encoding scheme directly affect model context limits, pricing (tokens = cost), and multilingual capability. Different models use different tokenizers.",[11,27,29],{"id":28},"show-me-the-code","Show Me The Code",[31,32,37],"pre",{"className":33,"code":34,"language":35,"meta":36,"style":36},"language-python shiki shiki-themes material-theme-lighter material-theme material-theme-palenight","# Counting tokens with tiktoken (OpenAI)\nimport tiktoken\nenc = tiktoken.encoding_for_model(\"gpt-4\")\ntokens = enc.encode(\"Hello, world!\")\nprint(len(tokens))  # 4 tokens\nprint(tokens)       # [9906, 11, 1917, 0]\n","python","",[38,39,40,48,54,60,66,72],"code",{"__ignoreMap":36},[41,42,45],"span",{"class":43,"line":44},"line",1,[41,46,47],{},"# Counting tokens with tiktoken (OpenAI)\n",[41,49,51],{"class":43,"line":50},2,[41,52,53],{},"import tiktoken\n",[41,55,57],{"class":43,"line":56},3,[41,58,59],{},"enc = tiktoken.encoding_for_model(\"gpt-4\")\n",[41,61,63],{"class":43,"line":62},4,[41,64,65],{},"tokens = enc.encode(\"Hello, world!\")\n",[41,67,69],{"class":43,"line":68},5,[41,70,71],{},"print(len(tokens))  # 4 tokens\n",[41,73,75],{"class":43,"line":74},6,[41,76,77],{},"print(tokens)       # [9906, 11, 1917, 0]\n",[11,79,81],{"id":80},"when-youll-hear-this","When You'll Hear This",[16,83,84],{},"\"The prompt is 4,000 tokens — we need to trim it.\" \u002F \"The tokenizer splits 'JavaScript' into 'Java' and 'Script.'\"",[86,87,88],"style",{},"html .light .shiki span {color: var(--shiki-light);background: var(--shiki-light-bg);font-style: var(--shiki-light-font-style);font-weight: var(--shiki-light-font-weight);text-decoration: var(--shiki-light-text-decoration);}html.light .shiki span {color: var(--shiki-light);background: var(--shiki-light-bg);font-style: var(--shiki-light-font-style);font-weight: var(--shiki-light-font-weight);text-decoration: var(--shiki-light-text-decoration);}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .dark .shiki span {color: var(--shiki-dark);background: var(--shiki-dark-bg);font-style: var(--shiki-dark-font-style);font-weight: var(--shiki-dark-font-weight);text-decoration: var(--shiki-dark-text-decoration);}html.dark .shiki span {color: var(--shiki-dark);background: var(--shiki-dark-bg);font-style: var(--shiki-dark-font-style);font-weight: var(--shiki-dark-font-weight);text-decoration: var(--shiki-dark-text-decoration);}",{"title":36,"searchDepth":50,"depth":50,"links":90},[91,92,93,94],{"id":13,"depth":50,"text":14},{"id":21,"depth":50,"text":22},{"id":28,"depth":50,"text":29},{"id":80,"depth":50,"text":81},"ai","A tokenizer chops text into pieces that the AI model can understand — but not in ways humans would expect.","intermediate","md","t",{},true,"\u002Fterms\u002Ft\u002Ftokenizer",[104,105,106,107,108],"Token","Context Window","BPE","LLM","Embedding",{"title":5,"description":96},{"changefreq":111,"priority":112},"weekly",0.7,"terms\u002Ft\u002Ftokenizer","MgE5OCrY9lHncPZ6DW_rvPvkMipc_Tih_0ojtPd2W2E",[116,120,123,128],{"title":105,"path":117,"acronym":6,"category":118,"difficulty":97,"description":119},"\u002Fterms\u002Fc\u002Fcontext-window","vibecoding","A context window is how much text an AI can 'see' at once — its working memory.",{"title":108,"path":121,"acronym":6,"category":95,"difficulty":97,"description":122},"\u002Fterms\u002Fe\u002Fembedding","An embedding is turning words, sentences, or entire documents into lists of numbers (vectors) that capture their meaning.",{"title":107,"path":124,"acronym":125,"category":95,"difficulty":126,"description":127},"\u002Fterms\u002Fl\u002Fllm","Large Language Model","beginner","An LLM is a humongous AI that read basically the entire internet and learned to predict what words come next, really really well.",{"title":104,"path":129,"acronym":6,"category":118,"difficulty":126,"description":130},"\u002Fterms\u002Ft\u002Ftoken","In AI-land, a token is a chunk of text — roughly 3\u002F4 of a word.",1776518319436]