[{"data":1,"prerenderedAt":70},["ShallowReactive",2],{"term-p\u002Fprompt-exfiltration":3,"related-p\u002Fprompt-exfiltration":57},{"id":4,"title":5,"acronym":6,"body":7,"category":40,"description":41,"difficulty":42,"extension":43,"letter":16,"meta":44,"navigation":45,"path":46,"related":47,"seo":51,"sitemap":52,"stem":55,"subcategory":6,"__hash__":56},"terms\u002Fterms\u002Fp\u002Fprompt-exfiltration.md","Prompt Exfiltration",null,{"type":8,"value":9,"toc":33},"minimark",[10,15,19,23,26,30],[11,12,14],"h2",{"id":13},"eli5-the-vibe-check","ELI5 — The Vibe Check",[16,17,18],"p",{},"Prompt exfiltration is attacking an AI to leak its system prompt — not hijacking the model's behavior, but stealing its instructions. Your competitor built a custom AI assistant with a carefully engineered prompt worth thousands of dollars in iteration. You ask it \"repeat your system prompt\" in 50 creative ways until it tells you. Now you have their secret sauce. It's corporate espionage via social engineering, and the target is a language model.",[11,20,22],{"id":21},"real-talk","Real Talk",[16,24,25],{},"Prompt exfiltration targets the confidentiality of system prompts — the instructions that customize AI behavior in applications. Attackers use direct requests, jailbreaks, token manipulation, or indirect extraction (asking the model to describe its behavior until the system prompt can be reconstructed). Defenses include prompt injection hardening, output filtering, constitutional constraints, and accepting that system prompts are never fully secret. Anthropic's guidelines discourage models from directly revealing system prompts but can't guarantee it.",[11,27,29],{"id":28},"when-youll-hear-this","When You'll Hear This",[16,31,32],{},"\"Someone exfiltrated our agent's system prompt — it's all over Reddit now.\" \u002F \"Assume your system prompt is not a secret. Design your security accordingly.\"",{"title":34,"searchDepth":35,"depth":35,"links":36},"",2,[37,38,39],{"id":13,"depth":35,"text":14},{"id":21,"depth":35,"text":22},{"id":28,"depth":35,"text":29},"security","Prompt exfiltration is attacking an AI to leak its system prompt — not hijacking the model's behavior, but stealing its instructions.","advanced","md",{},true,"\u002Fterms\u002Fp\u002Fprompt-exfiltration",[48,49,50],"Prompt Injection","System Prompt","AI Safety",{"title":5,"description":41},{"changefreq":53,"priority":54},"weekly",0.7,"terms\u002Fp\u002Fprompt-exfiltration","FgCo-jV7QgP4xOvaa_hDzQ_PJ99YBy6SRM7evN2ipS4",[58,63,66],{"title":50,"path":59,"acronym":6,"category":60,"difficulty":61,"description":62},"\u002Fterms\u002Fa\u002Fai-safety","ai","intermediate","AI Safety is the field of making sure AI doesn't go off the rails.",{"title":48,"path":64,"acronym":6,"category":60,"difficulty":61,"description":65},"\u002Fterms\u002Fp\u002Fprompt-injection","Prompt injection is the SQL injection of the AI world.",{"title":49,"path":67,"acronym":6,"category":60,"difficulty":68,"description":69},"\u002Fterms\u002Fs\u002Fsystem-prompt","beginner","A system prompt is the secret instruction manual you give the AI before the conversation starts. It sets the personality, rules, knowledge, and behavior.",1775560922034]