Structured Outputs: JSON Schema Enforcement, Pydantic/Zod Integration and Production Pitfalls
Chapter 12: Prefill and Output Steering: The Art of Controlling Response Starting Points
12.1 What Is Prefill?
Prefill is an Anthropic-specific API capability that lets you place an incomplete assistant message as the final entry in the messages array. Claude then continues generating from that starting point, as if it had already begun writing the response.
import anthropic
# Create the API client used by this example.
client = anthropic.Anthropic()

# The trailing assistant entry is intentionally incomplete: it is the prefill.
conversation = [
    {"role": "user", "content": "What is quantum entanglement?"},
    {"role": "assistant", "content": "Quantum entanglement is"},  # Prefill
]
response = client.messages.create(
    model="claude-sonnet-4-6",
    max_tokens=512,
    messages=conversation,
)

# The model continues from "Quantum entanglement is".
# The response does NOT repeat the prefill — it only contains what comes after.
continuation = response.content[0].text
print(continuation)
# Example: " a phenomenon where two particles become correlated..."
Critical behavior: The API response contains only the text generated after the prefill. The prefill itself is not echoed back. To reconstruct the full response, concatenate: prefill_text + response.content[0].text.
Prefill vs. history assistant messages
| Property | History assistant message | Prefill (last incomplete assistant message) |
|---|---|---|
| Position | Middle of conversation | Last item in messages array |
| Completeness | Complete | Can be a partial fragment |
| Appears in response | No (it's history) | No (only continuation is returned) |
| Purpose | Provide conversation context | Force the response starting point |
12.2 Forcing JSON Output
The most common use of prefill is guaranteeing that the model's response begins with { or [, eliminating the preamble text that sometimes appears before JSON.
Basic JSON enforcement
import anthropic, json
# Module-level client that the helper functions below call.
client = anthropic.Anthropic()
def extract_contact_info(text: str) -> dict:
    """Extract contact details from free text as a JSON object.

    The request is prefilled with ``{`` so the reply starts inside a JSON
    object. The API returns only the continuation, so the brace is
    re-attached before parsing.

    Args:
        text: Free-form text that contains the contact details.

    Returns:
        The decoded JSON object.

    Raises:
        json.JSONDecodeError: If no complete JSON object can be decoded from
            the start of the reply (for example when it was cut off).
    """
    prefill = "{"  # Force JSON object start
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=512,
        system="Extract structured contact information. Output JSON only, no other text.",
        messages=[
            {"role": "user", "content": f"Extract contact info from:\n\n{text}"},
            {"role": "assistant", "content": prefill},
        ],
    )
    full_json = prefill + response.content[0].text
    # raw_decode parses the first complete JSON value and ignores anything
    # after it, so trailing commentary or a closing code fence following the
    # object no longer makes parsing fail the way json.loads would.
    parsed, _end = json.JSONDecoder().raw_decode(full_json)
    return parsed
# Example call. The address in the original text was mangled by e-mail
# obfuscation; a placeholder on the reserved example.com domain is used.
result = extract_contact_info(
    "Please contact John Smith, phone 555-0100, email john.smith@example.com"
)
print(result)
# Example output (print shows the dict's Python repr, not JSON):
# {'name': 'John Smith', 'phone': '555-0100', 'email': 'john.smith@example.com'}
Forcing a specific JSON structure
def batch_sentiment(reviews: list[str]) -> list[dict]:
    """Classify the sentiment of several reviews in a single request.

    The reply is prefilled with ``[`` so it starts inside a JSON array. The
    API returns only the continuation, so the bracket is re-attached before
    parsing.

    Args:
        reviews: Review texts; they are numbered from 1 in the prompt.

    Returns:
        The decoded JSON array.

    Raises:
        json.JSONDecodeError: If the reply cannot be decoded even after the
            closing-bracket repair.
    """
    prefill = "["  # Force JSON array
    reviews_text = "\n".join(
        f"{number}. {review}" for number, review in enumerate(reviews, start=1)
    )
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=2048,
        messages=[
            {
                "role": "user",
                "content": f"""Analyze these reviews. Output a JSON array.
Each element: {{"id": number, "sentiment": "positive|negative|neutral", "score": 0.0-1.0}}
Reviews:
{reviews_text}""",
            },
            {"role": "assistant", "content": prefill},
        ],
    )
    raw = prefill + response.content[0].text.rstrip()
    try:
        # Decode the first complete JSON value and ignore anything after it.
        # Previously, trailing prose after a complete array caused a second
        # "]" to be appended and parsing to fail.
        parsed, _end = json.JSONDecoder().raw_decode(raw)
    except json.JSONDecodeError:
        # Likely a cut-off array: drop a dangling comma and close the bracket.
        if not raw.endswith("]"):
            raw = raw.rstrip(",") + "]"
        parsed = json.loads(raw)
    return parsed
YAML and XML enforcement
def generate_config(spec: str) -> str:
    """Generate a YAML config, steering the format with a comment-line prefill.

    Args:
        spec: Description of what the configuration should cover.

    Returns:
        The prefill followed by the model's continuation.
    """
    # The API rejects a final assistant message that ends in whitespace, so
    # the prefill carries no trailing newline; the model supplies it.
    prefill = "# Configuration"
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=512,
        messages=[
            {"role": "user", "content": f"Generate a YAML config for: {spec}"},
            {"role": "assistant", "content": prefill},
        ],
    )
    return prefill + response.content[0].text
def generate_xml(data_description: str) -> str:
    """Generate XML, steering the format with a declaration-and-root prefill.

    Args:
        data_description: Description of the data to express as XML.

    Returns:
        The prefill followed by the model's continuation.
    """
    # The API rejects a final assistant message that ends in whitespace, so
    # the prefill stops right after "<root>" instead of at a newline.
    prefill = '<?xml version="1.0" encoding="UTF-8"?>\n<root>'
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        messages=[
            {"role": "user", "content": f"Generate XML for: {data_description}"},
            {"role": "assistant", "content": prefill},
        ],
    )
    return prefill + response.content[0].text
12.3 Code Block Enforcement
Forcing specific programming languages
def generate_python(task: str) -> str:
    """Generate Python code wrapped in a single fenced code block.

    Args:
        task: Description of what to implement.

    Returns:
        A fenced ``python`` block containing the generated code. Anything
        from the last closing fence in the reply onward is dropped.
    """
    # The API rejects a final assistant message that ends in whitespace, so
    # the opening fence is sent without its newline.
    fence_open = "```python"
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=2048,
        system="You are a Python expert. Output code only, no explanations.",
        messages=[
            {"role": "user", "content": f"Implement: {task}"},
            {"role": "assistant", "content": fence_open},
        ],
    )
    code = response.content[0].text
    if "```" in code:
        # Cut at the last fence so trailing commentary is not returned.
        code = code[:code.rfind("```")]
    # The continuation now starts with the newline that follows the fence;
    # drop it so the returned block keeps the original layout.
    return fence_open + "\n" + code.lstrip("\r\n").rstrip() + "\n```"
def generate_typescript(task: str) -> str:
    """Generate TypeScript code, steering the reply with a code-fence prefill.

    Args:
        task: Description of what to implement.

    Returns:
        The opening fence followed by the model's continuation, unmodified.
    """
    # The API rejects a final assistant message that ends in whitespace, so
    # the opening fence is sent without its newline; the model supplies it.
    prefill = "```typescript"
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=2048,
        messages=[
            {"role": "user", "content": f"Implement in TypeScript: {task}"},
            {"role": "assistant", "content": prefill},
        ],
    )
    return prefill + response.content[0].text
def generate_sql(query_description: str) -> str:
    """Generate a SQL SELECT statement inside a fenced code block.

    Args:
        query_description: Description of the query to write.

    Returns:
        The prefill (opening fence plus ``SELECT``) followed by the model's
        continuation, unmodified.
    """
    # The API rejects a final assistant message that ends in whitespace, so
    # the prefill stops at the keyword; the model supplies the separator.
    prefill = "```sql\nSELECT"
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=512,
        messages=[
            {"role": "user", "content": f"Write SQL for: {query_description}"},
            {"role": "assistant", "content": prefill},
        ],
    )
    return prefill + response.content[0].text
12.4 Tone and Language Locking
Forcing a specific language in responses
def force_language(user_input: str, language: str) -> str:
    """Steer the reply language by prefilling a language-specific opener.

    Args:
        user_input: The user's message.
        language: Key into the starter table. An unknown language sends the
            request without any prefill.

    Returns:
        The starter actually sent (possibly empty) followed by the model's
        continuation.
    """
    starters = {
        "English": "Certainly, ",
        "French": "Bien sûr, ",
        "German": "Natürlich, ",
        "Spanish": "Por supuesto, ",
        "Japanese": "はい、",
        "Chinese": "好的,"
    }
    # The API rejects a final assistant message that ends in whitespace, so
    # the trailing space is removed before sending.
    starter = starters.get(language, "").rstrip()
    messages = [{"role": "user", "content": user_input}]
    if starter:
        # Skip the prefill for unknown languages rather than send an empty
        # assistant message.
        messages.append({"role": "assistant", "content": starter})
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=512,
        messages=messages,
    )
    return starter + response.content[0].text
Character and persona tone locking
def roleplay_response(
    character: str,
    character_style: str,
    user_message: str
) -> str:
    """Answer in character, locking the opening with a stage-direction prefill.

    Args:
        character: Name of the persona; used in the system prompt and prefill.
        character_style: Style description appended to the system prompt.
        user_message: The user's message.

    Returns:
        The opening action line followed by the model's continuation.
    """
    # Lock the opening action. The API rejects a final assistant message
    # that ends in whitespace, so the blank line after the action is left
    # for the model to produce.
    opening = f"*{character} pauses thoughtfully*"
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=512,
        system=f"You are {character}. {character_style} Always respond in character.",
        messages=[
            {"role": "user", "content": user_message},
            {"role": "assistant", "content": opening},
        ],
    )
    return opening + response.content[0].text
12.5 Chain-of-Thought Steering
Prefill can guide the model into a specific reasoning pattern before it reaches a conclusion:
def solve_with_steps(problem: str) -> dict:
    """Force step-by-step reasoning before the answer.

    Args:
        problem: The problem statement.

    Returns:
        A dict with a single ``"reasoning"`` key holding the prefill followed
        by the model's continuation.
    """
    # The API rejects a final assistant message that ends in whitespace, so
    # the prefill stops at the step heading without a trailing newline.
    prefill = (
        "Let me work through this systematically.\n\n"
        "**Step 1: Understand the problem**"
    )
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=2048,
        messages=[
            {
                "role": "user",
                "content": f"Solve this problem step by step:\n\n{problem}"
            },
            {"role": "assistant", "content": prefill},
        ],
    )
    return {"reasoning": prefill + response.content[0].text}
def two_phase_reasoning(problem: str) -> tuple[str, str]:
    """Phase 1: reasoning. Phase 2: final answer from reasoning.

    Args:
        problem: The problem to analyze.

    Returns:
        ``(reasoning, answer)``: the full phase-1 text, and a one-line
        ``"Answer: ..."`` string taken from the first line of phase 2.
    """
    # NOTE(review): confirm the configured model accepts a prefilled final
    # assistant turn — TODO verify against the model's release notes.
    # The API rejects a final assistant message that ends in whitespace, so
    # both prefills are sent without their trailing newline/space.
    analysis_prefill = "Analysis:"
    answer_prefill = "Answer:"

    # Phase 1: reasoning
    r1 = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=1024,
        messages=[
            {"role": "user", "content": f"Analyze: {problem}"},
            {"role": "assistant", "content": analysis_prefill},
        ],
    )
    reasoning = analysis_prefill + r1.content[0].text

    # Phase 2: concise answer based on the reasoning
    r2 = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=128,
        messages=[
            {"role": "user", "content": f"Analyze: {problem}"},
            {"role": "assistant", "content": reasoning},
            {"role": "user", "content": "Based on the above, what is the final answer in one sentence?"},
            {"role": "assistant", "content": answer_prefill},
        ],
    )
    # Drop the separator the model emits after "Answer:" before taking the
    # first line, so the result keeps the "Answer: <sentence>" shape.
    first_line = r2.content[0].text.lstrip().split("\n")[0]
    answer = answer_prefill + " " + first_line
    return reasoning, answer
12.6 Output Length Control
Forcing concise responses
def get_one_line_answer(question: str) -> str:
    """Return a one-line answer that starts with ``"In short: "``.

    Args:
        question: The question to answer.

    Returns:
        ``"In short: "`` followed by the first line of the model's reply.
    """
    # The API rejects a final assistant message that ends in whitespace, so
    # the prefill is sent without its trailing space.
    prefill = "In short:"
    response = client.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=80,
        messages=[
            {"role": "user", "content": f"{question} Answer in one sentence."},
            {"role": "assistant", "content": prefill},
        ],
    )
    # Drop the separator the model emits after the prefill, then take only
    # the first line.
    first_line = response.content[0].text.lstrip().split("\n")[0]
    return prefill + " " + first_line
def generate_numbered_list(topic: str, count: int = 5) -> list[str]:
    """Ask for a numbered list and return the item texts without numbering.

    Args:
        topic: Subject of the list.
        count: Number of points requested; also caps the returned list.

    Returns:
        At most ``count`` item strings parsed from lines shaped like
        ``"<digit>... . <text>"``.
    """
    # The API rejects a final assistant message that ends in whitespace, so
    # "1." is sent without its trailing space.
    prefill = "1."
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=512,
        messages=[
            {"role": "user", "content": f"List {count} key points about {topic}."},
            {"role": "assistant", "content": prefill},
        ],
    )
    # Re-insert exactly one space after "1." so the first item parses
    # whether or not the model emitted the separator itself.
    full = prefill + " " + response.content[0].text.lstrip()
    items = []
    for line in full.splitlines():
        line = line.strip()
        if line and line[0].isdigit() and ". " in line:
            items.append(line.split(". ", 1)[1].strip())
    return items[:count]
12.7 Prefill and Extended Thinking Interaction
When thinking is enabled, prefill use is restricted:
# Extended Thinking and meaningful prefill content do not mix:
# thinking blocks must come before text blocks in the output, so a prefill
# combined with thinking can cause format conflicts.
# Correct usage: send no prefill when thinking is enabled.
response = client.messages.create(
    model="claude-opus-4-6",
    max_tokens=16000,
    thinking={"type": "enabled", "budget_tokens": 10000},
    messages=[
        # A single user turn — deliberately no trailing assistant prefill.
        {"role": "user", "content": "Solve this complex math problem: ..."},
    ],
)

# Walk the returned content blocks and print each according to its type.
for part in response.content:
    kind = part.type
    if kind == "text":
        print(f"[Answer]: {part.text}")
    elif kind == "thinking":
        # Only the first 200 characters of the thinking text are shown.
        print(f"[Thinking]: {part.thinking[:200]}...")
12.8 Multi-Step Prefill Patterns
Progressive document generation
def generate_structured_report(data: str) -> str:
    """Generate a two-section report, prefilling each section heading in turn.

    Args:
        data: Source material the report is based on.

    Returns:
        The executive-summary section followed by the detailed-findings
        section, each starting with its prefilled heading.
    """
    # The API rejects a final assistant message that ends in whitespace, so
    # each heading prefill is sent without the blank line that follows it.
    summary_prefill = "# Analysis Report\n\n## Executive Summary"
    findings_prefill = "## Detailed Findings"

    conversation = [
        {"role": "user", "content": f"Generate an analysis report based on:\n\n{data}"}
    ]

    # Step 1: Force executive summary section
    r1 = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=400,
        messages=conversation + [{"role": "assistant", "content": summary_prefill}],
    )
    summary_text = summary_prefill + r1.content[0].text

    # Step 2: Continue with detailed analysis. The completed summary becomes
    # ordinary history; only the new heading is the prefill.
    conversation.append({"role": "assistant", "content": summary_text})
    conversation.append({"role": "user", "content": "Continue with the detailed findings section."})
    r2 = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=800,
        messages=conversation + [{"role": "assistant", "content": findings_prefill}],
    )
    return summary_text + "\n" + findings_prefill + r2.content[0].text
12.9 Production Patterns and Gotchas
JSON parsing safety wrapper
import json, re
def safe_json_prefill(client, messages: list, model: str = "claude-sonnet-4-6",
max_tokens: int = 1024, prefix: str = "{") -> dict | list | None:
"""Prefill-based JSON extraction with fallback parsing."""
response = client.messages.create(
model=model,
max_tokens=max_tokens,
messages=messages + [{"role": "assistant", "content": prefix}]
)
raw = prefix + response.content[0].text.strip()
# Try direct parse
try:
return json.loads(raw)
except json.JSONDecodeError:
pass
# Try to find embedded JSON
match = re.search(r'(\{.*\}|\[.*\])', raw, re.DOTALL)
if match:
try:
return json.loads(match.group(1))
except json.JSONDecodeError:
pass
return None # Signal caller to handle the failure
Best practices summary
Use prefill when:
- Enforcing JSON/XML/YAML structured output
- Locking a code block to a specific language
- Ensuring a response opens in a specific tone or role
- Guiding step-by-step reasoning before a conclusion
Avoid prefill when:
- Extended Thinking (the `thinking` parameter) is enabled
- The model needs freedom to choose the best output structure
- The prefill text would create a misleading or contradictory context
Important gotchas:
- The prefill text is not included in the API response — concatenate manually for the full string
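- Very long prefills consume extra input tokens
- The prefill must not end with trailing whitespace — the API rejects a final assistant message that does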
- In streaming mode, display the prefill immediately on the client side, then append deltas
- Prefill guides but does not guarantee — test adversarially for production use
Summary
Prefill is a uniquely powerful Claude API feature for deterministic output shaping. By controlling where a response starts, you gain:
- Near-zero JSON preamble noise — open with `{` or `[` to get clean parseable output
- Code language enforcement — an opening `` ```python `` fence as prefill makes code blocks reliable
- Tone and language locking — language-specific opening words steer the model's language choice
- Reasoning path control — chain-of-thought prefills guide analysis structure before conclusions
- Concatenation rule — always combine `prefill + response.content[0].text` for the full text