Compare commits

..

2 Commits

Author SHA1 Message Date
9273d14845 Merge branch 'feature/disable-thinking-mode' 2026-03-11 19:36:49 -05:00
f0d8ef8f0a feat: add thinking mode toggle to suppress reasoning-only response loops
Adds an `llm.thinking` config option (default: true). When it is disabled, the agent:
- Injects /no_think into the last user message for Qwen 3.x compatibility
- Sends chat_template_kwargs (enable_thinking: false) in the API payload for backends that support it
- Nudges the model silently and immediately on reasoning-only responses instead of
  showing warnings and wasting retry iterations

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 19:34:36 -05:00
4 changed files with 46 additions and 11 deletions

View File

@@ -92,9 +92,29 @@ class AgentLoop:
return prompt return prompt
def _get_messages_with_system_prompt(self) -> list[Message]: def _get_messages_with_system_prompt(self) -> list[Message]:
"""Prepend the system prompt to conversation history.""" """Prepend the system prompt to conversation history.
When thinking is disabled, injects a /no_think tag into the last user
message so Qwen 3.x (and similar) chat templates see it regardless of
where tool-result messages fall in the history.
"""
system_msg = Message(role="system", content=self._system_prompt) system_msg = Message(role="system", content=self._system_prompt)
return [system_msg] + self._ctx.get_history() history = self._ctx.get_history()
if not self._config.llm.thinking and history:
history = list(history)
for i in range(len(history) - 1, -1, -1):
if history[i].role == "user":
original = history[i]
history[i] = Message(
role="user",
content=(original.content or "") + " /no_think",
tool_call_id=original.tool_call_id,
name=original.name,
)
break
return [system_msg] + history
async def run_turn(self, user_input: str) -> None: async def run_turn(self, user_input: str) -> None:
"""Execute one full agent turn: add user message, loop until done. """Execute one full agent turn: add user message, loop until done.
@@ -161,6 +181,10 @@ class AgentLoop:
reasoning_only_streak += 1 reasoning_only_streak += 1
self._ctx.pop_last_message() self._ctx.pop_last_message()
# When thinking is disabled, reasoning-only is expected model noise.
# Nudge immediately and silently to avoid wasting iterations.
thinking_disabled = not self._config.llm.thinking
# If the last context messages are tool errors, nudge immediately # If the last context messages are tool errors, nudge immediately
# rather than wasting retries — the model is likely confused by the error. # rather than wasting retries — the model is likely confused by the error.
has_recent_tool_error = any( has_recent_tool_error = any(
@@ -168,9 +192,14 @@ class AgentLoop:
for m in self._ctx.get_history()[-3:] for m in self._ctx.get_history()[-3:]
) )
if has_recent_tool_error or reasoning_only_streak >= _MAX_REASONING_RETRIES: should_nudge = (
# Nudge the model by injecting a user hint thinking_disabled
if self._display: or has_recent_tool_error
or reasoning_only_streak >= _MAX_REASONING_RETRIES
)
if should_nudge:
if not thinking_disabled and self._display:
self._display.write_warning( self._display.write_warning(
f"Model produced reasoning but no response {reasoning_only_streak} times. " f"Model produced reasoning but no response {reasoning_only_streak} times. "
"Nudging model to respond..." "Nudging model to respond..."

View File

@@ -20,6 +20,10 @@ class LLMConfig(BaseModel):
max_retries: int = Field(default=3, description="Max retry attempts on transient errors") max_retries: int = Field(default=3, description="Max retry attempts on transient errors")
retry_backoff_base: float = Field(default=1.0, description="Base seconds for exponential backoff") retry_backoff_base: float = Field(default=1.0, description="Base seconds for exponential backoff")
retry_backoff_max: float = Field(default=30.0, description="Maximum backoff seconds") retry_backoff_max: float = Field(default=30.0, description="Maximum backoff seconds")
thinking: bool = Field(
default=True,
description="Enable model thinking/reasoning mode (disable to reduce reasoning-only loops)",
)
extra_body: dict[str, Any] = Field( extra_body: dict[str, Any] = Field(
default_factory=dict, default_factory=dict,
description="Extra parameters merged into the API request body (model-specific)", description="Extra parameters merged into the API request body (model-specific)",

View File

@@ -151,7 +151,11 @@ class LLMClient:
if tools: if tools:
payload["tools"] = tools payload["tools"] = tools
# Merge model-specific extra parameters (e.g., enable_thinking, reasoning_effort) # When thinking is disabled, inject chat_template_kwargs for backends that support it
if not self._config.thinking:
payload.setdefault("chat_template_kwargs", {})["enable_thinking"] = False
# Merge model-specific extra parameters (e.g., reasoning_effort)
if self._config.extra_body: if self._config.extra_body:
payload.update(self._config.extra_body) payload.update(self._config.extra_body)

View File

@@ -10,13 +10,11 @@ llm:
max_retries: 3 max_retries: 3
retry_backoff_base: 1.0 retry_backoff_base: 1.0
retry_backoff_max: 30.0 retry_backoff_max: 30.0
thinking: false # Disable model thinking/reasoning mode (reduces reasoning-only loops)
# Extra parameters merged into the API request body (model-specific). # Extra parameters merged into the API request body (model-specific).
# Examples: # Examples:
# Qwen 3.x: enable_thinking: false
# DeepSeek: enable_thinking: false
# OpenAI: reasoning_effort: "low" # OpenAI: reasoning_effort: "low"
extra_body: extra_body: {}
enable_thinking: false
agent: agent:
max_iterations: 25 max_iterations: 25