Merge branch 'feature/disable-thinking-mode'

feat: add thinking mode toggle to suppress reasoning-only response loops
Adds `llm.thinking` config option (default: true) that, when disabled:
- Injects /no_think into the last user message for Qwen 3.x compatibility
- Sends chat_template_kwargs in API payload for backends that support it
- Silently and immediately nudges on reasoning-only responses instead of
  showing warnings and wasting retry iterations

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 19:36:49 -05:00 · 2026-03-11 19:34:36 -05:00
4 changed files with 46 additions and 11 deletions
--- a/app/agent/loop.py
+++ b/app/agent/loop.py
@@ -92,9 +92,29 @@ class AgentLoop:
        return prompt
    def _get_messages_with_system_prompt(self) -> list[Message]:
-        """Prepend the system prompt to conversation history."""
+        """Prepend the system prompt to conversation history.
        When thinking is disabled, injects a /no_think tag into the last user
        message so Qwen 3.x (and similar) chat templates see it regardless of
        where tool-result messages fall in the history.
        """
        system_msg = Message(role="system", content=self._system_prompt)
-        return [system_msg] + self._ctx.get_history()
+        history = self._ctx.get_history()
        if not self._config.llm.thinking and history:
            history = list(history)
            for i in range(len(history) - 1, -1, -1):
                if history[i].role == "user":
                    original = history[i]
                    history[i] = Message(
                        role="user",
                        content=(original.content or "") + " /no_think",
                        tool_call_id=original.tool_call_id,
                        name=original.name,
                    )
                    break
        return [system_msg] + history
    async def run_turn(self, user_input: str) -> None:
        """Execute one full agent turn: add user message, loop until done.
@@ -161,6 +181,10 @@ class AgentLoop:
                reasoning_only_streak += 1
                self._ctx.pop_last_message()
                # When thinking is disabled, reasoning-only is expected model noise.
                # Nudge immediately and silently to avoid wasting iterations.
                thinking_disabled = not self._config.llm.thinking
                # If the last context messages are tool errors, nudge immediately
                # rather than wasting retries — the model is likely confused by the error.
                has_recent_tool_error = any(
@@ -168,9 +192,14 @@ class AgentLoop:
                    for m in self._ctx.get_history()[-3:]
                )
-                if has_recent_tool_error or reasoning_only_streak >= _MAX_REASONING_RETRIES:
+                should_nudge = (
-                    # Nudge the model by injecting a user hint
+                    thinking_disabled
-                    if self._display:
+                    or has_recent_tool_error
                    or reasoning_only_streak >= _MAX_REASONING_RETRIES
                )
                if should_nudge:
                    if not thinking_disabled and self._display:
                        self._display.write_warning(
                            f"Model produced reasoning but no response {reasoning_only_streak} times. "
                            "Nudging model to respond..."
--- a/app/models/config.py
+++ b/app/models/config.py
@@ -20,6 +20,10 @@ class LLMConfig(BaseModel):
    max_retries: int = Field(default=3, description="Max retry attempts on transient errors")
    retry_backoff_base: float = Field(default=1.0, description="Base seconds for exponential backoff")
    retry_backoff_max: float = Field(default=30.0, description="Maximum backoff seconds")
    thinking: bool = Field(
        default=True,
        description="Enable model thinking/reasoning mode (disable to reduce reasoning-only loops)",
    )
    extra_body: dict[str, Any] = Field(
        default_factory=dict,
        description="Extra parameters merged into the API request body (model-specific)",
--- a/app/services/llm.py
+++ b/app/services/llm.py
@@ -151,7 +151,11 @@ class LLMClient:
        if tools:
            payload["tools"] = tools
-        # Merge model-specific extra parameters (e.g., enable_thinking, reasoning_effort)
+        # When thinking is disabled, inject chat_template_kwargs for backends that support it
        if not self._config.thinking:
            payload.setdefault("chat_template_kwargs", {})["enable_thinking"] = False
        # Merge model-specific extra parameters (e.g., reasoning_effort)
        if self._config.extra_body:
            payload.update(self._config.extra_body)
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -10,13 +10,11 @@ llm:
  max_retries: 3
  retry_backoff_base: 1.0
  retry_backoff_max: 30.0
  thinking: false  # Disable model thinking/reasoning mode (reduces reasoning-only loops)
  # Extra parameters merged into the API request body (model-specific).
  # Examples:
  #   Qwen 3.x:  enable_thinking: false
  #   DeepSeek:   enable_thinking: false
  #   OpenAI: reasoning_effort: "low"
-  extra_body:
+  extra_body: {}
    enable_thinking: false
 agent:
  max_iterations: 25