fix: empty response handling, /no_think model gating, per-model profiles
- Detect empty LLM responses (no content, no tool calls) instead of silently treating them as task completion. Retry once without tools before warning the user.
- Gate /no_think system message and chat_template_kwargs to Qwen/QwQ models only — sending /no_think to llama3.x caused empty responses.
- Add model_profiles config section for per-model overrides (token budget, thinking, temperature, max_tokens) matched by name prefix. Applied at startup and on /model switch.
- Update SessionManager on /model switch so session files record the correct model.
- Add NDJSON fallback in SSE stream parser for Ollama compatibility.
- Improve read_file error to suggest find_files on FileNotFoundError.
- Add diagnostic logging for empty streams and empty results.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -151,8 +151,9 @@ class LLMClient:
|
||||
if tools:
|
||||
payload["tools"] = tools
|
||||
|
||||
# When thinking is disabled, inject chat_template_kwargs for backends that support it
|
||||
if not self._config.thinking:
|
||||
# When thinking is disabled, inject chat_template_kwargs for backends
|
||||
# that support it (Qwen 3.x thinking models).
|
||||
if not self._config.thinking and self._config.model.lower().startswith(("qwen", "qwq")):
|
||||
payload.setdefault("chat_template_kwargs", {})["enable_thinking"] = False
|
||||
|
||||
# Merge model-specific extra parameters (e.g., reasoning_effort)
|
||||
@@ -170,20 +171,32 @@ class LLMClient:
|
||||
status_code=response.status_code,
|
||||
)
|
||||
|
||||
chunk_count = 0
|
||||
async for line in response.aiter_lines():
|
||||
if not line.startswith("data: "):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
data = line[6:] # strip "data: " prefix
|
||||
|
||||
if data.strip() == "[DONE]":
|
||||
return
|
||||
# SSE format: "data: {json}" or "data: [DONE]"
|
||||
if line.startswith("data: "):
|
||||
data = line[6:]
|
||||
if data.strip() == "[DONE]":
|
||||
break
|
||||
elif line.startswith("{"):
|
||||
# Plain NDJSON fallback (some Ollama versions)
|
||||
data = line
|
||||
else:
|
||||
continue
|
||||
|
||||
try:
|
||||
yield json.loads(data)
|
||||
chunk_count += 1
|
||||
except json.JSONDecodeError:
|
||||
logger.warning("malformed_sse_chunk", data=data[:200])
|
||||
|
||||
if chunk_count == 0:
|
||||
logger.warning("empty_stream", model=self._config.model)
|
||||
|
||||
except httpx.ConnectError as e:
|
||||
raise LLMConnectionError(f"Cannot connect to LLM endpoint: {e}") from e
|
||||
except httpx.TimeoutException as e:
|
||||
|
||||
@@ -52,6 +52,10 @@ class SessionManager:
|
||||
self._session_dir = workspace_root / config.session_dir
|
||||
self._session_id = f"{self._workspace_hash}_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}"
|
||||
|
||||
def update_model(self, model: str) -> None:
|
||||
"""Update the model name for session metadata."""
|
||||
self._model = model
|
||||
|
||||
def save(self, ctx: "SessionContext") -> Path:
|
||||
"""Save session state to a JSON file via atomic write.
|
||||
|
||||
|
||||
@@ -60,8 +60,10 @@ class StreamHandler:
|
||||
"""
|
||||
thinking_notified = False
|
||||
last_update_time = 0.0
|
||||
chunk_count = 0
|
||||
|
||||
async for chunk in chunk_iter:
|
||||
chunk_count += 1
|
||||
self._process_chunk(chunk)
|
||||
|
||||
if not self._display_config.stream_output:
|
||||
@@ -96,6 +98,14 @@ class StreamHandler:
|
||||
self._on_done()
|
||||
|
||||
tool_calls = self._build_tool_calls() or None
|
||||
|
||||
if chunk_count > 0 and not self._accumulated_content and not tool_calls:
|
||||
logger.debug(
|
||||
"stream_empty_result",
|
||||
chunks_received=chunk_count,
|
||||
had_reasoning=bool(self._accumulated_reasoning),
|
||||
)
|
||||
|
||||
return Message(
|
||||
role="assistant",
|
||||
content=self._accumulated_content or None,
|
||||
|
||||
Reference in New Issue
Block a user