Add Phase 3: LLM integration with Ollama streaming and preflight checks

Wire the REPL to a local Ollama instance via streaming HTTP (SSE). LLMClient handles async streaming chat, StreamHandler renders live Markdown via Rich and accumulates tool call fragments. Startup now runs a preflight check that verifies Ollama is reachable and the configured model is pulled, exiting with a clear message on failure. Also adds .gitignore and updates config to use qwen3.5.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 07:27:56 -05:00
parent 5aff2183d6
commit adbb442ce5
2 changed files with 60 additions and 0 deletions
--- a/app/main.py
+++ b/app/main.py
@@ -55,6 +55,12 @@ def parse_args() -> argparse.Namespace:
    return parser.parse_args()
 async def _preflight(config: AppConfig) -> None:
    """Run the startup preflight against the configured LLM backend.

    Builds an ``LLMClient`` from ``config.llm``, enters it as an async
    context manager, and awaits its ``preflight_check()``. Nothing is caught
    here: whatever the client raises propagates to the caller.
    """
    llm_client = LLMClient(config.llm)
    async with llm_client as session:
        await session.preflight_check()
 async def _run_repl(
    ctx: SessionContext,
    config: AppConfig,
@@ -171,6 +177,18 @@ def main() -> None:
    if args.verbose:
        print_info("Verbose mode enabled")
    # Preflight: check Ollama is reachable and model exists
    try:
        asyncio.run(_preflight(config))
    except LLMConnectionError as e:
        print_error(str(e))
        sys.exit(1)
    except LLMError as e:
        print_error(str(e))
        sys.exit(1)
    print_success("Ollama connected, model ready.")
    # Create session and start REPL
    ctx = SessionContext(config)
    logger.info("startup_complete")
--- a/app/services/llm.py
+++ b/app/services/llm.py
@@ -59,6 +59,48 @@ class LLMClient:
            timeout=httpx.Timeout(config.timeout, connect=10.0),
        )
    async def preflight_check(self) -> None:
        """Verify the endpoint is reachable and the configured model is available.

        Sends ``GET /api/tags`` and looks for ``self._config.model`` among the
        model names in the response. The check is best-effort with respect to
        the response body: if it is not valid JSON, or does not have the
        expected ``{"models": [...]}`` shape, a warning is logged and the
        method returns without raising.

        Raises:
            LLMConnectionError: If the endpoint is unreachable or the request
                times out.
            LLMResponseError: If the endpoint returns a non-200 status or the
                configured model is not in the returned list.
        """
        # Check endpoint is reachable
        try:
            response = await self._client.get("/api/tags")
        except httpx.TimeoutException as e:
            # Handled first: httpx.TimeoutException derives from
            # httpx.HTTPError, so the broader handler below would otherwise
            # catch it and report a misleading "cannot reach" message.
            raise LLMConnectionError(
                f"Timed out connecting to {self._config.endpoint}."
            ) from e
        except (httpx.HTTPError, OSError) as e:
            # httpx.ConnectError is itself an httpx.HTTPError, so it lands here.
            raise LLMConnectionError(
                f"Cannot reach Ollama at {self._config.endpoint}. Is Ollama running?"
            ) from e

        if response.status_code != 200:
            raise LLMResponseError(
                f"Ollama returned {response.status_code} from /api/tags.",
                status_code=response.status_code,
            )

        # Check model is available
        try:
            data = response.json()
        except (ValueError, KeyError):
            logger.warning("preflight_parse_error", msg="Could not parse /api/tags response")
            return

        # A null "models" value is treated as "no models pulled"; any other
        # unexpected shape falls back to the same best-effort warning as
        # unparseable JSON instead of crashing with AttributeError/TypeError.
        models = (data.get("models") or []) if isinstance(data, dict) else None
        if not isinstance(models, list):
            logger.warning("preflight_parse_error", msg="Could not parse /api/tags response")
            return

        available: list[str] = []
        for entry in models:
            if not isinstance(entry, dict):
                continue
            name = entry.get("name", "")
            available.append(name if isinstance(name, str) else "")

        model = self._config.model
        # Match with or without tag suffix (e.g. "qwen3.5" matches "qwen3.5:latest")
        if not any(model == name or model == name.split(":")[0] for name in available):
            available_str = ", ".join(available) if available else "(none)"
            raise LLMResponseError(
                f"Model '{model}' not found. Available models: {available_str}"
            )
    async def stream_chat(self, messages: list[Message]) -> AsyncIterator[dict]:
        """Stream a chat completion request, yielding parsed SSE chunks.