fix(chat): stop retry spirals on tool failures

2026-03-10 18:05:34 +00:00
parent 212ce33e36
commit dbabbdf9f0
6 changed files with 490 additions and 14 deletions
--- a/backend/server/chat/views/init.py
+++ b/backend/server/chat/views/init.py
@@ -62,6 +62,9 @@ class ChatViewSet(viewsets.ModelViewSet):
            if message.role == "tool"
            and message.tool_call_id
            and not self._is_required_param_tool_error_message_content(message.content)
+            and not self._is_execution_failure_tool_error_message_content(
+                message.content
+            )
        }

        messages = [
@@ -76,6 +79,13 @@ class ChatViewSet(viewsets.ModelViewSet):
                and self._is_required_param_tool_error_message_content(message.content)
            ):
                continue
+            if (
+                message.role == "tool"
+                and self._is_execution_failure_tool_error_message_content(
+                    message.content
+                )
+            ):
+                continue

            payload = {
                "role": message.role,
@@ -152,6 +162,24 @@ class ChatViewSet(viewsets.ModelViewSet):
            )
        )

+    @classmethod
+    def _is_execution_failure_tool_error(cls, result):
+        """Report whether ``result`` is a tool error that is not a required-param one.
+
+        Holds only for a dict whose "error" is a non-blank string and which
+        ``_is_required_param_tool_error`` does not match.
+        """
+        message = result.get("error") if isinstance(result, dict) else None
+        has_error_text = isinstance(message, str) and bool(message.strip())
+        return has_error_text and not cls._is_required_param_tool_error(result)
+
+    @staticmethod
+    def _is_retryable_execution_failure(result):
+        """Return True for a dict result unless its "retryable" value is exactly False."""
+        # A missing "retryable" key counts as retryable; only a literal False opts out.
+        is_mapping = isinstance(result, dict)
+        return is_mapping and result.get("retryable", True) is not False
+
    @classmethod
    def _is_required_param_tool_error_message_content(cls, content):
        if not isinstance(content, str):
@@ -164,6 +192,18 @@ class ChatViewSet(viewsets.ModelViewSet):

        return cls._is_required_param_tool_error(parsed)

+    @classmethod
+    def _is_execution_failure_tool_error_message_content(cls, content):
+        """Decode a JSON string and test it with ``_is_execution_failure_tool_error``."""
+        if not isinstance(content, str):
+            return False
+        try:
+            decoded = json.loads(content)
+        except json.JSONDecodeError:
+            return False
+        else:
+            return cls._is_execution_failure_tool_error(decoded)
+
    @staticmethod
    def _build_required_param_error_event(tool_name, result):
        tool_error = result.get("error") if isinstance(result, dict) else ""
@@ -190,6 +230,24 @@ class ChatViewSet(viewsets.ModelViewSet):
        normalized_error = error_text.strip().lower()
        return "location" in normalized_error

+    @staticmethod
+    def _is_search_places_geocode_error(tool_name, result):
+        """Detect a ``search_places`` result whose error reports a geocoding failure.
+
+        The "error" text is stripped and lower-cased before the prefix comparison.
+        """
+        message = result.get("error") if isinstance(result, dict) else None
+        if tool_name != "search_places" or not isinstance(message, str):
+            return False
+        return message.strip().lower().startswith("could not geocode location")
+
+    @classmethod
+    def _is_search_places_location_retry_candidate_error(cls, tool_name, result):
+        """Return whether either search_places location check matches the result."""
+        # Checked in order: the missing-location required error, then the geocode error.
+        hit = cls._is_search_places_missing_location_required_error(tool_name, result)
+        return hit or cls._is_search_places_geocode_error(tool_name, result)
+
    @staticmethod
    def _build_search_places_location_clarification_message():
        return (
@@ -198,6 +256,21 @@ class ChatViewSet(viewsets.ModelViewSet):
            "activities, or lodging."
        )

+    @staticmethod
+    def _build_tool_execution_error_event(tool_name, result):
+        """Build the error event payload reported when ``tool_name`` fails to execute.
+
+        A non-dict result, or one whose "error" is missing or blank, gets a generic reason.
+        """
+        reason = result.get("error") if isinstance(result, dict) else None
+        if reason is None or not str(reason).strip():
+            reason = "Tool execution failed"
+        detail = f"The assistant could not complete '{tool_name}' ({reason}). "
+        return {
+            "error": detail + "Please try again in a moment or adjust your request.",
+            "error_category": "tool_execution_error",
+        }
+
    @staticmethod
    def _normalize_trip_context_destination(destination):
        destination_text = (destination or "").strip()
@@ -420,11 +493,13 @@ class ChatViewSet(viewsets.ModelViewSet):
        )

        MAX_TOOL_ITERATIONS = 10
+        MAX_ALL_FAILURE_ROUNDS = 3

        async def event_stream():
            current_messages = list(llm_messages)
            encountered_error = False
            tool_iterations = 0
+            all_failure_rounds = 0

            while tool_iterations < MAX_TOOL_ITERATIONS:
                content_chunks = []
@@ -472,10 +547,11 @@ class ChatViewSet(viewsets.ModelViewSet):
                assistant_content = "".join(content_chunks)

                if tool_calls_accumulator:
-                    tool_iterations += 1
                    successful_tool_calls = []
                    successful_tool_messages = []
                    successful_tool_chat_entries = []
+                    first_execution_failure = None
+                    encountered_permanent_failure = False

                    for tool_call in tool_calls_accumulator:
                        function_payload = tool_call.get("function") or {}
@@ -519,7 +595,7 @@ class ChatViewSet(viewsets.ModelViewSet):
                            **prepared_arguments,
                        )

-                        if self._is_search_places_missing_location_required_error(
+                        if self._is_search_places_location_retry_candidate_error(
                            function_name,
                            result,
                        ):
@@ -552,7 +628,11 @@ class ChatViewSet(viewsets.ModelViewSet):
                                    **retry_arguments,
                                )

-                                if not self._is_required_param_tool_error(retry_result):
+                                if not self._is_required_param_tool_error(
+                                    retry_result
+                                ) and not self._is_execution_failure_tool_error(
+                                    retry_result
+                                ):
                                    result = retry_result
                                    tool_call_for_history = {
                                        **tool_call,
@@ -630,6 +710,13 @@ class ChatViewSet(viewsets.ModelViewSet):
                            yield "data: [DONE]\n\n"
                            return

+                        if self._is_execution_failure_tool_error(result):
+                            if first_execution_failure is None:
+                                first_execution_failure = (function_name, result)
+                            if not self._is_retryable_execution_failure(result):
+                                encountered_permanent_failure = True
+                            continue
+
                        result_content = serialize_tool_result(result)

                        successful_tool_calls.append(tool_call_for_history)
@@ -659,6 +746,41 @@ class ChatViewSet(viewsets.ModelViewSet):
                        }
                        yield f"data: {json.dumps(tool_event)}\n\n"

+                    if not successful_tool_calls and first_execution_failure:
+                        if encountered_permanent_failure:
+                            all_failure_rounds = MAX_ALL_FAILURE_ROUNDS
+                        else:
+                            all_failure_rounds += 1
+
+                        if all_failure_rounds >= MAX_ALL_FAILURE_ROUNDS:
+                            failed_tool_name, failed_tool_result = (
+                                first_execution_failure
+                            )
+                            error_event = self._build_tool_execution_error_event(
+                                failed_tool_name,
+                                failed_tool_result,
+                            )
+                            await sync_to_async(
+                                ChatMessage.objects.create,
+                                thread_sensitive=True,
+                            )(
+                                conversation=conversation,
+                                role="assistant",
+                                content=error_event["error"],
+                            )
+                            await sync_to_async(
+                                conversation.save,
+                                thread_sensitive=True,
+                            )(update_fields=["updated_at"])
+                            yield f"data: {json.dumps(error_event)}\n\n"
+                            yield "data: [DONE]\n\n"
+                            return
+
+                        continue
+
+                    all_failure_rounds = 0
+                    tool_iterations += 1
+
                    assistant_with_tools = {
                        "role": "assistant",
                        "content": assistant_content,