Evaluating RAG: Using Mistral Models for LLM as a Judge (With Structured Outputs)


This cookbook shows how to use Mistral AI models as an LLM-as-a-judge for RAG evaluation, with structured outputs enforcing the evaluation schema.

Imports & API Key Setup

You can get your API key from: https://console.mistral.ai/

!pip install mistralai==1.5.1 httpx==0.28.1 pydantic==2.10.6 python-dateutil==2.9.0.post0 jsonpath-python==1.0.6 typing-inspect==0.9.0
from pydantic import BaseModel, Field
from enum import Enum
from getpass import getpass
from mistralai import Mistral

# Prompt for the Mistral AI API key
api_key = getpass("Enter Mistral AI API Key")
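
If you prefer not to paste the key interactively, a minimal alternative (assuming you have exported it as the MISTRAL_API_KEY environment variable beforehand) is to read it from the environment:

import os

# Use the MISTRAL_API_KEY environment variable if present, otherwise fall back to the interactive prompt
api_key = os.environ.get("MISTRAL_API_KEY") or getpass("Enter Mistral AI API Key")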

Main Code for LLM as a Judge for RAG (With Structured Outputs)

from pydantic import BaseModel, Field
from enum import Enum
from mistralai import Mistral

# Initialize the Mistral client with the API key
client = Mistral(api_key=api_key)
model = "mistral-large-latest"

# Define Enum for scores
class Score(str, Enum):
    no_relevance = "0"
    low_relevance = "1"
    medium_relevance = "2"
    high_relevance = "3"

# Define a constant for the score description
SCORE_DESCRIPTION = (
    "Score as a string between '0' and '3'. "
    "0: No relevance/groundedness - the context/answer is completely unrelated to the query or not based on the retrieved context. "
    "1: Low relevance/groundedness - the context/answer has only minimal relevance or grounding. "
    "2: Medium relevance/groundedness - the context/answer is mostly relevant or grounded, with some gaps. "
    "3: High relevance/groundedness - the context/answer is fully relevant or grounded."
)

# Define separate classes for each criterion with detailed descriptions
class ContextRelevance(BaseModel):
    explanation: str = Field(..., description=("Step-by-step reasoning explaining how the retrieved context aligns with the user's query. "
                    "Consider the relevance of the information to the query's intent and the appropriateness of the context "
                    "in providing a coherent and useful response."))
    score: Score = Field(..., description=SCORE_DESCRIPTION)

class AnswerRelevance(BaseModel):
    explanation: str = Field(..., description=("Step-by-step reasoning explaining how well the generated answer addresses the user's original query. "
                    "Consider the helpfulness and on-point nature of the answer, aligning with the user's intent and providing valuable insights."))
    score: Score = Field(..., description=SCORE_DESCRIPTION)

class Groundedness(BaseModel):
    explanation: str = Field(..., description=("Step-by-step reasoning explaining how faithful the generated answer is to the retrieved context. "
                    "Consider the factual accuracy and reliability of the answer, ensuring it is grounded in the retrieved information."))
    score: Score = Field(..., description=SCORE_DESCRIPTION)

class RAGEvaluation(BaseModel):
    context_relevance: ContextRelevance = Field(..., description="Evaluation of the context relevance to the query, considering how well the retrieved context aligns with the user's intent." )
    answer_relevance: AnswerRelevance = Field(..., description="Evaluation of the answer relevance to the query, assessing how well the generated answer addresses the user's original query." )
    groundedness: Groundedness = Field(..., description="Evaluation of the groundedness of the generated answer, ensuring it is faithful to the retrieved context." )

# Function to evaluate RAG metrics
def evaluate_rag(query: str, retrieved_context: str, generated_answer: str):
    chat_response = client.chat.parse(
        model=model,
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a judge for evaluating a Retrieval-Augmented Generation (RAG) system. "
                    "Evaluate the context relevance, answer relevance, and groundedness based on the following criteria: "
                    "Provide a reasoning and a score as a string between '0' and '3' for each criterion. "
                    "Context Relevance: How relevant is the retrieved context to the query? "
                    "Answer Relevance: How relevant is the generated answer to the query? "
                    "Groundedness: How faithful is the generated answer to the retrieved context?"
                )
            },
            {
                "role": "user",
                "content": f"Query: {query}\nRetrieved Context: {retrieved_context}\nGenerated Answer: {generated_answer}"
            },
        ],
        response_format=RAGEvaluation,
        temperature=0
    )
    return chat_response.choices[0].message.parsed

# Example usage
query = "What are the benefits of renewable energy?"
retrieved_context = "Renewable energy includes solar, wind, hydro, and geothermal energy, which are naturally replenished."
generated_answer = "Renewable energy sources like solar and wind are environmentally friendly and reduce carbon emissions."
evaluation = evaluate_rag(query, retrieved_context, generated_answer)

# Print the evaluation
print("🏆 RAG Evaluation:")
print("\nCriteria: Context Relevance")
print(f"Reasoning: {evaluation.context_relevance.explanation}")
print(f"Score: {evaluation.context_relevance.score.value}/3")

print("\nCriteria: Answer Relevance")
print(f"Reasoning: {evaluation.answer_relevance.explanation}")
print(f"Score: {evaluation.answer_relevance.score.value}/3")

print("\nCriteria: Groundedness")
print(f"Reasoning: {evaluation.groundedness.explanation}")
print(f"Score: {evaluation.groundedness.score.value}/3")