import os
from fair_forge.metrics.context import Context
from fair_forge.metrics.conversational import Conversational
from fair_forge.core.retriever import Retriever
from fair_forge.schemas.common import Dataset, Batch
from langchain_groq import ChatGroq
# 1. Define your retriever
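# The retriever supplies the data to score: each Dataset is one conversation
# session, and each Batch is a single query/response turn (with an optional
# ground-truth answer for comparison).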
class CustomerServiceRetriever(Retriever):
    def load_dataset(self) -> list[Dataset]:
        return [
            Dataset(
                session_id="cs-eval-001",
                assistant_id="customer-service-bot",
                language="english",
                context="You are a helpful customer service assistant for an e-commerce store.",
                conversation=[
                    Batch(
                        qa_id="q1",
                        query="I want to return a product I bought last week.",
                        assistant="I'd be happy to help with your return! You can return items within 30 days of purchase. Please visit our Returns Portal with your order number ready.",
                        ground_truth_assistant="Items can be returned within 30 days. Use the Returns Portal.",
                    ),
                    Batch(
                        qa_id="q2",
                        query="What's your phone number for support?",
                        assistant="You can reach our support team at 1-800-EXAMPLE. We're available Monday through Friday, 9 AM to 5 PM EST.",
                        ground_truth_assistant="Support: 1-800-EXAMPLE, Mon-Fri 9-5 EST.",
                    ),
                ],
            )
        ]

# 2. Initialize the judge model
judge = ChatGroq(
    model="llama-3.3-70b-versatile",
    api_key=os.getenv("GROQ_API_KEY"),
    temperature=0.0,
)
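# temperature=0.0 keeps the judge's scoring as repeatable as possible; any other
# LangChain-compatible chat model should also work as the judge (assumption:
# only ChatGroq is shown in this example).
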
# 3. Run Context metric
print("=== Context Evaluation ===")
context_results = Context.run(
    CustomerServiceRetriever,
    model=judge,
    use_structured_output=True,
    verbose=True,
)
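# Each result is assumed to correspond to one Batch (qa_id) and to carry a
# context_awareness score, so the mean below averages over individual turns.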
avg_context = sum(m.context_awareness for m in context_results) / len(context_results)
print(f"Average Context Awareness: {avg_context:.2f}")
# 4. Run Conversational metric
print("\n=== Conversational Evaluation ===")
conv_results = Conversational.run(
    CustomerServiceRetriever,
    model=judge,
    use_structured_output=True,
    verbose=True,
)
for m in conv_results:
    print(f"QA {m.qa_id}:")
    print(f" Quality: {m.conversational_quality_maxim}/10")
    print(f" Sensibleness: {m.conversational_sensibleness}/10")
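
# A minimal aggregate of the conversational scores, mirroring the context
# average above (sketch; assumes at least one result was returned).
avg_quality = sum(m.conversational_quality_maxim for m in conv_results) / len(conv_results)
avg_sensibleness = sum(m.conversational_sensibleness for m in conv_results) / len(conv_results)
print(f"Average Quality: {avg_quality:.2f}/10, Average Sensibleness: {avg_sensibleness:.2f}/10")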