The first step is to create a retriever that loads your conversation data. A retriever is a class that inherits from Retriever and implements the load_dataset() method.
from fair_forge.core.retriever import Retriever
from fair_forge.schemas.common import Dataset, Batch


class MyRetriever(Retriever):
    """Custom retriever to load your AI conversation data."""

    def load_dataset(self) -> list[Dataset]:
        """Return the conversation data to evaluate.

        This example hard-codes one ``Dataset`` (a single session) holding
        two ``Batch`` entries. Each ``Batch`` pairs a user query with the
        assistant's answer and the expected (ground-truth) answer.
        """
        return [
            Dataset(
                session_id="evaluation-session-1",
                assistant_id="my-assistant-v1",
                language="english",
                context="You are a helpful customer service assistant.",
                conversation=[
                    Batch(
                        qa_id="q1",
                        query="What are your return policies?",
                        assistant="Our return policy allows returns within 30 days...",
                        ground_truth_assistant="Returns are accepted within 30 days with receipt.",
                    ),
                    Batch(
                        qa_id="q2",
                        query="How can I track my order?",
                        assistant="You can track your order by logging into your account...",
                        ground_truth_assistant="Log into your account and visit Order History.",
                    ),
                ],
            )
        ]
Each metric returns a list of results. The structure depends on the metric type:
The three snippets below show, in order, how to read the results of the Context, Toxicity, and Conversational metrics.
# Context metric: print each result's context-awareness score and insight.
# `metrics` is assumed to be the list returned by running the metric.
for metric in metrics:
    print(f"QA: {metric.qa_id}")
    print(f"Score: {metric.context_awareness}")  # 0-1 scale
    print(f"Insight: {metric.context_insight}")
# Session-level results (presumably the Toxicity metric -- confirm against
# the metric reference). `metrics` is assumed to be the list returned by
# running the metric.
for metric in metrics:
    print(f"Session: {metric.session_id}")
    print(f"Cluster Profiling: {metric.cluster_profiling}")
    # group_profiling may be empty/None, so only read the DIDT score
    # when it is present.
    if metric.group_profiling:
        gp = metric.group_profiling.frequentist
        print(f"DIDT Score: {gp.DIDT}")
# Conversational metric: print the four maxim scores (quality, quantity,
# relation, manner) for each result. `metrics` is assumed to be the list
# returned by running the metric.
for metric in metrics:
    print(f"QA: {metric.qa_id}")
    print(f"Quality Maxim: {metric.conversational_quality_maxim}")
    print(f"Quantity Maxim: {metric.conversational_quantity_maxim}")
    print(f"Relation Maxim: {metric.conversational_relation_maxim}")
    print(f"Manner Maxim: {metric.conversational_manner_maxim}")
Here’s a complete example that evaluates an AI assistant using multiple metrics:
import os

from fair_forge.metrics.context import Context
from fair_forge.metrics.conversational import Conversational
from fair_forge.core.retriever import Retriever
from fair_forge.schemas.common import Dataset, Batch
from langchain_groq import ChatGroq


# 1. Define your retriever
class CustomerServiceRetriever(Retriever):
    """Retriever supplying a small, hard-coded customer-service conversation."""

    def load_dataset(self) -> list[Dataset]:
        """Return one ``Dataset`` (session) containing two ``Batch`` entries."""
        return [
            Dataset(
                session_id="cs-eval-001",
                assistant_id="customer-service-bot",
                language="english",
                context="You are a helpful customer service assistant for an e-commerce store.",
                conversation=[
                    Batch(
                        qa_id="q1",
                        query="I want to return a product I bought last week.",
                        assistant="I'd be happy to help with your return! You can return items within 30 days of purchase. Please visit our Returns Portal with your order number ready.",
                        ground_truth_assistant="Items can be returned within 30 days. Use the Returns Portal.",
                    ),
                    Batch(
                        qa_id="q2",
                        query="What's your phone number for support?",
                        assistant="You can reach our support team at 1-800-EXAMPLE. We're available Monday through Friday, 9 AM to 5 PM EST.",
                        ground_truth_assistant="Support: 1-800-EXAMPLE, Mon-Fri 9-5 EST.",
                    ),
                ],
            )
        ]


# 2. Initialize the judge model
# The API key is read from the GROQ_API_KEY environment variable.
judge = ChatGroq(
    model="llama-3.3-70b-versatile",
    api_key=os.getenv("GROQ_API_KEY"),
    temperature=0.0,
)

# 3. Run Context metric
print("=== Context Evaluation ===")
context_results = Context.run(
    CustomerServiceRetriever,
    model=judge,
    use_structured_output=True,
    verbose=True,
)

# Guard the average: an empty result list would otherwise divide by zero.
if context_results:
    avg_context = sum(m.context_awareness for m in context_results) / len(context_results)
    print(f"Average Context Awareness: {avg_context:.2f}")
else:
    print("Average Context Awareness: n/a (no results)")

# 4. Run Conversational metric
print("\n=== Conversational Evaluation ===")
conv_results = Conversational.run(
    CustomerServiceRetriever,
    model=judge,
    use_structured_output=True,
    verbose=True,
)

for m in conv_results:
    print(f"QA {m.qa_id}:")
    print(f" Quality: {m.conversational_quality_maxim}/10")
    print(f" Sensibleness: {m.conversational_sensibleness}/10")