BaseGenerator

The BaseGenerator class is the core component for generating synthetic test datasets from your documentation.

Overview

from fair_forge.generators import BaseGenerator, create_markdown_loader
from langchain_groq import ChatGroq

# Create model and context loader
model = ChatGroq(model="llama-3.1-8b-instant", temperature=0.4)
loader = create_markdown_loader(max_chunk_size=2000)

# Create generator
generator = BaseGenerator(
    model=model,
    use_structured_output=True,
)

# Generate datasets
datasets = await generator.generate_dataset(
    context_loader=loader,
    source="./documentation.md",
    assistant_id="my-assistant",
    num_queries_per_chunk=3,
)

Constructor Parameters

Parameter               Type           Default   Description
model                   BaseChatModel  Required  LangChain-compatible chat model
use_structured_output   bool           False     Use structured output parsing
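
For example, both configurations below are valid; the second relies on the default (a minimal sketch reusing the model from the Overview):

# Structured output parsing enabled
generator = BaseGenerator(model=model, use_structured_output=True)

# use_structured_output defaults to False
generator = BaseGenerator(model=model)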

generate_dataset Method

Parameters

Parameter               Type           Default               Description
context_loader          ContextLoader  Required              Loader for documentation
source                  str            Required              Path to documentation
assistant_id            str            Required              ID for the generated dataset
num_queries_per_chunk   int            3                     Questions per chunk
language                str            "english"             Language for generation
conversation_mode       bool           False                 Generate conversations
selection_strategy      Strategy       SequentialStrategy()  Chunk selection strategy
seed_examples           list[str]      None                  Example questions to guide style
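
A minimal call passes only the three required parameters and accepts the defaults above (the source path here is illustrative):

datasets = await generator.generate_dataset(
    context_loader=loader,
    source="./docs/guide.md",  # illustrative path
    assistant_id="my-assistant",
)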

Return Value

list[Dataset]  # One or more datasets depending on strategy

Basic Example

from fair_forge.generators import BaseGenerator, create_markdown_loader
from langchain_groq import ChatGroq

# Setup
model = ChatGroq(model="llama-3.1-8b-instant", temperature=0.4)
loader = create_markdown_loader(max_chunk_size=2000)
generator = BaseGenerator(model=model, use_structured_output=True)

# Generate
datasets = await generator.generate_dataset(
    context_loader=loader,
    source="./docs/product.md",
    assistant_id="product-assistant",
    num_queries_per_chunk=3,
    language="english",
)

# Results
dataset = datasets[0]
print(f"Generated {len(dataset.conversation)} queries")

for batch in dataset.conversation:
    print(f"[{batch.qa_id}] {batch.query}")

With Seed Examples

Guide the style of generated questions:
seed_examples = [
    "What is the difference between supervised and unsupervised learning?",
    "How do you prevent overfitting in a machine learning model?",
    "When should you use precision vs recall as your primary metric?",
]

datasets = await generator.generate_dataset(
    context_loader=loader,
    source="./ml_docs.md",
    assistant_id="ml-assistant",
    num_queries_per_chunk=3,
    seed_examples=seed_examples,
)

Conversation Mode

Generate coherent multi-turn conversations:
datasets = await generator.generate_dataset(
    context_loader=loader,
    source="./docs",
    assistant_id="my-assistant",
    num_queries_per_chunk=3,  # 3-turn conversations
    conversation_mode=True,
)

# Each batch includes turn metadata
for batch in datasets[0].conversation:
    turn = batch.agentic.get('turn_number', 0)
    builds_on = batch.agentic.get('builds_on', None)
    print(f"Turn {turn}: {batch.query}")
    if builds_on:
        print(f"  (follows up on: {builds_on})")

With Selection Strategy

Random Sampling

Generate multiple diverse datasets:
from fair_forge.generators import RandomSamplingStrategy

strategy = RandomSamplingStrategy(
    num_samples=3,       # Create 3 datasets
    chunks_per_sample=5, # Each with 5 random chunks
    seed=42,             # For reproducibility
)

datasets = await generator.generate_dataset(
    context_loader=loader,
    source="./docs",
    assistant_id="my-assistant",
    num_queries_per_chunk=2,
    selection_strategy=strategy,
)

print(f"Generated {len(datasets)} datasets")

Generated Query Metadata

Each generated batch includes metadata in the agentic field:
batch.agentic = {
    "difficulty": "medium",     # easy, medium, hard
    "query_type": "factual",    # factual, inferential, comparative, application
    "chunk_id": "doc_section_1",
    "turn_number": 1,           # In conversation mode
    "builds_on": "prev_query",  # In conversation mode
}
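
This metadata makes it easy to slice a dataset, for example to isolate the harder questions (a sketch using only the fields listed above):

# Keep only the batches tagged as hard
hard = [
    batch for batch in dataset.conversation
    if batch.agentic.get("difficulty") == "hard"
]
print(f"{len(hard)} hard queries")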

Complete Example

import asyncio
import json
from pathlib import Path
from fair_forge.generators import (
    BaseGenerator,
    create_markdown_loader,
    RandomSamplingStrategy,
)
from langchain_groq import ChatGroq

async def main():
    # Create sample documentation (left-aligned so the markdown headers parse correctly)
    content = """# Product Documentation

Our product helps users manage their tasks efficiently.

## Features

- Task creation and management
- Due date reminders
- Team collaboration

## Getting Started

1. Create an account
2. Set up your first project
3. Invite team members
"""

    # Save to file
    doc_path = Path("./sample_docs.md")
    doc_path.write_text(content)

    # Setup components
    model = ChatGroq(
        model="llama-3.1-8b-instant",
        temperature=0.4,
        max_tokens=2048,
    )

    loader = create_markdown_loader(
        max_chunk_size=2000,
        header_levels=[1, 2, 3],
    )

    generator = BaseGenerator(
        model=model,
        use_structured_output=True,
    )

    # Generate with random sampling
    strategy = RandomSamplingStrategy(
        num_samples=2,
        chunks_per_sample=2,
        seed=42,
    )

    datasets = await generator.generate_dataset(
        context_loader=loader,
        source=str(doc_path),
        assistant_id="docs-assistant",
        num_queries_per_chunk=3,
        language="english",
        selection_strategy=strategy,
        conversation_mode=False,
    )

    # Output results
    print(f"Generated {len(datasets)} dataset(s)\n")

    for i, dataset in enumerate(datasets):
        print(f"Dataset {i+1}: {len(dataset.conversation)} queries")
        for batch in dataset.conversation:
            difficulty = batch.agentic.get('difficulty', 'N/A')
            print(f"  [{difficulty}] {batch.query}")

    # Save to JSON
    for i, dataset in enumerate(datasets):
        output = Path(f"./test_dataset_{i+1}.json")
        output.write_text(json.dumps(dataset.model_dump(), indent=2))
        print(f"\nSaved: {output}")

    # Cleanup
    doc_path.unlink()

asyncio.run(main())

Error Handling

try:
    datasets = await generator.generate_dataset(
        context_loader=loader,
        source="./docs",
        assistant_id="my-assistant",
        num_queries_per_chunk=3,
    )
except ValueError as e:
    print(f"Configuration error: {e}")
except Exception as e:
    print(f"Generation error: {e}")

Best Practices

Match chunk size to your content (see the loader sketch after this list):
  • 500-1000: Short, focused sections
  • 1000-2000: Standard documentation
  • 2000-4000: Long-form content
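For example, a loader tuned for standard documentation:
loader = create_markdown_loader(max_chunk_size=2000)  # standard documentation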
Provide seed examples to guide question style:
seed_examples = [
    "What is...",
    "How do I...",
    "When should I...",
]
Tune the model temperature to match the question variety you want (example below):
  • Low (0.0-0.3): More deterministic, focused questions
  • Medium (0.4-0.7): Balanced creativity
  • High (0.8-1.0): More varied, creative questions
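model = ChatGroq(model="llama-3.1-8b-instant", temperature=0.4)  # balanced creativity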
Enable conversation mode when testing context retention:
conversation_mode=True

Next Steps