Skip to content

Evaluation Integrations

The Experiment class bridges any evaluation framework into the observability stack. Run your evaluations with your preferred tool, then upload the results as OTel spans so everything is queryable in one place.


DeepEval provides LLM-as-judge metrics like faithfulness, answer relevancy, and hallucination detection.

Terminal window
pip install deepeval
from opensearch_genai_observability_sdk_py import register, Experiment
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase

# Register once per process so Experiment results are emitted as OTel spans.
register(service_name="deepeval-experiment")

# Define your test cases: each dict pairs a prompt with the model's actual
# output, a reference answer, and the retrieval context used to produce it.
test_cases = [
    {
        "input": "What is OpenSearch?",
        "output": "OpenSearch is an open-source search and analytics engine.",
        "expected": "OpenSearch is a search and analytics engine.",
        "context": ["OpenSearch is an open-source search and analytics suite."],
    },
    {
        "input": "How does RAG work?",
        "output": "RAG retrieves relevant documents and uses them to generate answers.",
        "expected": "Retrieval-augmented generation combines retrieval with generation.",
        "context": ["RAG first retrieves documents, then feeds them to an LLM."],
    },
]

# Run DeepEval metrics and upload results. Metric objects are reused across
# cases; measure() overwrites .score on each call, so read the score before
# the next iteration (done inside the loop body below).
relevancy = AnswerRelevancyMetric(model="gpt-4o")
faithfulness = FaithfulnessMetric(model="gpt-4o")

with Experiment("deepeval_run", metadata={"framework": "deepeval"}) as exp:
    for case in test_cases:
        tc = LLMTestCase(
            input=case["input"],
            actual_output=case["output"],
            expected_output=case["expected"],
            retrieval_context=case["context"],
        )
        relevancy.measure(tc)
        faithfulness.measure(tc)
        # One exp.log() call per case records a span with the metric scores.
        exp.log(
            input=case["input"],
            output=case["output"],
            expected=case["expected"],
            scores={
                "answer_relevancy": relevancy.score,
                "faithfulness": faithfulness.score,
            },
            # Truncate to keep span attribute names a manageable length.
            case_name=case["input"][:50],
        )

RAGAS evaluates RAG pipelines with metrics like context precision, faithfulness, and answer correctness.

Terminal window
pip install ragas
from opensearch_genai_observability_sdk_py import register, Experiment
from ragas import evaluate as ragas_evaluate
from ragas.metrics import faithfulness, answer_correctness, context_precision
from datasets import Dataset

# Register once per process so Experiment results are emitted as OTel spans.
register(service_name="ragas-experiment")

# Prepare your RAG evaluation dataset. RAGAS expects columnar data with
# question / answer / contexts / ground_truth keys of equal length.
data = {
    "question": ["What is OpenSearch?", "How does RAG work?"],
    "answer": [
        "OpenSearch is an open-source search and analytics engine.",
        "RAG retrieves documents and generates answers from them.",
    ],
    "contexts": [
        ["OpenSearch is a community-driven, open-source search and analytics suite."],
        ["RAG combines information retrieval with text generation."],
    ],
    "ground_truth": [
        "OpenSearch is a search and analytics engine.",
        "Retrieval-augmented generation retrieves then generates.",
    ],
}
dataset = Dataset.from_dict(data)

# Run RAGAS evaluation (calls the judge model once per metric per row).
ragas_result = ragas_evaluate(
    dataset=dataset,
    metrics=[faithfulness, answer_correctness, context_precision],
)

# Upload results to OpenSearch via Experiment. The result DataFrame rows are
# in the same order as the input dataset, so index i lines up with data[...][i].
with Experiment("ragas_eval", metadata={"framework": "ragas"}) as exp:
    for i, row in enumerate(ragas_result.to_pandas().itertuples()):
        exp.log(
            input=data["question"][i],
            output=data["answer"][i],
            expected=data["ground_truth"][i],
            scores={
                "faithfulness": row.faithfulness,
                "answer_correctness": row.answer_correctness,
                "context_precision": row.context_precision,
            },
            # Truncate to keep span attribute names a manageable length.
            case_name=data["question"][i][:50],
        )

MLflow tracks ML experiments. Export MLflow evaluation results into the observability stack:

Terminal window
pip install mlflow
from opensearch_genai_observability_sdk_py import register, Experiment
import mlflow

# Register once per process so Experiment results are emitted as OTel spans.
register(service_name="mlflow-experiment")

# Run MLflow evaluation. Each record supplies model inputs plus a reference
# answer; mlflow.evaluate() invokes the model and scores the outputs.
eval_data = [
    {"inputs": {"question": "What is OpenSearch?"}, "ground_truth": "search engine"},
    {"inputs": {"question": "What is OTEL?"}, "ground_truth": "observability framework"},
]
mlflow_result = mlflow.evaluate(
    model="openai:/gpt-4o",
    data=eval_data,
    model_type="question-answering",
)

# Upload to observability stack. The per-row results live in the
# "eval_results_table" DataFrame; aggregate metric names come from
# mlflow_result.metrics, and the `if col in row` guard skips metrics
# that have no per-row column.
with Experiment("mlflow_eval", metadata={"framework": "mlflow"}) as exp:
    for _, row in mlflow_result.tables["eval_results_table"].iterrows():
        exp.log(
            input=row["inputs"],
            output=row["outputs"],
            expected=row.get("ground_truth", ""),
            scores={
                col: row[col]
                for col in mlflow_result.metrics
                if col in row and row[col] is not None
            },
        )

Use evaluate() directly in your test suite for CI/CD integration:

from opensearch_genai_observability_sdk_py import register, evaluate, EvalScore

# Register once per process so evaluate() results are emitted as OTel spans.
register(service_name="pytest-eval")


def accuracy_scorer(input, output, expected) -> EvalScore:
    """Pass/fail scorer: substring match of the expected answer (case-insensitive)."""
    is_correct = expected.lower() in output.lower()
    return EvalScore(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        label="pass" if is_correct else "fail",
    )


def latency_scorer(input, output, expected) -> EvalScore:
    """Record output length as a cheap proxy metric alongside accuracy."""
    return EvalScore(name="response_length", value=len(output))


def my_agent(input: str) -> str:
    # Replace with your agent logic
    return f"Answer to: {input}"


def test_agent_quality():
    """CI gate: fail the build if average accuracy regresses below 0.8."""
    result = evaluate(
        name="ci_regression_test",
        task=my_agent,
        data=[
            {"input": "What is OpenSearch?", "expected": "search"},
            {"input": "What is OTEL?", "expected": "opentelemetry"},
        ],
        scores=[accuracy_scorer, latency_scorer],
    )
    avg_accuracy = result.summary.scores["accuracy"].avg
    assert avg_accuracy >= 0.8, f"Accuracy dropped to {avg_accuracy}"

Run with `pytest test_agent.py` — the results are recorded as OTel experiment spans and are available in OpenSearch Dashboards.