Evaluation Integrations
The Benchmark class bridges any evaluation framework into the observability stack. Run your evaluations with your preferred tool, then upload the results as OTel spans so everything is queryable in one place.
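The pattern is the same regardless of framework: open a Benchmark context and call b.log() once per test case with the input, output, expected value, and a dictionary of numeric scores. Below is a minimal sketch of that pattern, assuming your framework has already produced per-case results; the result fields and the "quality" score name are illustrative, not part of any specific framework.

```python
from opensearch_genai_observability_sdk_py import register, Benchmark

register(service_name="my-eval")

# Results already computed by any evaluation framework (illustrative data)
results = [
    {
        "input": "What is OpenSearch?",
        "output": "OpenSearch is a search and analytics engine.",
        "expected": "search engine",
        "score": 0.9,
    },
]

with Benchmark("my_eval_run", metadata={"framework": "custom"}) as b:
    for r in results:
        b.log(
            input=r["input"],
            output=r["output"],
            expected=r["expected"],
            scores={"quality": r["score"]},  # any numeric metrics
            case_name=r["input"][:50],
        )
```

The framework-specific sections below follow this same upload pattern.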
DeepEval
DeepEval provides LLM-as-judge metrics like faithfulness, answer relevancy, and hallucination detection.
```bash
pip install opensearch-genai-observability-sdk-py deepeval
```

```python
from opensearch_genai_observability_sdk_py import register, Benchmark
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase

register(service_name="deepeval-eval")

# Define your test cases
test_cases = [
    {
        "input": "What is OpenSearch?",
        "output": "OpenSearch is an open-source search and analytics engine.",
        "expected": "OpenSearch is a search and analytics engine.",
        "context": ["OpenSearch is an open-source search and analytics suite."],
    },
    {
        "input": "How does RAG work?",
        "output": "RAG retrieves relevant documents and uses them to generate answers.",
        "expected": "Retrieval-augmented generation combines retrieval with generation.",
        "context": ["RAG first retrieves documents, then feeds them to an LLM."],
    },
]

# Run DeepEval metrics and upload results
relevancy = AnswerRelevancyMetric(model="gpt-4o")
faithfulness = FaithfulnessMetric(model="gpt-4o")

with Benchmark("deepeval_run", metadata={"framework": "deepeval"}) as b:
    for case in test_cases:
        tc = LLMTestCase(
            input=case["input"],
            actual_output=case["output"],
            expected_output=case["expected"],
            retrieval_context=case["context"],
        )
        relevancy.measure(tc)
        faithfulness.measure(tc)
        b.log(
            input=case["input"],
            output=case["output"],
            expected=case["expected"],
            scores={
                "answer_relevancy": relevancy.score,
                "faithfulness": faithfulness.score,
            },
            case_name=case["input"][:50],
        )
```

RAGAS
RAGAS evaluates RAG pipelines with metrics like context precision, faithfulness, and answer correctness.
```bash
pip install opensearch-genai-observability-sdk-py ragas datasets
```

```python
from opensearch_genai_observability_sdk_py import register, Benchmark
from ragas import evaluate as ragas_evaluate
from ragas.metrics import faithfulness, answer_correctness, context_precision
from datasets import Dataset

register(service_name="ragas-eval")

# Prepare your RAG evaluation dataset
data = {
    "question": ["What is OpenSearch?", "How does RAG work?"],
    "answer": [
        "OpenSearch is an open-source search and analytics engine.",
        "RAG retrieves documents and generates answers from them.",
    ],
    "contexts": [
        ["OpenSearch is a community-driven, open-source search and analytics suite."],
        ["RAG combines information retrieval with text generation."],
    ],
    "ground_truth": [
        "OpenSearch is a search and analytics engine.",
        "Retrieval-augmented generation retrieves then generates.",
    ],
}
dataset = Dataset.from_dict(data)

# Run RAGAS evaluation
ragas_result = ragas_evaluate(
    dataset=dataset,
    metrics=[faithfulness, answer_correctness, context_precision],
)

# Upload results to OpenSearch via Benchmark
df = ragas_result.to_pandas()
with Benchmark("ragas_eval", metadata={"framework": "ragas"}) as b:
    for i, row in df.iterrows():
        b.log(
            input=data["question"][i],
            output=data["answer"][i],
            expected=data["ground_truth"][i],
            scores={col: row[col] for col in df.columns if col not in data},
            case_name=data["question"][i][:50],
        )
```

MLflow
MLflow tracks ML experiments. Export MLflow evaluation results into the observability stack:
```bash
pip install opensearch-genai-observability-sdk-py mlflow
```

```python
from opensearch_genai_observability_sdk_py import register, Benchmark
import mlflow
import pandas as pd

register(service_name="mlflow-eval")

# Prepare evaluation data as a DataFrame
eval_df = pd.DataFrame([
    {"inputs": "What is OpenSearch?", "ground_truth": "search engine"},
    {"inputs": "What is OTEL?", "ground_truth": "observability framework"},
])

# Run MLflow evaluation
with mlflow.start_run():
    mlflow_result = mlflow.evaluate(
        model="openai:/gpt-4o",
        data=eval_df,
        targets="ground_truth",
        model_type="question-answering",
    )

# Upload to observability stack
results_df = mlflow_result.tables["eval_results_table"]
with Benchmark("mlflow_eval", metadata={"framework": "mlflow"}) as b:
    for _, row in results_df.iterrows():
        b.log(
            input=row.get("inputs", ""),
            output=row.get("outputs", ""),
            expected=row.get("ground_truth", ""),
            scores={
                k: v
                for k, v in mlflow_result.metrics.items()
                if isinstance(v, (int, float))
            },
        )
```

pytest
Use evaluate() directly in your test suite for CI/CD integration:
```bash
pip install opensearch-genai-observability-sdk-py pytest
```

```python
# conftest.py - initialize tracing once for all tests
import pytest

from opensearch_genai_observability_sdk_py import register


@pytest.fixture(scope="session", autouse=True)
def _init_tracing():
    register(service_name="pytest-eval")
```

```python
# test_agent.py
from opensearch_genai_observability_sdk_py import evaluate, EvalScore


def accuracy_scorer(input, output, expected) -> EvalScore:
    is_correct = expected.lower() in output.lower()
    return EvalScore(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        label="pass" if is_correct else "fail",
    )


def response_length_scorer(input, output, expected) -> EvalScore:
    return EvalScore(name="response_length", value=float(len(output)))


def my_agent(input: str) -> str:
    # Replace with your agent logic
    return f"Answer to: {input}"


def test_agent_quality():
    result = evaluate(
        name="ci_regression_test",
        task=my_agent,
        data=[
            {"input": "What is OpenSearch?", "expected": "search"},
            {"input": "What is OTEL?", "expected": "opentelemetry"},
        ],
        scores=[accuracy_scorer, response_length_scorer],
    )
    avg_accuracy = result.summary.scores["accuracy"].avg
    assert avg_accuracy >= 0.8, f"Accuracy dropped to {avg_accuracy}"
```

Run with pytest test_agent.py. Results are recorded as OTel benchmark spans and available in OpenSearch Dashboards.
Related links
- Evaluation & Scoring - core score(), evaluate(), and Benchmark API
- Python SDK reference - full SDK documentation