diff --git a/api-reference/evaluators/overview.mdx b/api-reference/evaluators/overview.mdx
new file mode 100644
index 0000000..56f3bee
--- /dev/null
+++ b/api-reference/evaluators/overview.mdx
@@ -0,0 +1,60 @@
+---
+title: 'Overview'
+description: 'Browse all available evaluators in LangWatch to find the right scoring method for your AI agent evaluation use case.'
+---
+
+## Intro
+
+LangWatch offers an extensive library of evaluators to help you evaluate the quality and guarantee the safety of your LLM apps.
+
+This page provides a reference list; to get the execution code, use the [Evaluation Wizard](https://app.langwatch.ai/@project/evaluations) on the LangWatch platform.
+
+## Authentication
+
+To call the Evaluators API, pass your LangWatch API key in the `X-Auth-Token` header. You can find your API key on the setup page under Settings.
+
+### Allowed Methods
+
+- `POST /api/evaluations/{evaluator}/evaluate` - Run an evaluation using a specific evaluator
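+
+As a minimal sketch, the request below shows where the header goes. The base URL, the `exact_match` evaluator path, and the body fields are illustrative assumptions; use the exact endpoint and schema from the evaluator's own page.
+
+```typescript
+// Minimal sketch: authenticating a single evaluation request.
+// The evaluator path and body fields below are placeholders, not the exact schema.
+const response = await fetch(
+  "https://app.langwatch.ai/api/evaluations/exact_match/evaluate", // hypothetical evaluator path
+  {
+    method: "POST",
+    headers: {
+      "X-Auth-Token": process.env.LANGWATCH_API_KEY ?? "", // your LangWatch API key
+      "Content-Type": "application/json",
+    },
+    body: JSON.stringify({
+      // evaluator-specific inputs; see the evaluator's page for its real fields
+      data: { input: "What is 2 + 2?", output: "4", expected_output: "4" },
+    }),
+  }
+);
+
+console.log(await response.json());
+```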
+
+## Evaluators List
+
+import EvaluatorsList from "/snippets/evaluators-list.mdx"
+
+<EvaluatorsList />
+
+## Running Evaluations
+
+Set up your first evaluation using the [Evaluation Wizard](https://app.langwatch.ai/@project/evaluations):
+
+
+
+
+
+
+
+## Instrumenting a Custom Evaluator
+
+If you have a custom evaluator built in-house, you can follow the guide below to integrate it.
+
+
+
+
+
+## Common Request Format
+
+All evaluator endpoints follow a similar pattern:
+
+```
+POST /api/evaluations/{evaluator_path}/evaluate
+```
+
+Each evaluator accepts specific input parameters and settings. Refer to the individual evaluator documentation pages for detailed request/response schemas and examples.
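+
+As an illustration of this shared pattern, the sketch below wraps the endpoint in a small helper that posts evaluator-specific inputs and settings. The base URL and the `data`/`settings` field names are assumptions; follow each evaluator's documented schema.
+
+```typescript
+// Sketch of a generic client for the shared endpoint pattern.
+// The base URL and the "data"/"settings" body fields are illustrative assumptions.
+async function runEvaluation(
+  evaluatorPath: string,
+  data: Record<string, unknown>,
+  settings: Record<string, unknown> = {}
+) {
+  const response = await fetch(
+    `https://app.langwatch.ai/api/evaluations/${evaluatorPath}/evaluate`,
+    {
+      method: "POST",
+      headers: {
+        "X-Auth-Token": process.env.LANGWATCH_API_KEY ?? "",
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({ data, settings }),
+    }
+  );
+
+  if (!response.ok) {
+    throw new Error(`Evaluation failed: ${response.status} ${await response.text()}`);
+  }
+  return response.json();
+}
+```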
+
+## Response Format
+
+Successful evaluations return an array of evaluation results with scores, details, and metadata specific to each evaluator type.
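+
+The exact fields vary per evaluator, but a single result entry can be modelled roughly as in the sketch below. The field names are assumptions for illustration, not the exact schema; check each evaluator's page for its real response.
+
+```typescript
+// Rough, assumed shape of one entry in the returned results array.
+interface EvaluationResultEntry {
+  score?: number;   // numeric score, when the evaluator produces one
+  passed?: boolean; // pass/fail outcome, where applicable (assumed field)
+  details?: string; // human-readable explanation of the result
+  // ...plus evaluator-specific metadata
+}
+```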
diff --git a/docs.json b/docs.json
index 45e08ce..0b14b6d 100644
--- a/docs.json
+++ b/docs.json
@@ -116,11 +116,7 @@
{
"group": "Built-in Evaluators",
"pages": [
- "llm-evaluation/list",
- {
- "group": "API Docs",
- "openapi": "/api-reference/openapi-evals.json"
- }
+ "llm-evaluation/list"
]
},
{
@@ -491,6 +487,51 @@
"api-reference/scenarios/overview",
"api-reference/scenarios/create-event"
]
+ },
+ {
+ "group": "Evaluators",
+ "pages": [
+ "api-reference/evaluators/overview",
+ "api-reference/evaluators/exact-match-evaluator",
+ "api-reference/evaluators/llm-answer-match",
+ "api-reference/evaluators/bleu-score",
+ "api-reference/evaluators/llm-factual-match",
+ "api-reference/evaluators/rouge-score",
+ "api-reference/evaluators/sql-query-equivalence",
+ "api-reference/evaluators/llm-as-a-judge-boolean-evaluator",
+ "api-reference/evaluators/llm-as-a-judge-category-evaluator",
+ "api-reference/evaluators/llm-as-a-judge-score-evaluator",
+ "api-reference/evaluators/rubrics-based-scoring",
+ "api-reference/evaluators/ragas-answer-correctness",
+ "api-reference/evaluators/ragas-answer-relevancy",
+ "api-reference/evaluators/ragas-context-precision",
+ "api-reference/evaluators/ragas-context-recall",
+ "api-reference/evaluators/ragas-context-relevancy",
+ "api-reference/evaluators/ragas-context-utilization",
+ "api-reference/evaluators/ragas-faithfulness",
+ "api-reference/evaluators/ragas-faithfulness-1",
+ "api-reference/evaluators/ragas-response-context-precision",
+ "api-reference/evaluators/ragas-response-context-recall",
+ "api-reference/evaluators/ragas-response-relevancy",
+ "api-reference/evaluators/context-f1",
+ "api-reference/evaluators/context-precision",
+ "api-reference/evaluators/context-recall",
+ "api-reference/evaluators/azure-content-safety",
+ "api-reference/evaluators/azure-jailbreak-detection",
+ "api-reference/evaluators/azure-prompt-shield",
+ "api-reference/evaluators/openai-moderation",
+ "api-reference/evaluators/presidio-pii-detection",
+ "api-reference/evaluators/custom-basic-evaluator",
+ "api-reference/evaluators/competitor-blocklist",
+ "api-reference/evaluators/competitor-allowlist-check",
+ "api-reference/evaluators/competitor-llm-check",
+ "api-reference/evaluators/off-topic-evaluator",
+ "api-reference/evaluators/query-resolution",
+ "api-reference/evaluators/semantic-similarity-evaluator",
+ "api-reference/evaluators/summarization-score",
+ "api-reference/evaluators/valid-format-evaluator",
+ "api-reference/evaluators/lingua-language-detection"
+ ]
}
]
}
diff --git a/llm-evaluation/list.mdx b/llm-evaluation/list.mdx
index 2933294..3abd002 100644
--- a/llm-evaluation/list.mdx
+++ b/llm-evaluation/list.mdx
@@ -7,6 +7,9 @@ LangWatch offers an extensive library of evaluators to help you evaluate the qua
While here you can find a reference list, to get the execution code you can use the [Evaluation Wizard](https://app.langwatch.ai/@project/evaluations) on LangWatch platform.
+
+See the [Evaluators API reference](/api-reference/evaluators/overview) for full API documentation on running evaluations programmatically.
+
## Evaluators List
diff --git a/llm-evaluation/offline/code/evaluation-api.mdx b/llm-evaluation/offline/code/evaluation-api.mdx
index bc536cd..c74e604 100644
--- a/llm-evaluation/offline/code/evaluation-api.mdx
+++ b/llm-evaluation/offline/code/evaluation-api.mdx
@@ -141,10 +141,12 @@ await evaluation.run(dataset, async ({ item, index, span }) => {
```
The callback receives:
+
- `item` - The current dataset item
- `index` - The current index in the dataset
- `span` - An OpenTelemetry span for custom tracing
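+
+For example, the `span` can carry custom attributes or events for each item. This is a sketch only: the attribute names are arbitrary, and `evaluation` and `dataset` come from the setup shown above.
+
+```typescript
+// Sketch: tag the per-item span for custom tracing.
+await evaluation.run(dataset, async ({ item, index, span }) => {
+  span.setAttribute("dataset.index", index); // standard OpenTelemetry span API
+  span.addEvent("item.processing.started");  // optional timeline marker
+
+  // ...run your agent and log metrics for `item` here
+});
+```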
-
+
+
### Metrics logging