diff --git a/api-reference/evaluators/overview.mdx b/api-reference/evaluators/overview.mdx
new file mode 100644
index 0000000..56f3bee
--- /dev/null
+++ b/api-reference/evaluators/overview.mdx
@@ -0,0 +1,60 @@
+---
+title: 'Overview'
+description: 'Browse all available evaluators in LangWatch to find the right scoring method for your AI agent evaluation use case.'
+---
+
+## Intro
+
+LangWatch offers an extensive library of evaluators to help you evaluate the quality and guarantee the safety of your LLM apps.
+
+While you can find a reference list here, to get the execution code you can use the [Evaluation Wizard](https://app.langwatch.ai/@project/evaluations) on the LangWatch platform.
+
+## Authentication
+
+To call the Evaluators API, you will need to pass your LangWatch API key in the `X-Auth-Token` header. Your API key can be found on the setup page under settings.
+
+#### Allowed Methods
+
+- `POST /api/evaluations/{evaluator}/evaluate` - Run an evaluation using a specific evaluator
+
+## Evaluators List
+
+import EvaluatorsList from "/snippets/evaluators-list.mdx"
+
+<EvaluatorsList />
+
+## Running Evaluations
+
+Set up your first evaluation using the [Evaluation Wizard](https://app.langwatch.ai/@project/evaluations):
+
+
+
+
+
+
+
+## Instrumenting a Custom Evaluator
+
+If you have a custom evaluator built in-house, you can follow the guide below to integrate it.
+
+
+
+
+
+## Common Request Format
+
+All evaluator endpoints follow a similar pattern:
+
+```
+POST /api/evaluations/{evaluator_path}/evaluate
+```
+
+Each evaluator accepts specific input parameters and settings. Refer to the individual evaluator documentation pages for detailed request/response schemas and examples.
+
+## Response Format
+
+Successful evaluations return an array of evaluation results with scores, details, and metadata specific to each evaluator type.
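For orientation while reading the new overview page, here is a minimal TypeScript sketch of what a call to an evaluator endpoint might look like. The base URL, the `exact_match` evaluator path, and the `data`/`settings` body fields are assumptions for illustration only; each evaluator's own page documents the actual request schema.

```typescript
// Sketch only: the route and header follow the "Common Request Format" and
// "Authentication" sections above, but the evaluator path and body fields are assumed.
const LANGWATCH_API_KEY = process.env.LANGWATCH_API_KEY ?? "";
const evaluatorPath = "exact_match"; // hypothetical; take the real path from the evaluator's page

async function runEvaluation(): Promise<unknown> {
  const response = await fetch(
    `https://app.langwatch.ai/api/evaluations/${evaluatorPath}/evaluate`,
    {
      method: "POST",
      headers: {
        "X-Auth-Token": LANGWATCH_API_KEY, // API key from the settings/setup page
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        // Assumed input fields; each evaluator documents its own parameters and settings.
        data: { input: "What is 2 + 2?", output: "4", expected_output: "4" },
        settings: {},
      }),
    },
  );

  if (!response.ok) {
    throw new Error(`Evaluation request failed with status ${response.status}`);
  }
  return response.json(); // scores, details, and metadata per evaluator type
}
```

Only the `X-Auth-Token` header and the `POST /api/evaluations/{evaluator_path}/evaluate` route are taken directly from the page above; everything else is a placeholder.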
diff --git a/docs.json b/docs.json
index 45e08ce..0b14b6d 100644
--- a/docs.json
+++ b/docs.json
@@ -116,11 +116,7 @@
         {
           "group": "Built-in Evaluators",
           "pages": [
-            "llm-evaluation/list",
-            {
-              "group": "API Docs",
-              "openapi": "/api-reference/openapi-evals.json"
-            }
+            "llm-evaluation/list"
           ]
         },
         {
@@ -491,6 +487,51 @@
             "api-reference/scenarios/overview",
             "api-reference/scenarios/create-event"
           ]
+        },
+        {
+          "group": "Evaluators",
+          "pages": [
+            "api-reference/evaluators/overview",
+            "api-reference/evaluators/exact-match-evaluator",
+            "api-reference/evaluators/llm-answer-match",
+            "api-reference/evaluators/bleu-score",
+            "api-reference/evaluators/llm-factual-match",
+            "api-reference/evaluators/rouge-score",
+            "api-reference/evaluators/sql-query-equivalence",
+            "api-reference/evaluators/llm-as-a-judge-boolean-evaluator",
+            "api-reference/evaluators/llm-as-a-judge-category-evaluator",
+            "api-reference/evaluators/llm-as-a-judge-score-evaluator",
+            "api-reference/evaluators/rubrics-based-scoring",
+            "api-reference/evaluators/ragas-answer-correctness",
+            "api-reference/evaluators/ragas-answer-relevancy",
+            "api-reference/evaluators/ragas-context-precision",
+            "api-reference/evaluators/ragas-context-recall",
+            "api-reference/evaluators/ragas-context-relevancy",
+            "api-reference/evaluators/ragas-context-utilization",
+            "api-reference/evaluators/ragas-faithfulness",
+            "api-reference/evaluators/ragas-faithfulness-1",
+            "api-reference/evaluators/ragas-response-context-precision",
+            "api-reference/evaluators/ragas-response-context-recall",
+            "api-reference/evaluators/ragas-response-relevancy",
+            "api-reference/evaluators/context-f1",
+            "api-reference/evaluators/context-precision",
+            "api-reference/evaluators/context-recall",
+            "api-reference/evaluators/azure-content-safety",
+            "api-reference/evaluators/azure-jailbreak-detection",
+            "api-reference/evaluators/azure-prompt-shield",
+            "api-reference/evaluators/openai-moderation",
+            "api-reference/evaluators/presidio-pii-detection",
+            "api-reference/evaluators/custom-basic-evaluator",
+            "api-reference/evaluators/competitor-blocklist",
+            "api-reference/evaluators/competitor-allowlist-check",
+            "api-reference/evaluators/competitor-llm-check",
+            "api-reference/evaluators/off-topic-evaluator",
+            "api-reference/evaluators/query-resolution",
+            "api-reference/evaluators/semantic-similarity-evaluator",
+            "api-reference/evaluators/summarization-score",
+            "api-reference/evaluators/valid-format-evaluator",
+            "api-reference/evaluators/lingua-language-detection"
+          ]
+        }
       ]
     }
diff --git a/llm-evaluation/list.mdx b/llm-evaluation/list.mdx
index 2933294..3abd002 100644
--- a/llm-evaluation/list.mdx
+++ b/llm-evaluation/list.mdx
@@ -7,6 +7,9 @@ LangWatch offers an extensive library of evaluators to help you evaluate the qua
 
 While here you can find a reference list, to get the execution code you can use the [Evaluation Wizard](https://app.langwatch.ai/@project/evaluations) on LangWatch platform.
 
+
+  Full API documentation for running evaluations programmatically.
+
 
 ## Evaluators List
 
diff --git a/llm-evaluation/offline/code/evaluation-api.mdx b/llm-evaluation/offline/code/evaluation-api.mdx
index bc536cd..c74e604 100644
--- a/llm-evaluation/offline/code/evaluation-api.mdx
+++ b/llm-evaluation/offline/code/evaluation-api.mdx
@@ -141,10 +141,12 @@ await evaluation.run(dataset, async ({ item, index, span }) => {
 ```
 
 The callback receives:
+
 - `item` - The current dataset item
 - `index` - The current index in the dataset
 - `span` - An OpenTelemetry span for custom tracing
-
+
+
 
 ### Metrics logging
 
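To make the callback contract in that last hunk concrete, here is a minimal TypeScript sketch. Only the `await evaluation.run(dataset, async ({ item, index, span }) => { ... })` shape and the meaning of `item`, `index`, and `span` come from the guide; the declared types, `runMyAgent`, and the return shape are hypothetical placeholders.

```typescript
import type { Span } from "@opentelemetry/api";

// Placeholder declarations standing in for objects created earlier in the guide;
// only the callback shape below is taken from the documented API.
type DatasetItem = { input: string; expected_output?: string }; // assumed item shape
type EvalCallback = (args: { item: DatasetItem; index: number; span: Span }) => Promise<unknown>;

declare const evaluation: { run(dataset: DatasetItem[], cb: EvalCallback): Promise<void> };
declare const dataset: DatasetItem[];
declare function runMyAgent(input: string): Promise<string>; // your own application code

await evaluation.run(dataset, async ({ item, index, span }) => {
  // `item`  - the current dataset item
  // `index` - the current position in the dataset
  // `span`  - an OpenTelemetry span you can use for custom tracing
  span.setAttribute("dataset.index", index); // standard OpenTelemetry Span API
  span.addEvent("item.start");

  const output = await runMyAgent(item.input);
  return { output }; // assumed return shape; the guide's full example shows the real one
});
```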