-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_results.py
More file actions
75 lines (62 loc) · 2.67 KB
/
get_results.py
File metadata and controls
75 lines (62 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import json
import os
from collections import defaultdict
def find_evaluation_summaries(root_dir):
    """Recursively collect evaluation-summary files under *root_dir*.

    Args:
        root_dir: Root directory of the search.

    Returns:
        A list of full paths to every file whose name contains
        "evaluation_summary.json".
    """
    matches = []
    for current_dir, _, files in os.walk(root_dir):
        matches.extend(
            os.path.join(current_dir, name)
            for name in files
            if "evaluation_summary.json" in name
        )
    return matches
def process_trait_adherence(results_per_conversation):
    """Print the mean adherence score for each trait across conversations.

    Aggregates every entry of evaluation_results["traitadherence"]["trait_scores"]
    found in the conversations and prints one "trait: mean" line per trait,
    sorted by trait name.

    Args:
        results_per_conversation: List of conversation result dicts; each may
            contain evaluation_results.traitadherence.trait_scores.
    """
    trait_scores = defaultdict(list)
    for conversation in results_per_conversation:
        trait_adherence = conversation.get("evaluation_results", {}).get("traitadherence", {})
        # .get on a missing/empty payload yields [] so no extra guard is needed.
        for score_item in trait_adherence.get("trait_scores", []):
            trait = score_item.get("trait")
            score = score_item.get("score")
            # Only aggregate well-formed entries: a named trait with a numeric score.
            if trait and isinstance(score, (int, float)):
                trait_scores[trait].append(score)
    print("Trait Adherence Scores:")
    for trait, scores in sorted(trait_scores.items()):
        if scores:
            average_score = sum(scores) / len(scores)
            print(f" {trait}: {average_score:.2f}")
if __name__ == "__main__":
    evaluation_summaries = find_evaluation_summaries("evals/results")
    # Evaluation types whose per-conversation "*_score" values we average.
    scores = {
        "behavioralpredictability": [],
        "reasoningauthenticity": [],
        "engagementquality": [],
        "longtermconsistency": [],
        "contextretention": [],
    }
    all_conversations = []
    for summary_path in evaluation_summaries:
        with open(summary_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        # Tolerate summaries without this key (original raised KeyError).
        conversations = data.get("results_per_conversation", [])
        all_conversations.extend(conversations)
        for conversation in conversations:
            for eval_type, eval_data in conversation.get("evaluation_results", {}).items():
                if eval_type not in scores:
                    continue
                # Each evaluation payload stores its result under some
                # "<name>_score" key; skip malformed payloads instead of
                # crashing (original raised IndexError on [0]).
                score_key = next((k for k in eval_data if k.endswith("_score")), None)
                if score_key is not None:
                    scores[eval_type].append(eval_data[score_key])
    for eval_type, score_list in scores.items():
        if score_list:
            average_score = sum(score_list) / len(score_list)
            print(f"Average score for {eval_type}: {average_score:.2f}")
    process_trait_adherence(all_conversations)