Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
61 commits
Select commit Hold shift + click to select a range
fc5db25
unified automl files
humaira-rf Nov 20, 2025
2f2aafc
mlflow logging for evals
humaira-rf Dec 2, 2025
9d6a5ab
trackio added to fit
humaira-rf Dec 12, 2025
5c820a8
trackio metrics for evals
humaira-rf Dec 12, 2025
39b7413
Merge update
david-rfai Dec 18, 2025
ab913c4
Update separations
david-rfai Dec 18, 2025
c8a89cc
remove merge tags
david-rfai Dec 18, 2025
16d1ea6
New RFLogger consolidation
david-rfai Dec 23, 2025
35cd92e
Circular
david-rfai Dec 23, 2025
277465e
add backends flag
david-rfai Dec 23, 2025
67c032a
Remove isinstance
david-rfai Dec 24, 2025
dde237c
Remove comma from experiment
david-rfai Dec 24, 2025
64af609
erroneous equal sign
david-rfai Dec 24, 2025
8b52830
Add epochs number in rf_db
david-rfai Dec 24, 2025
e8a354a
Add build
david-rfai Dec 24, 2025
9cced3f
Typo
david-rfai Dec 24, 2025
dd4deb6
Handle errors in metric loggers
david-rfai Dec 24, 2025
e252d6b
Add Any
david-rfai Dec 24, 2025
b7f48d4
Add None return
david-rfai Dec 24, 2025
9c82157
Fix typo
david-rfai Dec 24, 2025
929ab18
Add force
david-rfai Dec 24, 2025
4c37872
Debug force
david-rfai Dec 24, 2025
243dab9
Remove pins from dependencies
david-rfai Dec 24, 2025
3ee92d5
Remove pins from dependencies
david-rfai Dec 24, 2025
8a784c5
pin fsspec for datasets
david-rfai Dec 24, 2025
023f080
pin fsspec for datasets
david-rfai Dec 24, 2025
99dc25a
fix mixing fi in start.sh
david-rfai Dec 24, 2025
475ac42
Add trackio to pyproject.toml
david-rfai Dec 24, 2025
be812a4
add mlflow to pyproject.toml
david-rfai Dec 24, 2025
88c5b9e
Transformers to minimum
david-rfai Dec 24, 2025
93d745e
Handle db migration for metric_experiment_id
david-rfai Dec 24, 2025
f8d3c57
Handle tracking_backends flag
david-rfai Dec 24, 2025
5344cc5
Import MLFlowconfig to rfmetric
david-rfai Dec 24, 2025
6baeafd
Fix experiments_id table
david-rfai Dec 24, 2025
c9c99b1
Fix mlflow config import
david-rfai Dec 24, 2025
c7d3ca9
Add back training_args
david-rfai Dec 24, 2025
47422cd
Add metric_experiment_id migration
david-rfai Dec 24, 2025
af14b24
Correct mlflow import
david-rfai Dec 24, 2025
b4b9432
Fix MLflowMetricLogger importer
david-rfai Dec 24, 2025
380ad4e
Fix autocast error
david-rfai Dec 24, 2025
dd66745
Add more logging to metrics
david-rfai Dec 24, 2025
0ec195c
Fix logger
david-rfai Dec 24, 2025
0b08933
handle no experiment for logging
david-rfai Dec 24, 2025
91829db
handle no experiment for logging
david-rfai Dec 24, 2025
f7a1a25
Debug None logger for trackio
david-rfai Dec 24, 2025
36159f7
Debug None logger for trackio
david-rfai Dec 24, 2025
a334572
Debug None logger for trackio
david-rfai Dec 24, 2025
8a06f3b
Debug None logger for trackio
david-rfai Dec 24, 2025
c8781c3
Debug None logger for trackio
david-rfai Dec 24, 2025
ab12cba
Debug None logger for trackio
david-rfai Dec 24, 2025
79322f3
Debug None logger for trackio
david-rfai Dec 24, 2025
59b22d4
Debug None logger for trackio
david-rfai Dec 24, 2025
8c6737d
Debug None logger for trackio
david-rfai Dec 24, 2025
a3e3d03
Debug None logger for trackio
david-rfai Dec 24, 2025
26ff5e5
Debug None logger for trackio
david-rfai Dec 24, 2025
a780e03
Debug None logger for trackio
david-rfai Dec 24, 2025
267ef90
Latest
david-rfai Jan 5, 2026
5284564
Fix migration column
david-rfai Jan 5, 2026
34182a5
annotations fix
david-rfai Jan 5, 2026
70065de
annotations fix
david-rfai Jan 5, 2026
eb6153f
Remove build
david-rfai Jan 5, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 14 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,25 +161,26 @@ Built-in procedures for searching over configuration knob combinations, includin

```text
rapidfireai/
├── fit
├── automl/ # Search and AutoML algorithms for knob tuning
├── backend/ # Core backend components (controller, scheduler, worker)
├── db/ # Database interface and SQLite operations
├── dispatcher/ # Flask-based web API for UI communication
├── frontend/ # Frontend components (dashboard, IC Ops implementation)
├── ml/ # ML training utilities and trainer classes
└── utils/ # Utility functions and helper modules
├── automl/ # Search and AutoML algorithms for knob tuning
├── cli.py # CLI script
├── evals
├── actors/ # Ray-based workers for doc and query processing
├── automl/ # Search and AutoML algorithms for knob tuning
├── data/ # Data sharding and handling
├── db/ # Database interface and SQLite operations
├── dispatcher/ # Flask-based web API for UI communication
├── metrics/ # Online aggregation logic and metrics handling
├── rag/ # Stages of RAG pipeline
├── scheduling/ # Fair scheduler for multi-config resource sharing
└── utils/ # Utility functions and helper modules
└── experiment.py # Main experiment lifecycle management
├── experiment.py # Main experiment lifecycle management
├── fit
├── backend/ # Core backend components (controller, scheduler, worker)
├── db/ # Database interface and SQLite operations
├── dispatcher/ # Flask-based web API for UI communication
├── frontend/ # Frontend components (dashboard, IC Ops implementation)
├── ml/ # ML training utilities and trainer classes
└── utils/ # Utility functions and helper modules
└── utils.py # Utility functions and helper modules
```

## Architecture
Expand Down Expand Up @@ -327,7 +328,9 @@ used to overwrite the defaults.
- `RF_LOG_FILENAME` - Default log file name (default: rapidfire.log)
- `RF_TRAINING_LOG_FILENAME` - Default training log file name (default: training.log)
- `RF_DB_PATH` - Base directory for database files (default: ${RF_HOME}/db)
- `RF_TRACKING_BACKEND` - Tracking backend used (default: mlflow on Non-Google Colab and tensorboard on Google Colab)
- `RF_MLFLOW_ENABLED` - Enable MLFlow tracking backend
- `RF_TENSORBOARD_ENABLED` - Enable Tensorboard tracking backend
- `RF_TRACKIO_ENABLED` - Enable TrackIO tracking backend
- `RF_COLAB_MODE` - Whether running on colab (default: false on Non-Google Colab and true on Google Colab)
- `RF_TUTORIAL_PATH` - Location that `rapidfireai init` copies `tutorial_notebooks` to (default: ./tutorial_notebooks)
- `RF_TEST_PATH` - Location that `rapidfireai --test-notebooks` copies test notebooks to (default: ./tutorial_notebooks/tests)
Expand Down
35 changes: 27 additions & 8 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,40 @@ classifiers = [
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Software Development :: Libraries :: Application Frameworks",
]
# dependencies = [
# # REST API (Dispatcher)
# "flask>=3.1.1",
# "flask-cors>=6.0.1",
# "waitress>=3.0.2",

# # JSON Query Tool
# "jq>=1.10.0",
# # "protobuf==5.29.5",

# # Other
# "dill>=0.3.0,<0.3.9",
# "jedi>=0.16",
# # "pytest>=8.4.2",
# "uv>=0.8.14",
# ]
dependencies = [
# REST API (Dispatcher)
"flask>=3.1.1",
"flask-cors>=6.0.1",
"waitress>=3.0.2",
"flask",
"flask-cors",
"waitress",

# JSON Query Tool
"jq>=1.10.0",
"jq",
# "protobuf==5.29.5",

# Other
"dill>=0.3.0,<0.3.9",
"jedi>=0.16",
"dill",
"jedi",
# "pytest>=8.4.2",
"uv>=0.8.14",
"uv",
"trackio",
"mlflow",
"fsspec<=2025.10.0",
]

[project.optional-dependencies]
Expand Down Expand Up @@ -104,7 +123,7 @@ local_evals = [
"ray==2.44.1",

# LLM Inference
"transformers==4.56.1",
"transformers>=4.56.1",
"vllm==0.7.2",

# OpenAI API
Expand Down
10 changes: 0 additions & 10 deletions rapidfireai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,6 @@
__author__ = "RapidFire AI Inc."
__email__ = "support@rapidfire.ai"

# Core imports - always available
# from rapidfireai.experiment import Experiment

# Optional evals imports - gracefully handle missing dependencies
# get_dispatcher_url = None
# get_dispatcher_headers = None
# get_colab_auth_token = None

try:
from rapidfireai.experiment import Experiment
Expand Down Expand Up @@ -47,7 +40,4 @@ def __repr__(self):
"Experiment",
"__version__",
"__version_info__",
# "get_dispatcher_url",
# "get_dispatcher_headers",
# "get_colab_auth_token",
]
9 changes: 4 additions & 5 deletions rapidfireai/automl/model_config.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
"""Model configuration for AutoML training and evaluation."""

# from __future__ import annotations

from __future__ import annotations
import copy
import inspect
from abc import ABC, abstractmethod
Expand All @@ -11,6 +9,7 @@

from rapidfireai.automl.datatypes import List, Range


# Fit mode dependencies (peft, trl)
try:
from peft import LoraConfig
Expand Down Expand Up @@ -140,8 +139,8 @@ class RFModelConfig:
formatting_func: Callable | List | None = None
compute_metrics: Callable | List | None = None
peft_config: RFLoraConfig | List | None = None
# training_args: RFSFTConfig | RFDPOConfig | RFGRPOConfig | None = None
training_args = None
training_args: RFSFTConfig | RFDPOConfig | RFGRPOConfig | None = None
# training_args = None
model_type: str | None = "causal_lm"
model_kwargs: dict[str, Any] | None = None
ref_model_name: str | None = None
Expand Down
28 changes: 22 additions & 6 deletions rapidfireai/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,10 +397,12 @@ def main():
parser.add_argument("--version", action="version", version=f"RapidFire AI {__version__}")

parser.add_argument(
"--tracking-backend",
choices=["mlflow", "tensorboard", "both"],
default=os.getenv("RF_TRACKING_BACKEND", "mlflow" if not ColabConfig.ON_COLAB else "tensorboard"),
help="Tracking backend to use for metrics (default: mlflow)",
"--tracking-backends",
choices=["mlflow", "tensorboard", "trackio"],
default=["mlflow"] if not ColabConfig.ON_COLAB else ["tensorboard"],
help="Tracking backend to use for metrics (default: mlflow on Non-Google Colab and tensorboard on Google Colab)",
nargs="*",
action="extend"
)

parser.add_argument(
Expand All @@ -421,6 +423,8 @@ def main():
help="Copy test notebooks to the tutorial_notebooks directory",
)

parser.add_argument("--force", "-f", action="store_true", help="Force action without confirmation")

parser.add_argument("--evals", action="store_true", help="Initialize with evaluation dependencies")

parser.add_argument("--log-lines", type=int, default=10, help="Number of lines to log to the console")
Expand All @@ -429,14 +433,26 @@ def main():

# Set environment variables from CLI args

if args.tracking_backend:
os.environ["RF_TRACKING_BACKEND"] = args.tracking_backend
if args.tracking_backends:
os.environ["RF_MLFLOW_ENABLED"] = "false"
os.environ["RF_TENSORBOARD_ENABLED"] = "false"
os.environ["RF_TRACKIO_ENABLED"] = "false"
if "mlflow" in args.tracking_backends:
os.environ["RF_MLFLOW_ENABLED"] = "true"
if "tensorboard" in args.tracking_backends:
os.environ["RF_TENSORBOARD_ENABLED"] = "true"
if "trackio" in args.tracking_backends:
os.environ["RF_TRACKIO_ENABLED"] = "true"
if args.tensorboard_log_dir:
os.environ["RF_TENSORBOARD_LOG_DIR"] = args.tensorboard_log_dir
if args.colab:
os.environ["RF_COLAB_MODE"] = "true"
elif ColabConfig.ON_COLAB and os.getenv("RF_COLAB_MODE") is None:
os.environ["RF_COLAB_MODE"] = "true"

# Handle force command separately
if args.force:
os.environ["RF_FORCE"] = "true"

# Handle doctor command separately
if args.command == "doctor":
Expand Down
56 changes: 33 additions & 23 deletions rapidfireai/evals/db/rf_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,22 @@ def _initialize_schema(self):
self.db.conn.executescript(schema_sql)
self.db.conn.commit()

# Migration: Add mlflow_run_id column to pipelines table if it doesn't exist
# Migration: Add metric_run_id to pipelines table if they don't exist
try:
cursor = self.db.conn.execute("PRAGMA table_info(pipelines)")
columns = [row[1] for row in cursor.fetchall()]
if "mlflow_run_id" not in columns:
self.db.conn.execute("ALTER TABLE pipelines ADD COLUMN mlflow_run_id TEXT")
if "metric_run_id" not in columns:
self.db.conn.execute("ALTER TABLE pipelines ADD COLUMN metric_run_id TEXT")
self.db.conn.commit()
except Exception:
pass

# Migration: Add metric_experiment_id to experiments table if they don't exist
try:
cursor = self.db.conn.execute("PRAGMA table_info(experiments)")
columns = [row[1] for row in cursor.fetchall()]
if "metric_experiment_id" not in columns:
self.db.conn.execute("ALTER TABLE experiments ADD COLUMN metric_experiment_id TEXT")
self.db.conn.commit()
except Exception:
pass
Expand All @@ -67,7 +77,7 @@ def create_experiment(
num_actors: int,
num_cpus: int = None,
num_gpus: int = None,
mlflow_experiment_id: str = None,
metric_experiment_id: str = None,
status: ExperimentStatus = ExperimentStatus.RUNNING,
num_shards: int = 0,
) -> int:
Expand All @@ -79,7 +89,7 @@ def create_experiment(
num_actors: Number of query processing actors
num_cpus: Number of CPUs allocated
num_gpus: Number of GPUs allocated
mlflow_experiment_id: Optional MLflow experiment ID
metric_experiment_id: Optional MetricLogger experiment ID
status: Initial status (default: ExperimentStatus.RUNNING)
num_shards: Number of shards for the dataset (default: 0)

Expand All @@ -89,7 +99,7 @@ def create_experiment(
query = """
INSERT INTO experiments (
experiment_name, num_actors, num_shards, num_cpus, num_gpus,
mlflow_experiment_id, status, error
metric_experiment_id, status, error
) VALUES (?, ?, ?, ?, ?, ?, ?, '')
"""
self.db.execute(
Expand All @@ -100,7 +110,7 @@ def create_experiment(
num_shards,
num_cpus,
num_gpus,
mlflow_experiment_id,
metric_experiment_id,
status.value,
),
commit=True,
Expand Down Expand Up @@ -241,7 +251,7 @@ def get_experiment(self, experiment_id: int) -> dict[str, Any] | None:
"""
query = """
SELECT experiment_id, experiment_name, num_actors, num_cpus, num_gpus,
mlflow_experiment_id, status, num_shards, error, created_at
metric_experiment_id, status, num_shards, error, created_at
FROM experiments
WHERE experiment_id = ?
"""
Expand All @@ -254,7 +264,7 @@ def get_experiment(self, experiment_id: int) -> dict[str, Any] | None:
"num_actors": row[2],
"num_cpus": row[3],
"num_gpus": row[4],
"mlflow_experiment_id": row[5],
"metric_experiment_id": row[5],
"status": row[6],
"num_shards": row[7],
"error": row[8],
Expand Down Expand Up @@ -295,7 +305,7 @@ def get_running_experiment(self) -> dict[str, Any] | None:
Dictionary with all experiment fields, or None if no running experiment
"""
query = """
SELECT experiment_id, experiment_name, mlflow_experiment_id, num_shards,
SELECT experiment_id, experiment_name, metric_experiment_id, num_shards,
num_actors, num_cpus, num_gpus, status, error, created_at
FROM experiments
WHERE status = ?
Expand All @@ -308,7 +318,7 @@ def get_running_experiment(self) -> dict[str, Any] | None:
return {
"experiment_id": row[0],
"experiment_name": row[1],
"mlflow_experiment_id": row[2],
"metric_experiment_id": row[2],
"num_shards": row[3],
"num_actors": row[4],
"num_cpus": row[5],
Expand Down Expand Up @@ -509,7 +519,7 @@ def create_pipeline(
INSERT INTO pipelines (
context_id, pipeline_type,
pipeline_config, pipeline_config_json, status, error,
current_shard_id, shards_completed, total_samples_processed, mlflow_run_id
current_shard_id, shards_completed, total_samples_processed, metric_run_id
) VALUES (?, ?, ?, ?, ?, '', '', 0, 0, NULL)
"""
self.db.execute(
Expand Down Expand Up @@ -538,7 +548,7 @@ def set_pipeline_progress(self, pipeline_id: int) -> dict[str, Any] | None:
query = """
SELECT pipeline_id, context_id, pipeline_type,
pipeline_config, pipeline_config_json, status, current_shard_id,
shards_completed, total_samples_processed, mlflow_run_id, error, created_at
shards_completed, total_samples_processed, metric_run_id, error, created_at
FROM pipelines
WHERE pipeline_id = ?
"""
Expand All @@ -561,7 +571,7 @@ def set_pipeline_progress(self, pipeline_id: int) -> dict[str, Any] | None:
"current_shard_id": row[6],
"shards_completed": row[7],
"total_samples_processed": row[8],
"mlflow_run_id": row[9],
"metric_run_id": row[9],
"error": row[10],
"created_at": row[11],
}
Expand All @@ -580,7 +590,7 @@ def get_pipeline(self, pipeline_id: int) -> dict[str, Any] | None:
query = """
SELECT pipeline_id, context_id, pipeline_type,
pipeline_config, pipeline_config_json, status, current_shard_id,
shards_completed, total_samples_processed, mlflow_run_id, error, created_at
shards_completed, total_samples_processed, metric_run_id, error, created_at
FROM pipelines
WHERE pipeline_id = ?
"""
Expand All @@ -603,7 +613,7 @@ def get_pipeline(self, pipeline_id: int) -> dict[str, Any] | None:
"current_shard_id": row[6],
"shards_completed": row[7],
"total_samples_processed": row[8],
"mlflow_run_id": row[9],
"metric_run_id": row[9],
"error": row[10],
"created_at": row[11],
}
Expand Down Expand Up @@ -673,7 +683,7 @@ def get_all_pipelines(self) -> list[dict[str, Any]]:
query = """
SELECT pipeline_id, context_id, pipeline_type,
pipeline_config, pipeline_config_json, status, current_shard_id,
shards_completed, total_samples_processed, mlflow_run_id, error, created_at
shards_completed, total_samples_processed, metric_run_id, error, created_at
FROM pipelines
ORDER BY pipeline_id DESC
"""
Expand All @@ -698,7 +708,7 @@ def get_all_pipelines(self) -> list[dict[str, Any]]:
"current_shard_id": row[6],
"shards_completed": row[7],
"total_samples_processed": row[8],
"mlflow_run_id": row[9],
"metric_run_id": row[9],
"error": row[10],
"created_at": row[11],
}
Expand Down Expand Up @@ -765,16 +775,16 @@ def set_pipeline_error(self, pipeline_id: int, error: str):
query = "UPDATE pipelines SET error = ? WHERE pipeline_id = ?"
self.db.execute(query, (error, pipeline_id), commit=True)

def set_pipeline_mlflow_run_id(self, pipeline_id: int, mlflow_run_id: str):
def set_pipeline_metric_run_id(self, pipeline_id: int, metric_run_id: str):
"""
Set MLflow run ID for a pipeline.
Set MetricLogger run ID for a pipeline.

Args:
pipeline_id: ID of the pipeline
mlflow_run_id: MLflow run ID
metric_run_id: MetricLogger run ID
"""
query = "UPDATE pipelines SET mlflow_run_id = ? WHERE pipeline_id = ?"
self.db.execute(query, (mlflow_run_id, pipeline_id), commit=True)
query = "UPDATE pipelines SET metric_run_id = ? WHERE pipeline_id = ?"
self.db.execute(query, (metric_run_id, pipeline_id), commit=True)

# ============================================================================
# ACTOR_TASKS TABLE METHODS
Expand Down
Loading
Loading