diff --git a/.cursor/rules/hatchet-docs.mdc b/.cursor/rules/hatchet-docs.mdc new file mode 100644 index 0000000..7738938 --- /dev/null +++ b/.cursor/rules/hatchet-docs.mdc @@ -0,0 +1,12 @@ +--- +description: Hatchet documentation MCP server +alwaysApply: true +--- + +When working with Hatchet (task queues, workflows, durable execution), use the +Hatchet MCP docs server for accurate, up-to-date API reference and examples. + +MCP server URL: https://docs.hatchet.run/api/mcp + +Use the search_docs tool to find relevant documentation pages, or get_full_docs +for comprehensive context. Documentation covers Python, TypeScript, and Go SDKs. diff --git a/.env.scythe.training b/.env.scythe.training new file mode 100644 index 0000000..d2e3634 --- /dev/null +++ b/.env.scythe.training @@ -0,0 +1,9 @@ +SCYTHE_WORKER_SLOTS=1 +SCYTHE_WORKER_DOES_FAN=False +SCYTHE_WORKER_DOES_LEAF=True +SCYTHE_WORKER_HAS_GPU=True + +SCYTHE_TIMEOUT_EXPERIMENT_SCHEDULE=2h +SCYTHE_TIMEOUT_EXPERIMENT_EXECUTION=1h +SCYTHE_TIMEOUT_SCATTER_GATHER_SCHEDULE=10h +SCYTHE_TIMEOUT_SCATTER_GATHER_EXECUTION=10h diff --git a/.gitignore b/.gitignore index b31cb9f..3e3a71c 100644 --- a/.gitignore +++ b/.gitignore @@ -217,3 +217,5 @@ inputs/ .env.local.hatchet .env.local.host.hatchet + +scratch/ diff --git a/Makefile b/Makefile index 87de19d..67c54e6 100644 --- a/Makefile +++ b/Makefile @@ -71,6 +71,10 @@ simulations-native: ## Run the simulations fanouts-native: ## Run the fanouts @uv run --env-file .env.$(AWS_ENV).aws --env-file .env.$(HATCHET_ENV).hatchet --env-file .env.scythe.storage --env-file .env.scythe.fanouts worker +.PHONY: training-native +training-native: ## Run the training + @uv run --env-file .env.$(AWS_ENV).aws --env-file .env.$(HATCHET_ENV).hatchet --env-file .env.scythe.storage --env-file .env.scythe.training worker + .PHONY: viz-native viz-native: ## Run the visualization tool # TODO: possibly add env vars to the command @uv run streamlit run src/globi/tools/visualization/main.py diff --git 
a/docker-compose.yml b/docker-compose.yml index c5a1d8a..a66b0b3 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -20,6 +20,31 @@ services: - ./inputs:/code/inputs - ./outputs:/code/outputs - ./tests/data/e2e:/code/tests/data/e2e + training: + image: ${AWS_ACCOUNT_ID:-123456789012}.dkr.ecr.${AWS_REGION:-us-east-1}.amazonaws.com/hatchet/globi:${IMAGE_TAG:-latest} + build: + context: . + dockerfile: src/globi/worker/Dockerfile + args: + EP_VERSION: ${EP_VERSION:-25.2.0} + PYTHON_VERSION: ${PYTHON_VERSION:-3.12} + env_file: + - .env + - .env.${AWS_ENV:-local}.aws + - .env.${HATCHET_ENV:-local}.hatchet + - .env.scythe.storage + - .env.scythe.training + deploy: + mode: replicated + replicas: ${TRAINING_REPLICAS:-0} + resources: + reservations: + devices: + - capabilities: [gpu] # Requests access to all GPUs + volumes: + - ./inputs:/code/inputs + - ./outputs:/code/outputs + - ./tests/data/e2e:/code/tests/data/e2e fanouts: image: ${AWS_ACCOUNT_ID:-123456789012}.dkr.ecr.${AWS_REGION:-us-east-1}.amazonaws.com/hatchet/globi:${IMAGE_TAG:-latest} build: diff --git a/pyproject.toml b/pyproject.toml index 44de639..40f537e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,25 @@ visualization = [ "playwright>=1.40.0", ] +# ml = [ +# "lightgbm>=4.6.0", +# "xgboost>=3.2.0", +# "pytorch-tabular>=1.2.0", +# "torch>=2.5.0", +# "tensorboard>=2.20.0", +# "wandb>=0.25.0", +# ] + +ml-gpu = [ + "lightgbm>=4.6.0", + "xgboost>=3.2.0", + "numba>=0.63.1", + "pytorch-tabular>=1.2.0", + "torch>=2.5.0", + "tensorboard>=2.20.0", + "wandb>=0.25.0", +] + cli = [ "click>=8.1.7", "xlsxwriter>=3.2.9", @@ -78,9 +97,29 @@ docs = [ worker = "globi.worker.main:main" globi = "globi.tools.cli.main:cli" +[[tool.uv.index]] +name = "pytorch-cu128" +url = "https://download.pytorch.org/whl/cu128" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cpu" +url = "https://download.pytorch.org/whl/cpu" +explicit = true + +[[tool.uv.index]] +name = "pypi" +url = "https://pypi.org/simple" 
+explicit = true + [tool.uv.sources] +# PyTorch: CUDA 12.8 on Linux/Windows (where builds exist), PyPI (CPU) on macOS +torch = [ + { index = "pytorch-cu128", marker = "sys_platform != 'darwin'", extra = "ml-gpu" }, + { index = "pypi", marker = "sys_platform == 'darwin'", extra = "ml-gpu" }, +] # scythe-engine = {git = "https://github.com/szvsw/scythe", branch = "feature/allow-optional-filerefs"} -# scythe-engine = {git = "https://github.com/szvsw/scythe", branch = "feature/update-hatchet"} +scythe-engine = {git = "https://github.com/szvsw/scythe", branch = "feature/allow-versioning-workflows"} # scythe-engine = {path = "../scythe", editable = true} # epinterface = {path = "../epinterface", editable = true} # epinterface = {path = "epinterface", editable = true} diff --git a/src/globi/allocate.py b/src/globi/allocate.py index c7691db..988a022 100644 --- a/src/globi/allocate.py +++ b/src/globi/allocate.py @@ -102,7 +102,7 @@ def allocate_globi_experiment( raise ValueError(msg) experiment = BaseExperiment[ExperimentInputSpec, ExperimentOutputSpec]( - experiment=simulate_globi_building, run_name=name + runnable=simulate_globi_building, run_name=name ) print(f"Submitting {len(buildings_gdf)} buildings for experiment {name}") min_branches_required, _, _ = calculate_branching_factor(specs) @@ -182,7 +182,7 @@ def allocate_globi_dryrun( raise ValueError(msg) experiment = BaseExperiment[ExperimentInputSpec, ExperimentOutputSpec]( - experiment=simulate_globi_building, + runnable=simulate_globi_building, run_name=f"{config.name}/dryrun/{config.scenario}", ) diff --git a/src/globi/models/surrogate/__init__.py b/src/globi/models/surrogate/__init__.py new file mode 100644 index 0000000..d5affc9 --- /dev/null +++ b/src/globi/models/surrogate/__init__.py @@ -0,0 +1 @@ +"""Models used for the surrogate pipeline.""" diff --git a/src/globi/models/surrogate/configs/__init__.py b/src/globi/models/surrogate/configs/__init__.py new file mode 100644 index 0000000..2b2e032 --- /dev/null 
"""Configs for the surrogate model pipeline."""

import fnmatch
import re
from functools import cached_property
from pathlib import Path
from typing import Literal, cast

import numpy as np
import pandas as pd
from pydantic import BaseModel, Field
from scythe.base import ExperimentInputSpec
from scythe.experiments import SemVer, SerializableRunnable
from scythe.scatter_gather import RecursionMap, ScatterGatherResult
from scythe.utils.filesys import OptionalFileReference, S3Url

from globi.models.surrogate.configs.regression import ModelHPType, XGBHyperparameters
from globi.models.surrogate.samplers import Priors


class IterationSpec(BaseModel):
    """The iteration spec for the outer training loop (sample -> train -> check)."""

    n_per_iter: int | list[int] = Field(
        default=10_000,
        description="The number of samples to generate per generation. If the current iteration exceeds the length of the list, the last element will be used.",
    )
    min_per_stratum: int = Field(
        default=100, description="The minimum number of samples per stratum."
    )
    max_iters: int = Field(
        default=100,
        description="The maximum number of outer loop iterations to perform.",
    )
    recursion: RecursionMap = Field(
        default_factory=lambda: RecursionMap(factor=100, max_depth=1),
        description="The recursion spec.",
    )
    current_iter: int = Field(
        default=0,
        description="The index of the current training iteration within the outer loop.",
    )

    @property
    def at_max_iters(self) -> bool:
        """Whether the current iteration is the maximum number of iterations."""
        # current_iter is 0-indexed, hence the +1 before comparing to max_iters.
        return self.current_iter + 1 >= self.max_iters

    @property
    def n_per_gen_for_current_iter(self) -> int:
        """The number of samples to generate for the current iteration."""
        if isinstance(self.n_per_iter, int):
            return self.n_per_iter
        # Past the end of the schedule list, keep reusing the last entry.
        return self.n_per_iter[min(self.current_iter, len(self.n_per_iter) - 1)]


class StratificationSpec(BaseModel):
    """A spec for stratifying the data."""

    field: str = Field(
        default="feature.weather.file", description="The field to stratify by."
    )
    sampling: Literal["equal", "error-weighted", "proportional"] = Field(
        default="equal",
        description="The sampling method to use over the strata.",
    )
    aliases: list[str] = Field(
        default_factory=lambda: ["epwzip_path", "epw_path"],
        description="The alias to use for the stratum as a fallback.",
    )

    # TODO: consider allowing the stratification to be a compound with e.g. component_map_uri and semantic_fields_uri and database_uri


class CrossValidationSpec(BaseModel):
    """The cross validation spec."""

    n_folds: int = Field(
        default=5, description="The number of folds for the entire parent task."
    )


class ConvergenceThresholds(BaseModel):
    """The thresholds for convergence.

    All thresholds are optional; a metric with a None threshold is ignored
    by `check_convergence`.
    """

    mae: float | None = Field(
        default=None, description="The maximum MAE for convergence."
    )
    rmse: float | None = Field(
        default=None, description="The maximum RMSE for convergence."
    )
    mape: float | None = Field(
        default=None, description="The maximum MAPE for convergence."
    )
    r2: float | None = Field(
        default=None, description="The minimum R2 for convergence."
    )
    cvrmse: float | None = Field(
        default=None, description="The maximum CV_RMSE for convergence."
    )

    def check_convergence(self, metrics: pd.Series, target: re.Pattern | None = None):
        """Check if the metrics have converged.

        Note that this requires the metrics series to carry a MultiIndex with
        (at least) a "metric" level whose values match this model's field
        names ("mae", "rmse", "mape", "r2", "cvrmse") and a "target" level.
        A "stratum" level is presumably also present since downstream
        aggregation groups on it — TODO confirm against the metrics producer.

        Args:
            metrics: The metric values to test, indexed as described above.
            target: Optional compiled regex; when given, only index entries
                whose "target" level matches are checked.

        Returns:
            A boolean Series (same index shape as the checked metrics, minus
            entries whose threshold is unset) — True where converged.
        """
        # first, we select the data for the relevant targets:
        if target is not None:
            target_level = metrics.index.get_level_values("target")
            # Interpret target as a regex and match
            mask = cast(pd.Series, target_level.to_series().astype(str)).str.match(
                target
            )
            metrics = cast(pd.Series, metrics.loc[mask.values])

        thresholds = pd.Series(self.model_dump(), name="metric")

        # first, we will select the appropriate threshold for each metric
        comparators = thresholds.loc[metrics.index.get_level_values("metric")]

        # we can then copy over the index safely
        comparators.index = metrics.index

        # we will ignore any thresholds that are not set or are NaN
        comparators_are_na = comparators.isna()

        # next, we will flip the sign of the r2 metric since it is a maximization metric rather than min
        metrics = metrics * np.where(
            metrics.index.get_level_values("metric") == "r2", -1, 1
        )
        comparators = comparators * np.where(
            comparators.index.get_level_values("metric") == "r2", -1, 1
        )

        # run the comparisons
        comparison = metrics < comparators
        comparison = comparison.loc[~comparators_are_na]

        return comparison


class ConvergenceThresholdsByTarget(BaseModel):
    """The thresholds for convergence by target.

    Keys of `thresholds` are fnmatch-style glob patterns matched against the
    "target" index level (e.g. "*" matches every target).
    """

    thresholds: dict[str, ConvergenceThresholds] = Field(
        default_factory=lambda: {"*": ConvergenceThresholds()},
        description="The thresholds for convergence by target.",
    )

    def make_comparisons(self, metrics: pd.Series) -> list[pd.Series]:
        """Generate a list of all stratum/target/metric True/False comparisons."""
        return [
            # glob key -> regex so check_convergence can match target names
            self.thresholds[target].check_convergence(
                metrics, re.compile(fnmatch.translate(target))
            )
            for target in self.thresholds
        ]

    def combine_and_check_strata_and_targets(self, comparisons: list[pd.Series]):
        """Combine the comparisons and aggregate first by targets then by strata.

        Returns:
            Tuple of (all-converged bool, per-stratum Series, per
            stratum-and-target Series, raw combined comparison Series).
        """
        comparison = pd.concat(comparisons, axis=0)
        # now we will groupby the stratum (e.g. features.weather.file)
        # and by the target (e.g. Electricity, Gas, etc.)
        # we are converged if any of the metrics have converged for that target
        # in that stratum
        comparison_stratum_and_target = comparison.groupby(
            level=[lev for lev in comparison.index.names if lev != "metric"]
        ).any()  # TODO: make it configurable such that instead of `any`, we can specify a count, i.e. at least 2 must be converged

        # then we will check that all targets have converged for each stratum

        # only levels left in multiindex should be stratum and target

        comparison_strata = comparison_stratum_and_target.groupby(level="stratum").all()

        # finally, we will check that all strata have converged
        comparison_all = comparison_strata.all()

        return (
            comparison_all,
            comparison_strata,
            comparison_stratum_and_target,
            comparison,
        )

    def run(self, metrics: pd.Series) -> tuple[bool, pd.Series, pd.Series, pd.Series]:
        """Run the convergence criteria."""
        comparisons = self.make_comparisons(metrics)
        return self.combine_and_check_strata_and_targets(comparisons)


class TargetsConfigSpec(BaseModel):
    """The targets config spec."""

    columns: list[str] = Field(
        default_factory=list, description="The columns to use as targets."
    )
    normalization: Literal["min-max", "standard", "none"] = Field(
        default="none", description="The normalization method to use."
    )


class FeatureConfigSpec(BaseModel):
    """The feature config spec."""

    continuous_columns: frozenset[str] = Field(
        default=frozenset(), description="The continuous columns to use as features."
    )
    categorical_columns: frozenset[str] = Field(
        default=frozenset(), description="The categorical columns to use as features."
    )
    exclude_columns: frozenset[str] = Field(
        default=frozenset(),
        description="The columns to exclude from the features.",
    )
    cont_cat_unicity_transition_threshold: int = Field(
        default=10,
        description="The threshold for the number of unique values to transition from continuous to categorical variable.",
    )


class RegressionIOConfigSpec(BaseModel):
    """The input/output spec for a regression model."""

    targets: TargetsConfigSpec = Field(
        default_factory=TargetsConfigSpec, description="The targets config spec."
    )
    features: FeatureConfigSpec = Field(
        default_factory=FeatureConfigSpec,
        description="The features config spec.",
    )


class ProgressiveTrainingSpec(ExperimentInputSpec, SerializableRunnable):
    """A spec for iteratively training an SBEM regression model.

    NOTE(review): `self.prefix`, `self.storage_settings`, `self.fetch_uri`,
    and `self.experiment_id` used below are presumably inherited from
    ExperimentInputSpec — confirm against scythe.
    """

    base_run_name: str = Field(
        ...,
        description="The base run name for the experiment.",
    )
    convergence_criteria: ConvergenceThresholdsByTarget = Field(
        default_factory=ConvergenceThresholdsByTarget,
        description="The convergence criteria.",
    )
    regression_io_config: RegressionIOConfigSpec = Field(
        default_factory=RegressionIOConfigSpec,
        description="The regression io config spec.",
    )
    hyperparameters: ModelHPType = Field(
        default_factory=XGBHyperparameters,
        description="The hyperparameters for the model.",
    )
    stratification: StratificationSpec = Field(
        default_factory=StratificationSpec,
        description="The stratification spec.",
    )
    samplers: Priors = Field(
        ...,
        description="The sampling spec.",
    )
    cross_val: CrossValidationSpec = Field(
        default_factory=CrossValidationSpec,
        description="The cross validation spec.",
    )
    iteration: IterationSpec = Field(
        default_factory=IterationSpec,
        description="The iteration spec.",
    )
    context: OptionalFileReference = Field(
        default=None,
        description="The uri of the gis data to train on.",
    )
    data_uris: ScatterGatherResult | None = Field(
        default=None,
        description="The uri of the previous simulation results to train on.",
    )
    metrics_uris: list[ScatterGatherResult] = Field(
        default_factory=list,
        description="The uris of the iteration metrics from previous iterations.",
    )
    previous_experiment_ids: list[str] = Field(
        default_factory=list,
        description="The ids of the previous experiments.",
    )

    def format_combined_output_key(self, key: str) -> str:
        """Format the output key for a combined result file."""
        return f"{self.prefix}/combined/data/{key}.parquet"

    def format_combined_output_uri(self, key: str) -> S3Url:
        """Format the output uri for a combined result file."""
        if self.storage_settings is None:
            msg = "Storage settings are not set, so we can't construct a combined output uri."
            raise ValueError(msg)
        return S3Url(
            f"s3://{self.storage_settings.BUCKET}/{self.format_combined_output_key(key)}"
        )

    def format_metrics_output_key(self, key: str) -> str:
        """Format the output key for a metrics file."""
        return f"{self.prefix}/combined/metrics/{key}.parquet"

    def format_metrics_output_uri(self, key: str) -> S3Url:
        """Format the output uri for a metrics file."""
        if self.storage_settings is None:
            msg = "Storage settings are not set, so we can't construct a metrics output uri."
            raise ValueError(msg)
        return S3Url(
            f"s3://{self.storage_settings.BUCKET}/{self.format_metrics_output_key(key)}"
        )

    def format_summary_manifest_key(self) -> str:
        """Format the output key for a summary manifest file."""
        return f"{self.prefix}/summary.yml"

    def format_summary_manifest_uri(self) -> S3Url:
        """Format the output uri for a summary manifest file."""
        if self.storage_settings is None:
            msg = "Storage settings are not set, so we can't construct a summary manifest uri."
            raise ValueError(msg)
        return S3Url(
            f"s3://{self.storage_settings.BUCKET}/{self.format_summary_manifest_key()}"
        )

    def subrun_name(self, subrun: Literal["sample", "train"]) -> str:
        """Format the run name for a subrun."""
        return f"{self.experiment_id}/{subrun}"

    @property
    def context_path(self) -> Path | None:
        """The path to the gis data (local path, fetching the uri if remote)."""
        if self.context is None:
            return None
        if isinstance(self.context, Path):
            return self.context
        return self.fetch_uri(self.context)

    @cached_property
    def context_data(self) -> pd.DataFrame | None:
        """Load the gis data."""
        if self.context_path is None:
            return None
        return pd.read_parquet(self.context_path)

    @property
    def current_version(self) -> SemVer:
        """The current version, parsed from the experiment id.

        NOTE(review): raises IndexError if no "/"-separated piece of
        experiment_id starts with "v" — confirm ids always embed a version.
        """
        vstr = [
            piece for piece in self.experiment_id.split("/") if piece.startswith("v")
        ][-1]
        return SemVer.FromString(vstr)


class StageSpec(BaseModel):
    """A spec that is common to both the sample and train stages (and possibly others)."""

    parent: ProgressiveTrainingSpec = Field(
        ...,
        description="The parent spec.",
    )

    @cached_property
    def random_generator(self) -> np.random.Generator:
        """The random generator, seeded by the current iteration for reproducibility."""
        return np.random.default_rng(self.parent.iteration.current_iter)
"""Configs for the surrogate model pipeline."""

import warnings
from typing import Any, Literal

from pydantic import BaseModel, Field


class XGBTrainerConfig(BaseModel):
    """The trainer hyperparameters for the xgboost model."""

    num_boost_round: int = Field(
        default=4000, description="The number of boosting rounds."
    )
    early_stopping_rounds: int = Field(
        default=10, description="The number of boosting rounds to early stop."
    )
    verbose_eval: bool = Field(
        default=True, description="Whether to print verbose evaluation results."
    )


class XGBModelConfig(BaseModel):
    """The model hyperparameters for the xgboost model."""

    max_depth: int = Field(default=5, description="The maximum depth of the tree.")
    eta: float = Field(default=0.1, description="The learning rate.")
    min_child_weight: int | None = Field(
        default=3, description="The minimum child weight."
    )
    subsample: float | None = Field(default=None, description="The subsample rate.")
    colsample_bytree: float | None = Field(
        default=None, description="The column sample by tree rate."
    )
    alpha: float | None = Field(default=None, description="The alpha parameter.")
    # Named `lam` because `lambda` is a Python keyword; remapped to xgboost's
    # `lambda` key in `param_dict` below.
    lam: float | None = Field(default=None, description="The lambda parameter.")
    gamma: float | None = Field(default=None, description="The gamma parameter.")
    seed: int = Field(
        default=42, description="The seed for the random number generator."
    )

    @property
    def param_dict(self) -> dict[str, Any]:
        """The dictionary of parameters to pass to xgboost.

        Returns:
            dict[str, Any]: objective/eval-metric/tree-method defaults plus
                all non-None hyperparameters, with `device="cuda"` added when
                a GPU is available (warns and falls back to CPU otherwise).
        """
        import torch

        hp = self.model_dump(exclude_none=True)
        # BUGFIX: xgboost's L2 regularization parameter is spelled `lambda`
        # (a Python reserved word, hence the `lam` field name). Without this
        # remap xgboost would receive an unknown `lam` key and the L2
        # regularization would silently never be applied.
        if "lam" in hp:
            hp["lambda"] = hp.pop("lam")

        params = {
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "tree_method": "auto",
            "seed": self.seed,
            # hyperparameters
            **hp,
        }
        if torch.cuda.is_available():
            params["device"] = "cuda"
        else:
            warnings.warn("CUDA is not available, using CPU.", stacklevel=3)
        return params


class XGBHyperparameters(BaseModel):
    """The parameters for the xgboost model."""

    hp: XGBModelConfig = Field(
        default_factory=XGBModelConfig,
        description="The hyperparameters for the model.",
    )
    trainer: XGBTrainerConfig = Field(
        default_factory=XGBTrainerConfig,
        description="The trainer hyperparameters for the model.",
    )


class LGBHyperparameters(BaseModel):
    """The parameters for the lightgbm model."""

    objective: Literal["regression", "binary", "multiclass"] = Field(
        default="regression", description="The objective function to use."
    )
    metric: Literal["rmse"] = Field(
        default="rmse", description="The metric to optimize."
    )
    # TODO: add other parameters as needed


# Discriminated by structure: XGB carries hp/trainer, LGB carries objective/metric.
ModelHPType = XGBHyperparameters | LGBHyperparameters
SimpleSyntheticProblem( + n_inputs, + n_outputs, + seed=42, + ) + y = problem.evaluate(np.array(input_spec.values)) + + main_result = pd.DataFrame({f"y{i}": [y[i]] for i in range(1, n_outputs)}) + main_result = main_result.set_index(input_spec.make_multiindex()) + main_result_neg = -main_result + main_result = pd.concat( + [main_result, main_result_neg], + axis=1, + keys=["positive", "negative"], + names=["sign"], + ) + return DummySimulationOutput( + y0=y[0], + dataframes={"main_result": main_result}, + ) + + +class SimpleSyntheticProblem: + """A simple synthetic problem.""" + + def __init__(self, n_inputs: int, n_outputs: int, seed: int): + """Initialize the simple synthetic problem.""" + self.n_inputs = n_inputs + self.n_outputs = n_outputs + rng = np.random.default_rng(seed) + + self.alpha = rng.normal(size=n_outputs) + self.beta = rng.normal(scale=0.8, size=(n_outputs, n_inputs)) + self.gamma = rng.normal(scale=0.4, size=(n_outputs, n_inputs)) + self.delta = rng.normal(scale=0.3, size=(n_outputs, max(0, n_inputs - 1))) + self.eta = rng.normal(scale=0.2, size=n_outputs) + self.sine_dim = rng.integers(0, n_inputs, size=n_outputs) + + def evaluate(self, x: np.ndarray) -> np.ndarray: + """Evaluate the simple synthetic problem.""" + x = np.asarray(x, dtype=float) + x = np.clip(x, 0.0, 1.0) + + linear = self.beta @ x + quad = self.gamma @ (x**2) + + if self.n_inputs > 1: + pairwise_terms = x[:-1] * x[1:] + pairwise = self.delta @ pairwise_terms + else: + pairwise = np.zeros(self.n_outputs) + + periodic = np.array([ + self.eta[j] * math.sin(2 * math.pi * x[self.sine_dim[j]]) + for j in range(self.n_outputs) + ]) + + return self.alpha + linear + quad + pairwise + periodic diff --git a/src/globi/models/surrogate/outputs.py b/src/globi/models/surrogate/outputs.py new file mode 100644 index 0000000..961df92 --- /dev/null +++ b/src/globi/models/surrogate/outputs.py @@ -0,0 +1,57 @@ +"""Outputs for the surrogate model pipeline.""" + +from typing import Literal + +from 
pydantic import BaseModel +from scythe.experiments import ExperimentRun +from scythe.scatter_gather import ScatterGatherResult +from scythe.utils.filesys import S3Url + +from globi.models.surrogate.training import TrainWithCVSpec + + +class CombineResultsResult(BaseModel): + """The result of combining the results of the simulations.""" + + incoming: ScatterGatherResult + combined: ScatterGatherResult + + +# TODO: This should perhaps go somewhere else since it is generally useful. +# (most likely into scythe itself) +class ExperimentRunWithRef(BaseModel): + """An experiment run with a workflow run id.""" + + run: ExperimentRun + workflow_run_id: str + + +class StartTrainingResult(BaseModel): + """The result of starting the training.""" + + training_spec: TrainWithCVSpec + experiment_run_with_ref: ExperimentRunWithRef + + +class TrainingEvaluationResult(BaseModel): + """The result of evaluating the training.""" + + converged: bool + # TODO: possibly get rid of this since we have nice combined outputs already. + metrics: dict + + +class RecursionTransition(BaseModel): + """The transition of the recursion.""" + + reasoning: Literal["max_depth", "converged"] | None + child_workflow_run_id: str | None + + +class FinalizeResult(BaseModel): + """The result of finalizing the training.""" + + reasoning: Literal["max_depth", "converged"] | None + data_uris: dict[str, S3Url] + metrics_uris: dict[str, S3Url] + experiment_ids: list[str] diff --git a/src/globi/models/surrogate/samplers.py b/src/globi/models/surrogate/samplers.py new file mode 100644 index 0000000..32acb4c --- /dev/null +++ b/src/globi/models/surrogate/samplers.py @@ -0,0 +1,720 @@ +"""Conditional Priors and Samplers. 
from abc import ABC, abstractmethod
from typing import Literal, cast

import networkx as nx
import numpy as np
import pandas as pd
from pydantic import BaseModel, model_validator

# TODO: Make sure that all of the samplers can be serialized and deserialized
# with proper discrimination, i.e. that they do not share identical field names.


class SamplingError(Exception):
    """A sampling error."""

    pass


class Sampler(ABC):
    """A sampler.

    Concrete samplers draw ``n`` values, optionally conditioned on columns of a
    context dataframe, using the supplied numpy generator for reproducibility.
    """

    @abstractmethod
    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Sample features from a prior, which may depend on a context.

        Args:
            context: Dataframe of already-available features.
            n: Number of samples to draw.
            generator: Seeded numpy generator.

        Returns:
            An array of ``n`` sampled values.
        """
        pass

    @property
    @abstractmethod
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        pass


class UniformSampler(BaseModel, Sampler):
    """A uniform sampler which generates values uniformly between a min and max value."""

    min: float
    max: float
    # Optional rounding applied after drawing the raw uniform values.
    round: Literal["ceil", "floor", "nearest"] | None = None

    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Sample uniformly between a min and max value."""
        samples = generator.uniform(self.min, self.max, size=n)
        if self.round == "ceil":
            samples = np.ceil(samples)
        elif self.round == "floor":
            samples = np.floor(samples)
        elif self.round == "nearest":
            samples = np.round(samples)
        return samples

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return set()


class ClippedNormalSampler(BaseModel, Sampler):
    """A clipped normal sampler which generates values from a normal distribution, clipped to a min and max value."""

    mean: float
    std: float
    # None disables clipping on that side (treated as +/- infinity below).
    clip_min: float | None = None
    clip_max: float | None = None

    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Sample from a normal distribution, clipped to a min and max value."""
        clip_min = self.clip_min if self.clip_min is not None else -np.inf
        clip_max = self.clip_max if self.clip_max is not None else np.inf
        samples = generator.normal(self.mean, self.std, size=n).clip(clip_min, clip_max)
        return samples

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return set()


class FixedValueSampler(BaseModel, Sampler):
    """A fixed value sampler which generates a fixed value for all samples."""

    value: float | str | int | bool

    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Sample a fixed value."""
        return np.full(n, self.value)

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return set()


class CategoricalSampler(BaseModel, Sampler):
    """A categorical sampler which generates values from a categorical distribution."""

    values: list[str] | list[float] | list[int]
    weights: list[float]

    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Sample from a categorical distribution."""
        return generator.choice(self.values, size=n, p=self.weights)

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return set()

    @model_validator(mode="after")
    def check_values_and_weights(self):
        """Check that the values and weights are the same length and normalized."""
        if len(self.values) != len(self.weights):
            msg = "values and weights must be the same length"
            raise ValueError(msg)
        total = sum(self.weights)
        if total <= 0:
            # Guard against a ZeroDivisionError (or nonsensical negative
            # probabilities) when normalizing below.
            msg = "weights must sum to a positive value"
            raise ValueError(msg)
        if not np.isclose(total, 1):
            self.weights = [w / total for w in self.weights]
        return self
class CopySampler(BaseModel, Sampler):
    """A deterministic sampler which generates a copy of a feature in the provided context dataframe."""

    feature_to_copy: str

    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Compute a copy of a feature."""
        if self.feature_to_copy not in context.columns:
            msg = f"Feature to copy {self.feature_to_copy} not found in context dataframe."
            raise SamplingError(msg)
        if len(context) != n:
            msg = (
                f"Context dataframe must have {n} rows, but it has {len(context)} rows."
            )
            raise SamplingError(msg)
        return context[self.feature_to_copy].to_numpy()

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return {self.feature_to_copy}


class AddValueSampler(BaseModel, Sampler):
    """A deterministic sampler which adds a value to a feature."""

    feature_to_add_to: str
    value_to_add: float

    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Compute a sum of a feature and a value."""
        if self.feature_to_add_to not in context.columns:
            msg = f"Feature to add to {self.feature_to_add_to} not found in context dataframe."
            raise SamplingError(msg)
        # Row-count check added for consistency with the sibling samplers;
        # previously a mismatched context silently yielded the wrong length.
        if len(context) != n:
            msg = (
                f"Context dataframe must have {n} rows, but it has {len(context)} rows."
            )
            raise SamplingError(msg)
        return context[self.feature_to_add_to].to_numpy() + self.value_to_add

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return {self.feature_to_add_to}


class SumValuesSampler(BaseModel, Sampler):
    """A deterministic sampler which generates a sum of features."""

    features_to_sum: list[str]

    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Compute a sum of features."""
        if not all(f in context.columns for f in self.features_to_sum):
            msg = f"All features to sum {self.features_to_sum} must be found in context dataframe."
            raise SamplingError(msg)
        if len(context) != n:
            msg = (
                f"Context dataframe must have {n} rows, but it has {len(context)} rows."
            )
            raise SamplingError(msg)
        return np.sum(context[self.features_to_sum].to_numpy(), axis=1)

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return set(self.features_to_sum)


class MultiplyValueSampler(BaseModel, Sampler):
    """A deterministic sampler which generates a product of a feature and a value."""

    feature_to_multiply: str
    value_to_multiply: float

    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Compute a multiply of a feature."""
        if self.feature_to_multiply not in context.columns:
            msg = f"Feature to multiply {self.feature_to_multiply} not found in context dataframe."
            raise SamplingError(msg)
        if len(context) != n:
            msg = (
                f"Context dataframe must have {n} rows, but it has {len(context)} rows."
            )
            raise SamplingError(msg)
        return context[self.feature_to_multiply].to_numpy() * self.value_to_multiply

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return {self.feature_to_multiply}


class ProductValuesSampler(BaseModel, Sampler):
    """A deterministic sampler which generates a product of features."""

    features_to_multiply: list[str]

    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Compute a product of features."""
        if not all(f in context.columns for f in self.features_to_multiply):
            msg = f"All features to multiply {self.features_to_multiply} must be found in context dataframe."
            raise SamplingError(msg)
        if len(context) != n:
            msg = (
                f"Context dataframe must have {n} rows, but it has {len(context)} rows."
            )
            raise SamplingError(msg)
        return np.prod(context[self.features_to_multiply].to_numpy(), axis=1)

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return set(self.features_to_multiply)


class InvertSampler(BaseModel, Sampler):
    """A deterministic sampler which generates the multiplicative inverse of a feature."""

    feature_to_invert: str

    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Compute an invert of a feature."""
        if self.feature_to_invert not in context.columns:
            msg = f"Feature to invert {self.feature_to_invert} not found in context dataframe."
            raise SamplingError(msg)
        if len(context) != n:
            msg = (
                f"Context dataframe must have {n} rows, but it has {len(context)} rows."
            )
            raise SamplingError(msg)
        # NOTE(review): zero inputs yield inf (numpy divide semantics) rather
        # than raising — confirm downstream consumers tolerate inf values.
        return 1 / context[self.feature_to_invert].to_numpy()

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return {self.feature_to_invert}


class PowerSampler(BaseModel, Sampler):
    """A deterministic sampler which generates a power of a feature."""

    feature_to_power: str
    power: float

    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Compute a power of a feature."""
        # Presence/row-count checks added for consistency with the sibling
        # samplers; previously a missing column raised a raw KeyError.
        if self.feature_to_power not in context.columns:
            msg = f"Feature to power {self.feature_to_power} not found in context dataframe."
            raise SamplingError(msg)
        if len(context) != n:
            msg = (
                f"Context dataframe must have {n} rows, but it has {len(context)} rows."
            )
            raise SamplingError(msg)
        return context[self.feature_to_power].to_numpy() ** self.power

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return {self.feature_to_power}


class LogSampler(BaseModel, Sampler):
    """A deterministic sampler which generates a log of a feature."""

    feature_to_log: str
    # Natural log by default; the change-of-base formula is applied below.
    base: float = np.e

    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Compute a log of a feature."""
        if self.feature_to_log not in context.columns:
            msg = (
                f"Feature to log {self.feature_to_log} not found in context dataframe."
            )
            raise SamplingError(msg)
        if len(context) != n:
            msg = (
                f"Context dataframe must have {n} rows, but it has {len(context)} rows."
            )
            raise SamplingError(msg)
        return np.log(context[self.feature_to_log].to_numpy()) / np.log(self.base)

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return {self.feature_to_log}
class RoundSampler(BaseModel, Sampler):
    """A deterministic sampler which applies ceil, floor, or nearest to a feature."""

    feature_to_round: str
    operation: Literal["ceil", "floor", "nearest"]

    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Apply ceil, floor, or nearest to a feature."""
        if self.feature_to_round not in context.columns:
            msg = f"Feature to round {self.feature_to_round} not found in context dataframe."
            raise SamplingError(msg)
        if len(context) != n:
            msg = (
                f"Context dataframe must have {n} rows, but it has {len(context)} rows."
            )
            raise SamplingError(msg)
        values = context[self.feature_to_round].to_numpy()
        if self.operation == "ceil":
            return np.ceil(values)
        if self.operation == "floor":
            return np.floor(values)
        # "nearest" is the only remaining literal value.
        return np.round(values)

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return {self.feature_to_round}


class ConcatenateFeaturesSampler(BaseModel, Sampler):
    """A deterministic sampler which concatenates features.

    Retained for backward compatibility. Prefer MultiColumnConditionalPrior
    for multi-column conditioning instead of creating intermediate compound key columns.
    """

    features_to_concatenate: list[str]
    separator: str = ":"

    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Compute a concatenation of features."""
        if not all(f in context.columns for f in self.features_to_concatenate):
            msg = f"All features to concatenate {self.features_to_concatenate} must be found in context dataframe."
            raise SamplingError(msg)
        if len(context) != n:
            msg = (
                f"Context dataframe must have {n} rows, but it has {len(context)} rows."
            )
            raise SamplingError(msg)
        cols: pd.DataFrame = cast(pd.DataFrame, context[self.features_to_concatenate])
        # Stringify every column so mixed dtypes join cleanly.
        return cols.astype(str).agg(self.separator.join, axis=1).to_numpy()

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return set(self.features_to_concatenate)


# Union of all concrete samplers; members must keep distinct field names so
# that pydantic can discriminate between them during (de)serialization.
PriorSampler = (
    UniformSampler
    | ClippedNormalSampler
    | FixedValueSampler
    | CategoricalSampler
    | CopySampler
    | AddValueSampler
    | SumValuesSampler
    | MultiplyValueSampler
    | ProductValuesSampler
    | InvertSampler
    | LogSampler
    | RoundSampler
    | ConcatenateFeaturesSampler
    | PowerSampler
)


class ConditionalPriorCondition(BaseModel):
    """A conditional prior condition.

    Pairs a single match value with the sampler to use when the source
    feature equals that value.
    """

    match_val: str | float | int | bool
    sampler: PriorSampler

    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Sample from a conditional prior condition."""
        return self.sampler.sample(context, n, generator)

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return self.sampler.depends_on


class MultiColumnCondition(BaseModel):
    """A condition that matches on multiple source features simultaneously.

    Used with MultiColumnConditionalPrior to condition on combinations
    of column values without creating intermediate compound key columns.
    """

    # One value per source feature, in the same order as the parent prior's
    # source_features list.
    match_vals: tuple[str | float | int | bool, ...]
    sampler: PriorSampler

    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Sample from this condition's sampler."""
        return self.sampler.sample(context, n, generator)

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return self.sampler.depends_on
class PriorABC(ABC):
    """A prior."""

    @abstractmethod
    def sample(
        self, context: pd.DataFrame, n: int, generator: np.random.Generator
    ) -> np.ndarray:
        """Sample from a prior."""
        pass

    @property
    @abstractmethod
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        pass


class ConditionalPrior(BaseModel, PriorABC):
    """A conditional prior that selects a sampler based on a single source feature."""

    source_feature: str
    conditions: list[ConditionalPriorCondition]
    fallback_prior: PriorSampler | None

    def sample(self, context: pd.DataFrame, n: int, generator: np.random.Generator):
        """Sample from a conditional prior.

        Raises:
            SamplingError: If the source feature is missing, the context has the
                wrong number of rows, or some rows match no condition and no
                fallback prior is configured.
        """
        # Context validation added for consistency with MultiColumnConditionalPrior.
        if self.source_feature not in context.columns:
            msg = f"Source feature {self.source_feature} not found in context dataframe."
            raise SamplingError(msg)
        if len(context) != n:
            msg = (
                f"Context dataframe must have {n} rows, but it has {len(context)} rows."
            )
            raise SamplingError(msg)

        # Eagerly sample every branch, then select per-row via masks.
        conditional_samples = {
            c.match_val: c.sampler.sample(context, n, generator)
            for c in self.conditions
        }
        test_feature = context[self.source_feature].to_numpy()

        final = np.full(n, np.nan)

        any_matched_mask = np.full(n, False)
        for match_val, samples_for_match_val in conditional_samples.items():
            mask = test_feature == match_val
            any_matched_mask = any_matched_mask | mask
            final = np.where(mask, samples_for_match_val, final)

        if self.fallback_prior is not None:
            mask = ~any_matched_mask
            final = np.where(
                mask, self.fallback_prior.sample(context, n, generator), final
            )

        if np.isnan(final).any():
            msg = (
                "Final array contains NaN values; possibly due to an unmatched value for "
                f"feature {self.source_feature}."
            )
            raise SamplingError(msg)

        return final

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return {self.source_feature} | {
            dependency for c in self.conditions for dependency in c.depends_on
        }


class MultiColumnConditionalPrior(BaseModel, PriorABC):
    """A conditional prior that selects a sampler based on multiple source features.

    This eliminates the need for ConcatenateFeaturesSampler + compound key columns.
    Instead of creating an intermediate concatenated column and matching on strings,
    this prior directly matches on tuples of column values.

    Example usage::

        prior = MultiColumnConditionalPrior(
            source_features=["Typology", "Age_bracket"],
            conditions=[
                MultiColumnCondition(
                    match_vals=("SFH", "pre_1975"),
                    sampler=CategoricalSampler(values=[...], weights=[...]),
                ),
                MultiColumnCondition(
                    match_vals=("MFH", "post_2003"),
                    sampler=UniformSampler(min=0.5, max=1.0),
                ),
            ],
            fallback_prior=CategoricalSampler(values=[...], weights=[...]),
        )
    """

    source_features: list[str]
    conditions: list[MultiColumnCondition]
    fallback_prior: PriorSampler | None

    @model_validator(mode="after")
    def validate_condition_lengths(self):
        """Ensure all conditions have match_vals aligned with source_features."""
        for i, c in enumerate(self.conditions):
            if len(c.match_vals) != len(self.source_features):
                msg = (
                    f"Condition {i}: match_vals length {len(c.match_vals)} "
                    f"!= source_features length {len(self.source_features)}"
                )
                raise ValueError(msg)
        return self

    def sample(self, context: pd.DataFrame, n: int, generator: np.random.Generator):
        """Sample from a multi-column conditional prior.

        Raises:
            SamplingError: If a source feature is missing, the context has the
                wrong number of rows, or some rows match no condition and no
                fallback prior is configured.
        """
        for f in self.source_features:
            if f not in context.columns:
                msg = f"Source feature {f} not found in context dataframe."
                raise SamplingError(msg)
        if len(context) != n:
            msg = (
                f"Context dataframe must have {n} rows, but it has {len(context)} rows."
            )
            raise SamplingError(msg)

        # One tuple of source-feature values per row, matched against each
        # condition's match_vals.
        row_tuples = list(
            zip(*(context[f].to_numpy() for f in self.source_features), strict=True)
        )
        conditional_samples = {
            c.match_vals: c.sampler.sample(context, n, generator)
            for c in self.conditions
        }

        final = np.full(n, np.nan)
        any_matched = np.full(n, False)

        for match_vals, samples in conditional_samples.items():
            mask = np.array([t == match_vals for t in row_tuples])
            any_matched |= mask
            final = np.where(mask, samples, final)

        if self.fallback_prior is not None:
            final = np.where(
                ~any_matched,
                self.fallback_prior.sample(context, n, generator),
                final,
            )

        if np.isnan(final).any():
            # Surface a few offending tuples to make debugging easier.
            unmatched_examples = [
                row_tuples[i] for i in range(n) if not any_matched[i]
            ][:5]
            msg = (
                "Final array contains NaN values; possibly due to unmatched values for "
                f"features {self.source_features}. Examples of unmatched tuples: {unmatched_examples}"
            )
            raise SamplingError(msg)

        return final

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return set(self.source_features) | {
            dependency for c in self.conditions for dependency in c.sampler.depends_on
        }
class UnconditionalPrior(BaseModel, PriorABC):
    """An unconditional prior."""

    sampler: PriorSampler

    def sample(self, context: pd.DataFrame, n: int, generator: np.random.Generator):
        """Sample from an unconditional prior."""
        return self.sampler.sample(context, n, generator)

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return self.sampler.depends_on


Prior = UnconditionalPrior | ConditionalPrior | MultiColumnConditionalPrior


class Priors(BaseModel):
    """A collection of priors defining a dependency graph for sampling.

    The sampled_features dict must be ordered such that dependencies come before
    dependents (i.e. topological order). Sampling iterates in dict order.

    TODO: Add automatic topological sort and validation that all required
    target model fields appear as terminal nodes in the graph.
    """

    sampled_features: dict[str, Prior]

    def sample(self, context: pd.DataFrame, n: int, generator: np.random.Generator):
        """Sample from all priors in dependency order.

        Args:
            context: Dataframe of pre-existing features; not mutated.
            n: Number of samples to draw for each feature.
            generator: Seeded numpy generator.

        Returns:
            A copy of the context with one additional column per sampled feature.

        Raises:
            SamplingError: If the resulting dataframe contains NaN values.
        """
        working_df = context.copy(deep=True)
        # Iteration relies on sampled_features being topologically ordered
        # (see class docstring); cycles are not yet detected here.
        for feature, prior in self.sampled_features.items():
            working_df[feature] = prior.sample(working_df, n, generator)
        if working_df.isna().any().any():  # pyright: ignore [reportAttributeAccessIssue]
            # NOTE: this also fires on NaNs already present in the input
            # context, not only on unmatched conditional values.
            # TODO: allow na values eg in training?
            msg = "Working dataframe contains NaN values; possibly due to an unmatched value."
            raise SamplingError(msg)
        return working_df

    @property
    def depends_on(self) -> set[str]:
        """The features that this sampler depends on."""
        return {
            dependency
            for prior in self.sampled_features.values()
            for dependency in prior.depends_on
        }

    @property
    def dependency_graph(self) -> nx.DiGraph:
        """Construct a dependency graph between columns in the context dataframe.

        Edges connect *from* the dependency *to* the dependent feature. Every
        sampled feature is added as a node even when it has no dependencies,
        so graph queries (e.g. nx.descendants) never raise for isolated
        features.
        """
        g = nx.DiGraph()
        for feature, prior in self.sampled_features.items():
            # Register the node unconditionally: previously dependency-free
            # features were absent from the graph, which made nx.descendants
            # raise and root_features incomplete.
            g.add_node(feature)
            for dependency in prior.depends_on:
                g.add_edge(dependency, feature)
        return g

    @property
    def root_features(self) -> set[str]:
        """The features that have no dependencies."""
        return {
            node
            for node in self.dependency_graph.nodes
            if self.dependency_graph.in_degree(node) == 0
        }

    def select_prior_tree_for_changed_features(
        self, changed_features: set[str], resample_changed_features: bool = True
    ) -> "Priors":
        """Select the prior tree for the changed features.

        Returns a new Priors object with only the priors that are
        downstream of the changed features.

        Args:
            changed_features: The features that have changed.
            resample_changed_features: Whether to resample the changed features
                themselves (dependencies are always resampled). You probably want
                this to be False, but for backwards compatibility it defaults to True.

        Returns:
            A new Priors object with only the downstream priors.
        """
        g = self.dependency_graph
        all_changing_priors: set[str] = set()
        for any_feature in self.root_features.union(set(self.sampled_features.keys())):
            if any_feature in changed_features:
                descendants = nx.descendants(g, any_feature)

                if any_feature in self.sampled_features and resample_changed_features:
                    all_changing_priors.add(any_feature)

                for dep in descendants:
                    if dep in self.sampled_features:
                        all_changing_priors.add(dep)

        return Priors(
            sampled_features={
                f: p
                for f, p in self.sampled_features.items()
                if f in all_changing_priors
            }
        )
"""Models used for the training set sampling pipeline."""

from typing import cast

import numpy as np
import pandas as pd
from pydantic import Field
from scythe.base import ExperimentInputSpec

from globi.models.surrogate.configs.pipeline import StageSpec
from globi.models.surrogate.samplers import Priors


class SampleSpec(StageSpec):
    """A spec for the sampling stage of the progressive training."""

    # TODO: add the ability to receive the last set of error metrics and use them to inform the sampling
    priors: Priors = Field(
        ...,
        description="The priors to use for sampling.",
    )

    def stratified_selection(self) -> pd.DataFrame | None:
        """Sample the gis data."""
        df = self.parent.context_data
        if df is None:
            return None

        stratification = self.parent.stratification
        # Resolve the stratification column: prefer the canonical field name,
        # then fall back to the first alias present in the data.
        resolved_field = next(
            (
                candidate
                for candidate in [stratification.field, *stratification.aliases]
                if candidate in df.columns
            ),
            None,
        )
        if resolved_field is None:
            msg = f"Stratification field {stratification.field} not found in gis data. Please check the field name and/or the aliases."
            raise ValueError(msg)

        strata = cast(list[str], df[resolved_field].unique().tolist())

        method = stratification.sampling
        if method == "equal":
            return self.sample_equally_by_stratum(df, strata, resolved_field)
        if method == "error-weighted":
            msg = "Error-weighted sampling is not yet implemented."
            raise NotImplementedError(msg)
        if method == "proportional":
            msg = "Proportional sampling is not yet implemented."
            raise NotImplementedError(msg)
        msg = f"Invalid sampling method: {method}"
        raise ValueError(msg)

    def sample_equally_by_stratum(
        self, df: pd.DataFrame, strata: list[str], stratification_field: str
    ) -> pd.DataFrame:
        """Sample equally by stratum.

        This will break the dataframe up into n strata and ensure that each strata ends up with the same number of samples.

        Args:
            df (pd.DataFrame): The dataframe to sample from.
            strata (list[str]): The unique values of the strata.
            stratification_field (str): The field to stratify the data by.

        Returns:
            samples (pd.DataFrame): The sampled dataframe.
        """
        iteration = self.parent.iteration
        # Only the first iteration enforces a per-stratum floor.
        floor = iteration.min_per_stratum if iteration.current_iter == 0 else 0
        n_per_stratum = max(
            iteration.n_per_gen_for_current_iter // len(strata), floor
        )

        # TODO: consider how we want to handle potentially having the same geometry appear in both
        # the training and testing sets; a stratum with fewer than n_per_stratum
        # rows is silently upsampled via replace=True (consider warning instead).
        per_stratum_samples = [
            df[df[stratification_field] == stratum].sample(
                n=n_per_stratum, random_state=self.random_generator, replace=True
            )
            for stratum in strata
        ]
        return cast(pd.DataFrame, pd.concat(per_stratum_samples))

    # TODO: Add the ability to check the compatibility of a sampling spec with an input_validator_type.
+ + def populate_sample_df(self) -> pd.DataFrame: + """Populate the sample dataframe with the priors.""" + base_df = self.stratified_selection() + if base_df is None: + base_df = pd.DataFrame() + # in case we needed more samples due to the strata min req + n_samples = max(self.parent.iteration.n_per_gen_for_current_iter, len(base_df)) + return self.priors.sample( + base_df, + n_samples, + self.random_generator, + ) + + def convert_to_specs( + self, df: pd.DataFrame, input_validator: type[ExperimentInputSpec] + ): + """Convert the sampled dataframe to a list of simulation specs.""" + df["experiment_id"] = "placeholder" + df["sort_index"] = np.arange(len(df)) + return [ + input_validator.model_validate(row) for row in df.to_dict(orient="records") + ] diff --git a/src/globi/models/surrogate/training.py b/src/globi/models/surrogate/training.py new file mode 100644 index 0000000..8da98a1 --- /dev/null +++ b/src/globi/models/surrogate/training.py @@ -0,0 +1,624 @@ +"""Models used for the surrogate training pipeline.""" + +import warnings +from collections.abc import Callable +from functools import cached_property +from pathlib import Path +from typing import TYPE_CHECKING, cast + +import numpy as np +import pandas as pd +from pydantic import Field +from scythe.base import ExperimentInputSpec, ExperimentOutputSpec +from scythe.scatter_gather import ScatterGatherResult +from scythe.utils.filesys import FileReference, S3Url + +from globi.models.surrogate.configs.pipeline import ProgressiveTrainingSpec, StageSpec +from globi.models.surrogate.configs.regression import XGBHyperparameters + +if TYPE_CHECKING: + from mypy_boto3_s3.client import S3Client as S3ClientType +else: + S3ClientType = object + + +EXCLUDED_COLUMNS = frozenset({ + "experiment_id", + "sort_index", + "workflow_run_id", + "root_workflow_run_id", +}) + + +class TrainFoldSpec(ExperimentInputSpec): + """Train an sbem model for a specific fold. 
+ + The fold is determined by the sort_index, which does mean we need to know the n_folds. + + We will need to know: + - where the data is + - the desired stratification (e.g. feature.weather.file) + - how to divide the data into training and testing splits given the desired stratification + + The data uri should be assumed to have features in the index and targets in the columns. + + TODO: consider the potential for leakage when a stratum has few buildings! + + First, we will subdivide the data into its strata. + + Then for each stratum, we will create a train/test split according to the fold index. + + We wish to return validation metrics with the following hierarchy for the column index + - train/test ["split_segment"] + - loc1/loc2 ... ["stratum"] + - mae/rmse/r2/... ["metric"] + + Theoretically, we also might want to pass in normalization specifications for features and/or targets. + However, with xgb, this is less imperative. + """ + + data_uris: dict[str, S3Url] = Field( + ..., description="The uris of the data to train on." + ) + parent: ProgressiveTrainingSpec = Field(..., description="The parent spec.") + + @cached_property + def combined_data(self) -> pd.DataFrame: + """Combines the data from the data uris into a single dataframe with a flattened column index.""" + dfs: dict[str, pd.DataFrame] = { + key: pd.read_parquet(str(uri)) for key, uri in self.data_uris.items() + } + # TODO: we should drop any dataframes which do not participate in training + # for instance, by checking their regression io spec, or if there is another place to check. + # Mostly important for preventing errors on the next line when many differently shaped dataframes are returned. + if not all( + df.index.equals(next(iter(dfs.values())).index) for df in dfs.values() + ): + msg = "The indices of the dataframes are not all equal. " + "This is not supported, since the features must be identical for all outputs.." 
+ raise ValueError(msg) + + for df in dfs.values(): + # TODO: use level names while constructing the sequential name + _level_names = df.columns.names + df.columns = df.columns.to_flat_index() + + df.columns = [ + "/".join(col) if isinstance(col, tuple | list) else col + for col in df.columns + ] + + combined_df = pd.concat(dfs, axis=1) + combined_df.columns = combined_df.columns.to_flat_index() + combined_df.columns = ["/".join(col) for col in combined_df.columns] + shuffled_df = combined_df.sample(frac=1, random_state=42, replace=False) + return shuffled_df + + @property + def data(self) -> pd.DataFrame: + """The combined data.""" + return self.combined_data + + @cached_property + def dparams(self) -> pd.DataFrame: + """The index of the data.""" + return self.data.index.to_frame() + + @cached_property + def all_feature_columns(self) -> frozenset[str]: + """The names of all columns.""" + return frozenset(self.dparams.columns) + + @cached_property + def all_target_columns(self) -> frozenset[str]: + """The names of all columns.""" + return frozenset(self.data.columns) + + @cached_property + def continuous_columns(self) -> frozenset[str]: + """The continuous columns.""" + feature_conf = self.parent.regression_io_config.features + candidates = ( + self.all_feature_columns - feature_conf.exclude_columns - EXCLUDED_COLUMNS + ) + object_dype_columns = ( + self.dparams[candidates].select_dtypes(include=["object"]).columns.tolist() + ) + candidates = candidates - frozenset(object_dype_columns) + nunique_counts = cast(pd.Series, self.dparams[candidates].nunique()) + thresh = feature_conf.cont_cat_unicity_transition_threshold + passing_candidates = cast( + list[str], + cast(pd.Series, nunique_counts[nunique_counts > thresh]).index.tolist(), + ) + non_passing_candidates = cast( + list[str], + cast(pd.Series, nunique_counts[nunique_counts <= thresh]).index.tolist(), + ) + prespecified = feature_conf.continuous_columns + if prespecified: + skipped_candidates = 
frozenset(passing_candidates) - (prespecified) + possibly_not_continuous_candidats = ( + frozenset(non_passing_candidates) & prespecified + ) + if possibly_not_continuous_candidats: + warnings.warn( + f"The following columns were specified as continuous but have less than {thresh} unique values: {possibly_not_continuous_candidats}", + stacklevel=2, + ) + if skipped_candidates: + warnings.warn( + f"The following columns are likely continuous but are not included in the continuous columns: {skipped_candidates}", + stacklevel=2, + ) + return prespecified + return frozenset(passing_candidates) + + @cached_property + def categorical_columns(self) -> frozenset[str]: + """The categorical columns.""" + feature_conf = self.parent.regression_io_config.features + candidates = ( + self.all_feature_columns - feature_conf.exclude_columns - EXCLUDED_COLUMNS + ) + object_dtype_columns = ( + self.dparams[candidates].select_dtypes(include=["object"]).columns.tolist() + ) + non_obj_dtype_columns = candidates - frozenset(object_dtype_columns) + nunique_counts = cast(pd.Series, self.dparams[non_obj_dtype_columns].nunique()) + thresh = feature_conf.cont_cat_unicity_transition_threshold + passing_non_obj_dtype_candidates = cast( + list[str], + cast(pd.Series, nunique_counts[nunique_counts <= thresh]).index.tolist(), + ) + non_passing_non_obj_dtype_candidates = cast( + list[str], + cast(pd.Series, nunique_counts[nunique_counts > thresh]).index.tolist(), + ) + prespecified = feature_conf.categorical_columns + if prespecified: + skipped_candidates = frozenset(passing_non_obj_dtype_candidates) - ( + prespecified + ) + possibly_not_categorical_candidats = ( + frozenset(non_passing_non_obj_dtype_candidates) & prespecified + ) + if possibly_not_categorical_candidats: + warnings.warn( + f"The following columns were specified as categorical but have more than {thresh} unique values: {possibly_not_categorical_candidats}", + stacklevel=2, + ) + if skipped_candidates: + warnings.warn( + f"The 
following columns are likely categorical but are not included in the categorical columns: {skipped_candidates}", + stacklevel=2, + ) + return prespecified + return frozenset(passing_non_obj_dtype_candidates) | frozenset( + object_dtype_columns + ) + + @cached_property + def stratum_names(self) -> list[str]: + """The values of the stratification field.""" + return sorted(self.dparams[self.parent.stratification.field].unique().tolist()) + + @cached_property + def data_by_stratum(self) -> dict[str, pd.DataFrame]: + """Subdivide the data by the stratification field. + + We want 1/n_folds data in the test segment for each stratification option, + so we will need to compute train/test splits separately for each stratum. + + This would not be necessary if we knew that the strata always had equal representation, but + since we might use things like adaptive sampling or generating samples proportionally to the number of buildings in that stratum, + e.g. by population, then what *could* happen if we just did a random train/test split is that some strata might end up + entirely in the train set. + """ + return { + val: cast( + pd.DataFrame, + self.data[self.dparams[self.parent.stratification.field] == val], + ) + for val in self.stratum_names + } + + @cached_property + def train_test_split_by_fold_and_stratum(self) -> pd.DataFrame: + """Create the folds for the data. + + To do this, we will go to each stratum and use a strided step to + construct each fold, then assign the fold matching the sort_index + to the test split. We also recombine the strata since they are now + safely stratified. 
+ """ + all_strata = [] + for val in self.stratum_names: + folds = [] + for i in range(self.parent.cross_val.n_folds): + fold = self.data_by_stratum[val].iloc[ + i :: self.parent.cross_val.n_folds + ] + folds.append(fold) + folds_df = pd.concat( + folds, + axis=0, + keys=[ + "test" if i == self.sort_index else "train" + for i in range(self.parent.cross_val.n_folds) + ], + names=["split_segment"], + ) + all_strata.append(folds_df) + return pd.concat(all_strata) + + @cached_property + def train_segment(self) -> tuple[pd.DataFrame, pd.DataFrame]: + """Get the training segment.""" + train_df = cast( + pd.DataFrame, + self.train_test_split_by_fold_and_stratum.xs( + "train", level="split_segment" + ), + ) + params = train_df.index.to_frame(index=False) + targets = train_df + return params, targets + + @cached_property + def test_segment(self) -> tuple[pd.DataFrame, pd.DataFrame]: + """Get the test segment.""" + test_df = cast( + pd.DataFrame, + self.train_test_split_by_fold_and_stratum.xs("test", level="split_segment"), + ) + params = test_df.index.to_frame(index=False) + targets = test_df + return params, targets + + @cached_property + def targets(self) -> list[str]: + """The list of regression targets.""" + return self.parent.regression_io_config.targets.columns or sorted( + self.all_target_columns + ) + + @cached_property + def target_range(self) -> list[tuple[float, float]]: + """The range of the regression targets.""" + _, targets = self.train_segment + targets = targets[self.targets] + return [ + (float(targets[col].min() * 0.8), float(targets[col].max() * 1.2)) + for col in self.targets + ] + + def train(self, tempdir: Path): + """Train the model.""" + if isinstance(self.parent.hyperparameters, XGBHyperparameters): + # TOOO: Consider adding an interface/protocol/base class so signatures can be consistent. 
+ return self.train_xgboost(tempdir) + else: + raise NotImplementedError( + f"Unsupported hyperparameters type: {type(self.parent.hyperparameters)}" + ) + + def train_xgboost(self, tempdir: Path): + """Train an xgboost model.""" + import xgboost as xgb + + hp = ( + self.parent.hyperparameters + if isinstance(self.parent.hyperparameters, XGBHyperparameters) + else XGBHyperparameters() + ) + + x_train, y_train = self.train_segment + x_test, y_test = self.test_segment + + # select the features + x_train_selected, x_test_selected = ( + x_train.loc[:, self.continuous_columns | self.categorical_columns], + x_test.loc[:, self.continuous_columns | self.categorical_columns], + ) + cats = { + col: self.dparams[col].unique().tolist() for col in self.categorical_columns + } + x_train_encoded = self.index_encode_categorical_columns(x_train_selected, cats) + x_test_encoded = self.index_encode_categorical_columns(x_test_selected, cats) + + # select the targets + y_train, y_test = y_train.loc[:, self.targets], y_test.loc[:, self.targets] + + train_dmat = xgb.DMatrix( + x_train_encoded.reset_index(drop=True), label=y_train.reset_index(drop=True) + ) + test_dmat = xgb.DMatrix( + x_test_encoded.reset_index(drop=True), label=y_test.reset_index(drop=True) + ) + + evals = [(train_dmat, "train"), (test_dmat, "eval")] + model = xgb.train( + hp.hp.param_dict, + train_dmat, + num_boost_round=hp.trainer.num_boost_round, + evals=evals, + early_stopping_rounds=hp.trainer.early_stopping_rounds, + verbose_eval=hp.trainer.verbose_eval, + ) + + def predict(x: pd.DataFrame) -> pd.DataFrame: + """Predict the targets for the given features.""" + x_selected = cast( + pd.DataFrame, + x.loc[:, self.continuous_columns | self.categorical_columns], + ) + x_encoded = self.index_encode_categorical_columns(x_selected, cats) + preds = model.predict( + xgb.DMatrix( + x_encoded.reset_index(drop=True), + ) + ) + return pd.DataFrame( + preds, columns=pd.Index(self.targets), index=pd.MultiIndex.from_frame(x) + ) + 
+ evaluation = self.evaluate(predict, x_train, x_test, y_train, y_test) + model_path = tempdir / "model.ubj" + model.save_model(model_path.as_posix()) + return model, evaluation, model_path + + def evaluate( + self, + fn: Callable[[pd.DataFrame], pd.DataFrame], + x_train: pd.DataFrame, + x_test: pd.DataFrame, + y_train: pd.DataFrame, + y_test: pd.DataFrame, + ) -> tuple[pd.DataFrame, pd.DataFrame]: + """Evaluate a model on the train and test segments.""" + y_train_preds = fn(x_train) + y_test_preds = fn(x_test) + + # compute the metrics + global_train_metrics, stratum_train_metrics = self.compute_metrics( + y_train_preds, y_train + ) + global_test_metrics, stratum_test_metrics = self.compute_metrics( + y_test_preds, y_test + ) + + global_metrics = pd.concat( + [global_train_metrics, global_test_metrics], + axis=1, + keys=["train", "test"], + names=["split_segment"], + ) + stratum_metrics = pd.concat( + [stratum_train_metrics, stratum_test_metrics], + axis=1, + keys=["train", "test"], + names=["split_segment"], + ) + return global_metrics, stratum_metrics + + def index_encode_categorical_columns( + self, df: pd.DataFrame, cats: dict[str, list[str]] + ) -> pd.DataFrame: + """Index encode the categorical columns.""" + df = df.copy(deep=True) + for col in df.columns: + if df[col].dtype == "object": + df[col] = pd.Categorical(df[col], categories=cats[col]).codes + return df + + def train_pytorch_tabular(self, tempdir: Path): + """Train a pytorch tabular model.""" + from pytorch_tabular import TabularModel + from pytorch_tabular.config import ( + DataConfig, + ExperimentConfig, + OptimizerConfig, + TrainerConfig, + ) + from pytorch_tabular.models import GANDALFConfig + from pytorch_tabular.models.common.heads import LinearHeadConfig + + data_config = DataConfig( + target=self.targets, + continuous_cols=list(self.continuous_columns), + categorical_cols=list(self.categorical_columns), + # validation_split=0.2, + # continuous_feature_transform="", + # 
normalize_continuous_features=True, + ) + n_epochs = 200 + optimizer_config = OptimizerConfig( # TODO: make this all configurable + optimizer="AdamW", + optimizer_params={"weight_decay": 1e-5}, + # lr_scheduler="CosineAnnealingLR", + # lr_scheduler_params={"T_max": n_epochs, "eta_min": 1e-5}, + ) + trainer_config = TrainerConfig( + batch_size=256, + fast_dev_run=False, + max_epochs=n_epochs, + min_epochs=max(n_epochs // 20, 1), + early_stopping=None, + # early_stopping= "valid_loss", + # early_stopping_min_delta=0.001, + # early_stopping_mode="min", + # early_stopping_patience=3, + # gradient_clip_val=1.0, + # auto_lr_find=False + # max_time=60, + ) + + model_config = GANDALFConfig( + task="regression", + head="LinearHead", + head_config=LinearHeadConfig( + layers="256-128-64", + activation="SiLU", + use_batch_norm=True, + # dropout=0, + ).__dict__, + target_range=self.target_range, + embedding_dims=None, + embedding_dropout=0.05, + batch_norm_continuous_input=True, + gflu_stages=24, + gflu_dropout=0.0, + gflu_feature_init_sparsity=0.3, + learnable_sparsity=True, + ) + + experiment_config = ExperimentConfig( + run_name=self.experiment_id, + project_name="globi-surrogate-training", + log_target="tensorboard", + ) + + model = TabularModel( + data_config=data_config, + optimizer_config=optimizer_config, + trainer_config=trainer_config, + experiment_config=experiment_config, + model_config=model_config, + ) + + _, train_targets = self.train_segment + _, test_targets = self.test_segment + trainer = model.fit( + train=train_targets.reset_index(), + validation=test_targets.reset_index(), + seed=42, + ) + model.save_model((tempdir / "model").as_posix()) + return model, trainer + + def compute_frame_metrics( + self, preds: pd.DataFrame, targets: pd.DataFrame + ) -> pd.DataFrame: + """Compute the metrics.""" + from sklearn.metrics import ( + mean_absolute_error, + mean_absolute_percentage_error, + mean_squared_error, + r2_score, + ) + + mae = mean_absolute_error(targets, 
preds, multioutput="raw_values") + mse = mean_squared_error(targets, preds, multioutput="raw_values") + rmse = np.sqrt(mse) + r2 = r2_score(targets, preds, multioutput="raw_values") + cvrmse = rmse / np.abs(targets.mean(axis=0) + 1e-5) + mape = mean_absolute_percentage_error( + targets + 1e-5, + preds, + multioutput="raw_values", + ) + + metrics = pd.DataFrame( + { + "mae": mae, + "rmse": rmse, + "r2": r2, + "cvrmse": cvrmse, + "mape": mape, + }, + ) + metrics.columns.names = ["metric"] + metrics.index.names = ["target"] + + return metrics + + def compute_metrics(self, preds: pd.DataFrame, targets: pd.DataFrame): + """Compute the metrics.""" + global_metrics = self.compute_frame_metrics(preds, targets) + stratum_metric_dfs = {} + for stratum_name in self.stratum_names: + stratum_targets = cast( + pd.DataFrame, + targets.xs(stratum_name, level=self.parent.stratification.field), + ) + stratum_preds = cast( + pd.DataFrame, + preds.xs(stratum_name, level=self.parent.stratification.field), + ) + metrics = self.compute_frame_metrics(stratum_preds, stratum_targets) + stratum_metric_dfs[stratum_name] = metrics + + stratum_metrics = pd.concat( + stratum_metric_dfs, + axis=1, + keys=self.stratum_names, + names=["stratum"], + ) + global_metrics = ( + global_metrics.set_index( + pd.Index( + [self.sort_index] * len(global_metrics), + name="sort_index", + ), + append=True, + ) + .set_index( + pd.Index( + [self.parent.iteration.current_iter] * len(global_metrics), + name="iteration", + ), + append=True, + ) + .unstack(level="target") + ) + + stratum_metrics = ( + stratum_metrics.set_index( + pd.Index( + [self.sort_index] * len(stratum_metrics), + name="sort_index", + ), + append=True, + ) + .set_index( + pd.Index( + [self.parent.iteration.current_iter] * len(stratum_metrics), + name="iteration", + ), + append=True, + ) + .unstack(level="target") + ) + return global_metrics, stratum_metrics + + +class FoldResult(ExperimentOutputSpec): + """The output for a fold.""" + + regressor: 
FileReference + + +class TrainWithCVSpec(StageSpec): + """Train an SBEM model using a scatter gather approach for cross-fold validation.""" + + data_uris: ScatterGatherResult = Field( + ..., + description="The uris of the data to train on.", + ) + + @property + def schedule(self) -> list[TrainFoldSpec]: + """Create the task schedule.""" + schedule = [] + + for i in range(self.parent.cross_val.n_folds): + schedule.append( + TrainFoldSpec( + experiment_id="placeholder", + sort_index=i, + data_uris=self.data_uris.uris, + parent=self.parent, + ) + ) + return schedule diff --git a/src/globi/pipelines/__init__.py b/src/globi/pipelines/__init__.py new file mode 100644 index 0000000..9c61f28 --- /dev/null +++ b/src/globi/pipelines/__init__.py @@ -0,0 +1,13 @@ +"""Pipelines for the GloBI project.""" + +from globi.models.surrogate.dummy import dummy_simulation +from globi.pipelines.gis import preprocess_gis_file +from globi.pipelines.simulations import simulate_globi_building +from globi.pipelines.training import iterative_training + +__all__ = [ + "dummy_simulation", + "iterative_training", + "preprocess_gis_file", + "simulate_globi_building", +] diff --git a/src/globi/pipelines.py b/src/globi/pipelines/gis.py similarity index 58% rename from src/globi/pipelines.py rename to src/globi/pipelines/gis.py index 24ddd58..c3ad274 100644 --- a/src/globi/pipelines.py +++ b/src/globi/pipelines/gis.py @@ -1,27 +1,12 @@ -"""Experiment configuration for building builder simulations.""" +"""GIS processing pipelines for the GloBI project.""" import logging from pathlib import Path from typing import cast import geopandas as gpd -import numpy as np -import pandas as pd import yaml -from epinterface.geometry import ( - SceneContext, - ShoeboxGeometry, -) -from epinterface.sbem.builder import ( - AtticAssumptions, - BasementAssumptions, - Model, - construct_zone_def, -) from epinterface.sbem.fields.spec import SemanticModelFields -from scythe.registry import ExperimentRegistry -from 
scythe.utils.filesys import FileReference -from shapely import Polygon, from_wkt from globi.gis.errors import SemanticFieldsFileHasNoBuildingIDColumnError from globi.gis.geometry import ( @@ -53,206 +38,10 @@ FileConfig, GISPreprocessorColumnMap, ) -from globi.models.tasks import GloBIBuildingSpec, GloBIOutputSpec logger = logging.getLogger(__name__) -INDEX_COLS_TO_KEEP: list[str] = [ - "feature.geometry.long_edge", - "feature.geometry.short_edge", - "feature.geometry.orientation", - "feature.geometry.num_floors", - "feature.geometry.energy_model_conditioned_area", - "feature.geometry.energy_model_occupied_area", - "feature.semantic.Typology", - "feature.semantic.Age_bracket", - "feature.semantic.Region", - "feature.weather.file", - "feature.geometry.wwr", - "feature.geometry.f2f_height", - "feature.geometry.attic_height", -] - - -def simulate_globi_building_pipeline( - input_spec: GloBIBuildingSpec, - tempdir: Path, -) -> GloBIOutputSpec: - """Simulate a GlobiSpec building and return energy and peak results. 
- - Args: - input_spec: The input specification containing building parameters and file URIs - tempdir: Temporary directory for intermediate files - Returns: - Output specification containing a DataFrame with MultiIndex: - - Top level: Measurement type (Energy, Peak) - - Feature levels from input specification - """ - spec = input_spec - log = logger.info - zone_def = construct_zone_def( - component_map_path=spec.component_map, - db_path=spec.db_path, - semantic_field_context=spec.semantic_field_context, - ) - model = Model( - Weather=spec.epwzip_path, - Zone=zone_def, - Basement=BasementAssumptions( - Conditioned=spec.basement_is_conditioned, - UseFraction=spec.basement_use_fraction - if spec.basement_is_occupied - else None, - ), - Attic=AtticAssumptions( - Conditioned=spec.attic_is_conditioned, - UseFraction=spec.attic_use_fraction if spec.attic_is_occupied else None, - ), - geometry=ShoeboxGeometry( - x=0, - y=0, - w=spec.long_edge, - d=spec.short_edge, - h=spec.f2f_height, - wwr=spec.wwr, - num_stories=spec.num_floors, - basement=spec.has_basement, - zoning=spec.use_core_perim_zoning, - roof_height=spec.attic_height, - exposed_basement_frac=spec.exposed_basement_frac, - scene_context=SceneContext( - building=cast(Polygon, from_wkt(spec.rotated_rectangle)), - neighbors=[ - cast(Polygon, from_wkt(poly)) for poly in spec.neighbor_polys - ], - neighbor_heights=[ - float(h) if h is not None else 0 for h in spec.neighbor_heights - ], - orientation=spec.long_edge_angle, - ), - ), - ) - - log("Building and running model...") - overheating_config = ( - spec.parent_experiment_spec.overheating_config - if spec.parent_experiment_spec - else None - ) - run_result = model.run( - eplus_parent_dir=tempdir, - overheating_config=overheating_config, - ) - # Validate conditioned area - if not np.allclose( - model.total_conditioned_area, spec.energy_model_conditioned_area - ): - msg = ( - f"Total conditioned area mismatch: " - f"{model.total_conditioned_area} != 
{spec.energy_model_conditioned_area}" - ) - raise ValueError(msg) - - # Results Post-processing - # TODO: consider if we actually want all t he columns we are including. - feature_index = spec.make_multiindex( - n_rows=1, additional_index_data=spec.feature_dict - ) - results = run_result.energy_and_peak.to_frame().T.set_index(feature_index) - - dfs: dict[str, pd.DataFrame] = { - "EnergyAndPeak": results, - } - if run_result.overheating_results is not None: - # TODO: add feature dict to overheating df indices? Or instead of a full feature df, just add a single column with the building id? - edh = run_result.overheating_results.edh - old_ix = edh.index - feature_index = spec.make_multiindex( - n_rows=len(edh), include_sort_subindex=False - ) - edh.index = feature_index - edh = edh.set_index(old_ix, append=True) - dfs["ExceedanceDegreeHours"] = edh - - basic_oh = run_result.overheating_results.basic_oh - old_ix = basic_oh.index - feature_index = spec.make_multiindex( - n_rows=len(basic_oh), include_sort_subindex=False - ) - basic_oh.index = feature_index - basic_oh = basic_oh.set_index(old_ix, append=True) - dfs["BasicOverheating"] = basic_oh - - heat_index_categories = run_result.overheating_results.hi - old_ix = heat_index_categories.index - feature_index = spec.make_multiindex( - n_rows=len(heat_index_categories), include_sort_subindex=False - ) - heat_index_categories.index = feature_index - heat_index_categories = heat_index_categories.set_index(old_ix, append=True) - dfs["HeatIndexCategories"] = heat_index_categories - - consecutive_e_zone = run_result.overheating_results.consecutive_e_zone - # may be zero if no streaks found in any zones - if len(consecutive_e_zone) > 0: - old_ix = consecutive_e_zone.index - feature_index = spec.make_multiindex( - n_rows=len(consecutive_e_zone), include_sort_subindex=False - ) - consecutive_e_zone.index = feature_index - consecutive_e_zone = consecutive_e_zone.set_index(old_ix, append=True) - dfs["ConsecutiveExceedances"] = 
consecutive_e_zone - - hourly_data_outpath: FileReference | None = None - - if spec.parent_experiment_spec and spec.parent_experiment_spec.hourly_data_config: - hourly_df = run_result.sql.timeseries_by_name( - spec.parent_experiment_spec.hourly_data_config.data, - reporting_frequency="Hourly", - ) - hourly_df.index.names = ["Timestep"] - hourly_df.columns.names = ["Trash", "Group", "Meter"] - hourly_df: pd.DataFrame = cast( - pd.DataFrame, - hourly_df.droplevel("Trash", axis=1) - .stack(level="Group", future_stack=True) - .unstack(level="Timestep"), - ) - hourly_multiindex = spec.make_multiindex( - n_rows=len(hourly_df), include_sort_subindex=False - ) - old_ix = hourly_df.index - hourly_df.index = hourly_multiindex - hourly_df = hourly_df.set_index(old_ix, append=True) - - if spec.parent_experiment_spec.hourly_data_config.does_dataframe_output: - for meter_name in hourly_df.columns.get_level_values("Meter").unique(): - variable_df = hourly_df.xs(meter_name, level="Meter", axis=1) - dataframe_key = f"HourlyData.{meter_name.replace(' ', '')}" - dfs[dataframe_key] = variable_df - if spec.parent_experiment_spec.hourly_data_config.does_file_output: - hourly_data_outpath = tempdir / "outputs_hourly_data.pq" - hourly_df.to_parquet(hourly_data_outpath) - - return GloBIOutputSpec( - dataframes=dfs, - hourly_data=hourly_data_outpath, - ) - - -@ExperimentRegistry.Register(retries=2, schedule_timeout="10h", execution_timeout="30m") -def simulate_globi_building( - input_spec: GloBIBuildingSpec, tempdir: Path -) -> GloBIOutputSpec: - """Simulate a GlobiSpec building and return monthly energy and peak results. - - NB: this is separated from the pipeline above so the pipeline can still be used as a - local invocation without *too* much difficulty. 
- """ - return simulate_globi_building_pipeline(input_spec, tempdir) - - def preprocess_gis_file( config: DeterministicGISPreprocessorConfig, file_config: "FileConfig", @@ -522,17 +311,3 @@ def preprocess_gis_file( logger.info(f"saved {len(gdf)} features to {output_path}") return gdf, column_output_map - - -if __name__ == "__main__": - import tempfile - - from globi.models.tasks import MinimalBuildingSpec - - with tempfile.TemporaryDirectory() as tempdir: - with open("inputs/building.yml") as f: - input_spec = MinimalBuildingSpec.model_validate(yaml.safe_load(f)) - o = simulate_globi_building_pipeline( - input_spec=input_spec.globi_spec, - tempdir=Path(tempdir), - ) diff --git a/src/globi/pipelines/simulations.py b/src/globi/pipelines/simulations.py new file mode 100644 index 0000000..dfaff88 --- /dev/null +++ b/src/globi/pipelines/simulations.py @@ -0,0 +1,235 @@ +"""Experiment configuration for building builder simulations.""" + +import logging +from pathlib import Path +from typing import cast + +import numpy as np +import pandas as pd +import yaml +from epinterface.geometry import ( + SceneContext, + ShoeboxGeometry, +) +from epinterface.sbem.builder import ( + AtticAssumptions, + BasementAssumptions, + Model, + construct_zone_def, +) +from scythe.registry import ExperimentRegistry +from scythe.utils.filesys import FileReference +from shapely import Polygon, from_wkt + +from globi.models.tasks import GloBIBuildingSpec, GloBIOutputSpec + +logger = logging.getLogger(__name__) + + +INDEX_COLS_TO_KEEP: list[str] = [ + "feature.geometry.long_edge", + "feature.geometry.short_edge", + "feature.geometry.orientation", + "feature.geometry.num_floors", + "feature.geometry.energy_model_conditioned_area", + "feature.geometry.energy_model_occupied_area", + "feature.semantic.Typology", + "feature.semantic.Age_bracket", + "feature.semantic.Region", + "feature.weather.file", + "feature.geometry.wwr", + "feature.geometry.f2f_height", + "feature.geometry.attic_height", +] + + 
+def simulate_globi_building_pipeline( + input_spec: GloBIBuildingSpec, + tempdir: Path, +) -> GloBIOutputSpec: + """Simulate a GlobiSpec building and return energy and peak results. + + Args: + input_spec: The input specification containing building parameters and file URIs + tempdir: Temporary directory for intermediate files + Returns: + Output specification containing a DataFrame with MultiIndex: + - Top level: Measurement type (Energy, Peak) + - Feature levels from input specification + """ + spec = input_spec + log = logger.info + zone_def = construct_zone_def( + component_map_path=spec.component_map, + db_path=spec.db_path, + semantic_field_context=spec.semantic_field_context, + ) + model = Model( + Weather=spec.epwzip_path, + Zone=zone_def, + Basement=BasementAssumptions( + Conditioned=spec.basement_is_conditioned, + UseFraction=spec.basement_use_fraction + if spec.basement_is_occupied + else None, + ), + Attic=AtticAssumptions( + Conditioned=spec.attic_is_conditioned, + UseFraction=spec.attic_use_fraction if spec.attic_is_occupied else None, + ), + geometry=ShoeboxGeometry( + x=0, + y=0, + w=spec.long_edge, + d=spec.short_edge, + h=spec.f2f_height, + wwr=spec.wwr, + num_stories=spec.num_floors, + basement=spec.has_basement, + zoning=spec.use_core_perim_zoning, + roof_height=spec.attic_height, + exposed_basement_frac=spec.exposed_basement_frac, + scene_context=SceneContext( + building=cast(Polygon, from_wkt(spec.rotated_rectangle)), + neighbors=[ + cast(Polygon, from_wkt(poly)) for poly in spec.neighbor_polys + ], + neighbor_heights=[ + float(h) if h is not None else 0 for h in spec.neighbor_heights + ], + orientation=spec.long_edge_angle, + ), + ), + ) + + log("Building and running model...") + overheating_config = ( + spec.parent_experiment_spec.overheating_config + if spec.parent_experiment_spec + else None + ) + run_result = model.run( + eplus_parent_dir=tempdir, + overheating_config=overheating_config, + ) + # Validate conditioned area + if not 
np.allclose( + model.total_conditioned_area, spec.energy_model_conditioned_area + ) + msg = ( + f"Total conditioned area mismatch: " + f"{model.total_conditioned_area} != {spec.energy_model_conditioned_area}" + ) + raise ValueError(msg) + + # Results Post-processing + # TODO: consider if we actually want all the columns we are including. + feature_index = spec.make_multiindex( + n_rows=1, additional_index_data=spec.feature_dict + ) + results = run_result.energy_and_peak.to_frame().T.set_index(feature_index) + + dfs: dict[str, pd.DataFrame] = { + "EnergyAndPeak": results, + } + if run_result.overheating_results is not None: + # TODO: add feature dict to overheating df indices? Or instead of a full feature df, just add a single column with the building id? + edh = run_result.overheating_results.edh + old_ix = edh.index + feature_index = spec.make_multiindex( + n_rows=len(edh), include_sort_subindex=False + ) + edh.index = feature_index + edh = edh.set_index(old_ix, append=True) + dfs["ExceedanceDegreeHours"] = edh + + basic_oh = run_result.overheating_results.basic_oh + old_ix = basic_oh.index + feature_index = spec.make_multiindex( + n_rows=len(basic_oh), include_sort_subindex=False + ) + basic_oh.index = feature_index + basic_oh = basic_oh.set_index(old_ix, append=True) + dfs["BasicOverheating"] = basic_oh + + heat_index_categories = run_result.overheating_results.hi + old_ix = heat_index_categories.index + feature_index = spec.make_multiindex( + n_rows=len(heat_index_categories), include_sort_subindex=False + ) + heat_index_categories.index = feature_index + heat_index_categories = heat_index_categories.set_index(old_ix, append=True) + dfs["HeatIndexCategories"] = heat_index_categories + + consecutive_e_zone = run_result.overheating_results.consecutive_e_zone + # may be zero if no streaks found in any zones + if len(consecutive_e_zone) > 0: + old_ix = consecutive_e_zone.index + feature_index = spec.make_multiindex( + n_rows=len(consecutive_e_zone),
include_sort_subindex=False + ) + consecutive_e_zone.index = feature_index + consecutive_e_zone = consecutive_e_zone.set_index(old_ix, append=True) + dfs["ConsecutiveExceedances"] = consecutive_e_zone + + hourly_data_outpath: FileReference | None = None + + if spec.parent_experiment_spec and spec.parent_experiment_spec.hourly_data_config: + hourly_df = run_result.sql.timeseries_by_name( + spec.parent_experiment_spec.hourly_data_config.data, + reporting_frequency="Hourly", + ) + hourly_df.index.names = ["Timestep"] + hourly_df.columns.names = ["Trash", "Group", "Meter"] + hourly_df: pd.DataFrame = cast( + pd.DataFrame, + hourly_df.droplevel("Trash", axis=1) + .stack(level="Group", future_stack=True) + .unstack(level="Timestep"), + ) + hourly_multiindex = spec.make_multiindex( + n_rows=len(hourly_df), include_sort_subindex=False + ) + old_ix = hourly_df.index + hourly_df.index = hourly_multiindex + hourly_df = hourly_df.set_index(old_ix, append=True) + + if spec.parent_experiment_spec.hourly_data_config.does_dataframe_output: + for meter_name in hourly_df.columns.get_level_values("Meter").unique(): + variable_df = hourly_df.xs(meter_name, level="Meter", axis=1) + dataframe_key = f"HourlyData.{meter_name.replace(' ', '')}" + dfs[dataframe_key] = variable_df + if spec.parent_experiment_spec.hourly_data_config.does_file_output: + hourly_data_outpath = tempdir / "outputs_hourly_data.pq" + hourly_df.to_parquet(hourly_data_outpath) + + return GloBIOutputSpec( + dataframes=dfs, + hourly_data=hourly_data_outpath, + ) + + +@ExperimentRegistry.Register(retries=2, schedule_timeout="10h", execution_timeout="30m") +def simulate_globi_building( + input_spec: GloBIBuildingSpec, tempdir: Path +) -> GloBIOutputSpec: + """Simulate a GlobiSpec building and return monthly energy and peak results. + + NB: this is separated from the pipeline above so the pipeline can still be used as a + local invocation without *too* much difficulty. 
+ """ + return simulate_globi_building_pipeline(input_spec, tempdir) + + +if __name__ == "__main__": + import tempfile + + from globi.models.tasks import MinimalBuildingSpec + + with tempfile.TemporaryDirectory() as tempdir: + with open("inputs/building.yml") as f: + input_spec = MinimalBuildingSpec.model_validate(yaml.safe_load(f)) + o = simulate_globi_building_pipeline( + input_spec=input_spec.globi_spec, + tempdir=Path(tempdir), + ) diff --git a/src/globi/pipelines/training.py b/src/globi/pipelines/training.py new file mode 100644 index 0000000..4ab59b5 --- /dev/null +++ b/src/globi/pipelines/training.py @@ -0,0 +1,444 @@ +"""The training pipeline.""" + +import tempfile +from datetime import timedelta +from pathlib import Path +from typing import cast + +import boto3 +import pandas as pd +import yaml +from hatchet_sdk import Context +from scythe.base import ExperimentInputSpec +from scythe.experiments import ( + BaseExperiment, +) +from scythe.hatchet import hatchet +from scythe.registry import ExperimentRegistry +from scythe.scatter_gather import RecursionMap, ScatterGatherResult, scatter_gather +from scythe.settings import ScytheStorageSettings +from scythe.utils.filesys import S3Url +from scythe.worker import ScytheWorkerLabel + +from globi.models.surrogate.outputs import ( + CombineResultsResult, + ExperimentRunWithRef, + FinalizeResult, + RecursionTransition, + StartTrainingResult, + TrainingEvaluationResult, +) +from globi.models.surrogate.sampling import SampleSpec +from globi.models.surrogate.training import ( + FoldResult, + ProgressiveTrainingSpec, + TrainFoldSpec, + TrainWithCVSpec, +) + + +@ExperimentRegistry.Register( + description="Train a regressor with cross-fold validation.", + schedule_timeout=timedelta(hours=5), + execution_timeout=timedelta(hours=1), + desired_worker_labels=ScytheWorkerLabel.HAS_GPU.worker_label, +) +def train_regressor_with_cv_fold( + input_spec: TrainFoldSpec, tempdir: Path +) -> FoldResult: + """Train a regressor with 
cross-fold validation.""" + # DO TRAINING + _model, (global_results, stratum_results), model_path = input_spec.train(tempdir) + return FoldResult( + regressor=model_path, + dataframes={ + "global": global_results, + "strata": stratum_results, + }, + ) + + +iterative_training = hatchet.workflow( + name="iterative_training", + description="Sample a collection of buliding simulations to then simulate and train a surrogate model.", + input_validator=ProgressiveTrainingSpec, +) + + +@iterative_training.task( + name="iterative_training.create_simulations", + schedule_timeout=timedelta(minutes=30), + execution_timeout=timedelta(minutes=10), +) +def create_simulations( + spec: ProgressiveTrainingSpec, context: Context +) -> ExperimentRunWithRef: + """Create the simulations.""" + # STEP 1: Generate the training samples, allocate simulations + context.log("Generating training samples...") + sample_spec = SampleSpec(parent=spec, priors=spec.samplers) + sample_df = sample_spec.populate_sample_df() + context.log("Training samples generated.") + + # TODO: we shouldn't have to cast here, but the typing on `runnable` is not working as expected. + input_validator = cast( + type[ExperimentInputSpec], spec.runnable.input_validator_type + ) + context.log("Converting training samples to specs...") + specs = sample_spec.convert_to_specs(sample_df, input_validator) + context.log("Training samples converted to specs.") + + # STEP 2: Simulate the simulations using scythe + run_name = spec.subrun_name("sample") + + exp = BaseExperiment( + runnable=spec.runnable, + run_name=run_name, + storage_settings=spec.storage_settings or ScytheStorageSettings(), + ) + + context.log("Allocating simulations...") + run, ref = exp.allocate( + specs, + version="bumpmajor", + recursion_map=spec.iteration.recursion, + ) + context.log("Simulations allocated.") + + run_name = run.versioned_experiment.base_experiment.run_name + if not run_name: + msg = "Run name is required." 
+ raise ValueError(msg) + + return ExperimentRunWithRef( + run=run, + workflow_run_id=ref.workflow_run_id, + ) + + +@iterative_training.task( + name="iterative_training.await_simulations", + schedule_timeout=timedelta(hours=5), + execution_timeout=timedelta(hours=5), + parents=[create_simulations], +) +async def await_simulations( + spec: ProgressiveTrainingSpec, context: Context +) -> ScatterGatherResult: + """Await the simulations.""" + parent_output = context.task_output(create_simulations) + workflow_run_id = parent_output.workflow_run_id + context.log("Awaiting simulations...") + results = await scatter_gather.aio_get_result(workflow_run_id) + context.log("Simulations completed.") + + return results + + +@iterative_training.task( + name="iterative_training.combine_results", + schedule_timeout=timedelta(hours=5), + execution_timeout=timedelta(hours=1), + parents=[await_simulations], +) +def combine_results( + spec: ProgressiveTrainingSpec, context: Context +) -> CombineResultsResult: + """Combine the results of the simulations. + + Specifically, this step is responsible for combining the results of the simulations + of the previous iteration(s) with the results of the current iteration. In other words, + this is where we grow our simulation cache. + """ + # TODO: major consider how we handle beyond-memory scale scenarios. + # i.e. we probably need to refactor to allow lists of files that only the + # main worker is responsible for combining. + results = context.task_output(await_simulations) + combined_results: dict[str, S3Url] = {} + + # TODO: in the old version, we removed constant columns from the data, i.e.: + # is_constant = (df.max(axis=0) - df.min(axis=0)).abs() < 1e-5 + # df = df.loc[:, ~is_constant] + # Should this sort of data cleaning be done here, or should it be done in the training task? + # also, should we make sure to remove NaN?
+ + if spec.data_uris: + context.log("Combining results from previous iterations...") + shared_keys = set(spec.data_uris.uris.keys()) & set(results.uris.keys()) + old_keys_only = set(spec.data_uris.uris.keys()) - shared_keys + new_keys_only = set(results.uris.keys()) - shared_keys + # TODO: consider copying these over to the `combined` folder anyways. + for key in old_keys_only: + combined_results[key] = spec.data_uris.uris[key] + for key in new_keys_only: + combined_results[key] = results.uris[key] + # TODO: refactor to use a threadpool executor? + # For memory reasons, it might be a good idea to stay single threaded here. + for key in shared_keys: + context.log(f"Combining results for key {key}...") + old_df = pd.read_parquet(str(spec.data_uris.uris[key])) + new_df = pd.read_parquet(str(results.uris[key])) + combined_df = pd.concat([old_df, new_df], axis=0) + uri = spec.format_combined_output_uri(key) + combined_df.to_parquet(str(uri)) + context.log(f"Results for key {key} combined and saved to s3.") + combined_results[key] = uri + + else: + # TODO: consider copying these over to the `combined` folder anyways. + context.log( + "No previous iterations to combine results from, so using results from current iteration." + ) + combined_results = results.uris + + return CombineResultsResult( + incoming=results, + combined=ScatterGatherResult(uris=combined_results), + ) + + +@iterative_training.task( + name="iterative_training.start_training", + schedule_timeout=timedelta(hours=5), + execution_timeout=timedelta(hours=1), + parents=[combine_results], +) +def start_training( + spec: ProgressiveTrainingSpec, context: Context +) -> StartTrainingResult: + """Start the training.""" + results = context.task_output(combine_results) + + train_spec = TrainWithCVSpec( + parent=spec, + data_uris=results.combined, # TODO: should configure which results to use + ) + + # Alternatively, one task per fold-column combination? 
+ specs = train_spec.schedule + + context.log("Scheduling training...") + run_name = spec.subrun_name("train") + exp = BaseExperiment( + runnable=train_regressor_with_cv_fold, + run_name=run_name, + storage_settings=spec.storage_settings or ScytheStorageSettings(), + ) + run, ref = exp.allocate( + specs, + version="bumpmajor", # There is normally only ever one training round per parent minor version, except during replays etc + recursion_map=RecursionMap( + factor=2, + max_depth=0, + ), + ) + context.log("Training scheduled.") + + if not run.versioned_experiment.base_experiment.run_name: + msg = "Run name is required." + raise ValueError(msg) + + return StartTrainingResult( + training_spec=train_spec, + experiment_run_with_ref=ExperimentRunWithRef( + run=run, + workflow_run_id=ref.workflow_run_id, + ), + ) + + +@iterative_training.task( + name="iterative_training.await_training", + schedule_timeout=timedelta(hours=5), + execution_timeout=timedelta(hours=5), + parents=[start_training], +) +async def await_training( + spec: ProgressiveTrainingSpec, context: Context +) -> ScatterGatherResult: + """Await the training.""" + parent_output = context.task_output(start_training) + workflow_run_id = parent_output.experiment_run_with_ref.workflow_run_id + context.log("Awaiting training...") + results = await scatter_gather.aio_get_result(workflow_run_id) + context.log("Training completed.") + + return results + + +@iterative_training.task( + name="iterative_training.evaluate_training", + schedule_timeout=timedelta(hours=5), + execution_timeout=timedelta(minutes=5), + parents=[await_training], +) +def evaluate_training( + spec: ProgressiveTrainingSpec, context: Context +) -> TrainingEvaluationResult: + """Evaluate the training.""" + results_output = context.task_output(await_training) + strata_uri = results_output.uris["strata"] + globals_uri = results_output.uris["global"] + context.log("Reading strata results from s3...") + results = pd.read_parquet(str(strata_uri)) + 
context.log("Strata results read from s3.") + context.log("Reading global results from s3...") + results_globals = pd.read_parquet(str(globals_uri)) + context.log("Global results read from s3.") + + fold_averages = cast( + pd.Series, + results.xs("test", level="split_segment", axis=1) + .groupby(level="iteration") + .mean() + .unstack(), + ) + # TODO: fold_averages and strata and globals should be saved to s3 + + global_averages = cast( + pd.Series, + results_globals.xs("test", level="split_segment", axis=1) + .groupby(level="iteration") + .mean() + .unstack(), + ) + + context.log("Running convergence criteria...") + ( + convergence_all, + _convergence_monitor_segment, + _convergence_monitor_segment_and_target, + _convergence, + ) = spec.convergence_criteria.run(fold_averages) + context.log("Convergence criteria run.") + + return TrainingEvaluationResult( + converged=convergence_all, + metrics={ + "global_averages": global_averages.reset_index().to_dict(orient="records"), + }, + ) + + +@iterative_training.task( + name="iterative_training.transition_recursion", + schedule_timeout=timedelta(hours=5), + execution_timeout=timedelta(minutes=5), + parents=[evaluate_training, combine_results, await_training], +) +def transition_recursion( + spec: ProgressiveTrainingSpec, context: Context +) -> RecursionTransition: + """Transition the recursion.""" + results = context.task_output(evaluate_training) + if results.converged: + context.log("Converged! Time to wrap up... no more recursion.") + return RecursionTransition(reasoning="converged", child_workflow_run_id=None) + if spec.iteration.at_max_iters: + context.log( + "Not converged, but we're at the max number of iterations. Time to wrap up... no more recursion." 
+ ) + return RecursionTransition(reasoning="max_depth", child_workflow_run_id=None) + + await_training_output = context.task_output(await_training) + # start_training_output = context.task_output(start_training) + combine_results_output = context.task_output(combine_results) + + context.log( + "Not converged, but we have more iterations to try. Time to continue recursion..." + ) + next_spec = spec.model_copy(deep=True) + next_spec.iteration.current_iter += 1 + next_spec.data_uris = combine_results_output.combined + next_spec.metrics_uris.append(await_training_output) + next_spec.previous_experiment_ids.append(spec.experiment_id) + exp = BaseExperiment( + runnable=iterative_training, + run_name=f"{next_spec.base_run_name}", + storage_settings=spec.storage_settings or ScytheStorageSettings(), + ) + # manually bump minor here to avoid race conditions between e.g. simultaneously running v29.2.0 and v30.1.0... pretty sure the error only happens when they finish in the exact same second, but... it happened once so. + _run, ref = exp.allocate( + next_spec, + version=spec.current_version.next_minor_version(), + recursion_map=None, + ) + context.log("Recursion transitioned.") + return RecursionTransition( + reasoning=None, child_workflow_run_id=ref.workflow_run_id + ) + + +@iterative_training.task( + name="iterative_training.finalize", + schedule_timeout=timedelta(hours=5), + execution_timeout=timedelta(minutes=30), + parents=[transition_recursion, await_training, combine_results], + # skip_if=[ + # # TODO: maybe we should just run every time? + # ParentCondition( + # parent=transition_recursion, + # expression="output.reasoning == null", + # ) + # ], +) +def finalize(spec: ProgressiveTrainingSpec, context: Context) -> FinalizeResult: + """Run when training has exited the loop (converged, max depth, or other reason). Saves final models and artifacts.""" + # TODO: save the final model? 
+ transition = context.task_output(transition_recursion) + context.log(f"Training finished. Finalizing: {transition.reasoning}") + + context.log("Fetching metrics from all iterations...") + await_training_output = context.task_output(await_training) + metrics_uris = [*spec.metrics_uris, await_training_output] + metrics_by_key: dict[str, list[pd.DataFrame]] = {} + for i, metrics_uri in enumerate(metrics_uris): + context.log(f"\tFetching metrics from iteration {i}...") + for key in metrics_uri.uris: + context.log(f"\t\tFetching metrics for key {key} from iteration {i}...") + if key not in metrics_by_key: + metrics_by_key[key] = [] + metrics_by_key[key].append(pd.read_parquet(str(metrics_uri.uris[key]))) + context.log("Combining metrics from all iterations...") + combined_metrics = { + key: pd.concat(metrics, axis=0) for key, metrics in metrics_by_key.items() + } + combined_metrics_uris = { + key: spec.format_metrics_output_uri(key) for key in combined_metrics + } + context.log("Saving combined metrics to s3...") + for key, metrics in combined_metrics.items(): + context.log(f"\tSaving metrics for key {key} to s3...") + metrics.to_parquet(str(combined_metrics_uris[key])) + context.log("Final metrics saved to s3.") + + # Get the simulation data outputs from all steps and this step + combine_results_output = context.task_output(combine_results) + + # Get the experiment ids from all steps and this step + experiment_ids = [*spec.previous_experiment_ids, spec.experiment_id] + + # TODO: save final models, or return them a little more directly? 
+ + result = FinalizeResult( + reasoning=transition.reasoning, + data_uris=combine_results_output.combined.uris, + metrics_uris=combined_metrics_uris, + experiment_ids=experiment_ids, + ) + + s3_client = boto3.client("s3") + summary_manifest_uri = spec.format_summary_manifest_key() + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) / "summary.yml" + with open(temp_path, "w") as f: + yaml.dump(result.model_dump(mode="json"), f, indent=2, sort_keys=False) + if spec.storage_settings is None: + msg = ( + "Storage settings are not set, so we can't upload the summary manifest." + ) + raise ValueError(msg) + s3_client.upload_file( + temp_path.as_posix(), spec.storage_settings.BUCKET, summary_manifest_uri + ) + return result diff --git a/src/globi/tools/cli/main.py b/src/globi/tools/cli/main.py index e36e447..7950de6 100644 --- a/src/globi/tools/cli/main.py +++ b/src/globi/tools/cli/main.py @@ -102,6 +102,33 @@ def manifest( ) +@submit.command() +@click.option( + "--path", + type=click.Path(exists=True), + help="The path to the manifest file which will be used to configure the experiment.", + prompt="Manifest file path (.yml)", +) +def surrogate(path): + """Submit a GloBI surrogate experiment.""" + from scythe.experiments import BaseExperiment + + from globi.models.surrogate.configs.pipeline import ProgressiveTrainingSpec + from globi.pipelines.training import iterative_training + + with open(path) as f: + manifest = yaml.safe_load(f) + + config = ProgressiveTrainingSpec.model_validate(manifest) + + exp = BaseExperiment(runnable=iterative_training, run_name=config.base_run_name) + run, _ref = exp.allocate( + config, + version="bumpmajor", + ) + print(yaml.dump(run.model_dump(mode="json"), indent=2, sort_keys=False)) + + @cli.command() @click.option( "--config", @@ -127,7 +154,7 @@ def simulate( import pandas as pd from globi.models.tasks import MinimalBuildingSpec - from globi.pipelines import simulate_globi_building_pipeline + from 
globi.pipelines.simulations import simulate_globi_building_pipeline if isinstance(config, str): config = Path(config) @@ -371,7 +398,7 @@ def experiment( s3_client: S3Client = boto3.client("s3") s3_settings = ScytheStorageSettings() - exp = BaseExperiment(experiment=simulate_globi_building, run_name=run_name) + exp = BaseExperiment(runnable=simulate_globi_building, run_name=run_name) if not version: exp_version = exp.latest_version(s3_client, from_cache=False) diff --git a/src/globi/tools/visualization/data_sources.py b/src/globi/tools/visualization/data_sources.py index 8397258..4c8bace 100644 --- a/src/globi/tools/visualization/data_sources.py +++ b/src/globi/tools/visualization/data_sources.py @@ -262,7 +262,7 @@ def load_run_data(self, run_id: str) -> pd.DataFrame: s3_client = self.client s3_settings = ScytheStorageSettings() exp = BaseExperiment( - experiment=simulate_globi_building, + runnable=simulate_globi_building, run_name=run_id, ) diff --git a/src/globi/worker/Dockerfile b/src/globi/worker/Dockerfile index 321b76b..e4f1409 100644 --- a/src/globi/worker/Dockerfile +++ b/src/globi/worker/Dockerfile @@ -93,12 +93,13 @@ RUN EP_VERSION_DASH=$(echo "${EP_VERSION}" | tr '.' 
'-') && \ WORKDIR /code COPY uv.lock pyproject.toml README.md /code/ -RUN uv sync --locked --no-install-project --extra cli +# TODO: only install ml for certain containers by passing in a flag to the docker build command +RUN uv sync --locked --no-install-project --extra cli --extra ml-gpu RUN uv run epi prisma generate COPY src /code/src/ -RUN uv sync --locked --extra cli +RUN uv sync --locked --extra cli --extra ml-gpu CMD [ "uv", "run", "src/globi/worker/main.py" ] diff --git a/src/globi/worker/main.py b/src/globi/worker/main.py index 4a38a4a..3b9f074 100644 --- a/src/globi/worker/main.py +++ b/src/globi/worker/main.py @@ -3,14 +3,15 @@ from scythe.worker import ScytheWorkerConfig from globi.pipelines import * # noqa: F403 +from globi.pipelines import iterative_training conf = ScytheWorkerConfig() def main(): - """Main function for the worker.""" - conf.start() + """Start the worker.""" + conf.start(additional_workflows=[iterative_training]) if __name__ == "__main__": - conf.start() + main() diff --git a/uv.lock b/uv.lock index 73b2575..7c83226 100644 --- a/uv.lock +++ b/uv.lock @@ -2,9 +2,21 @@ version = 1 revision = 3 requires-python = ">=3.12" resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version < '3.13'", + "python_full_version >= '3.14' and sys_platform == 'linux'", + "python_full_version < '3.14' and sys_platform == 'linux'", + "python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux'", + "python_full_version < '3.14' and sys_platform != 'darwin' and sys_platform != 'linux'", + "python_full_version >= '3.14' and sys_platform == 'darwin'", + "python_full_version < '3.14' and sys_platform == 'darwin'", +] + +[[package]] +name = "absl-py" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/64/c7/8de93764ad66968d19329a7e0c147a2bb3c7054c554d4a119111b8f9440f/absl_py-2.4.0.tar.gz", hash = 
"sha256:8c6af82722b35cf71e0f4d1d47dcaebfff286e27110a99fc359349b247dfb5d4", size = 116543, upload-time = "2026-01-28T10:17:05.322Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/a6/907a406bb7d359e6a63f99c313846d9eec4f7e6f7437809e03aa00fa3074/absl_py-2.4.0-py3-none-any.whl", hash = "sha256:88476fd881ca8aab94ffa78b7b6c632a782ab3ba1cd19c9bd423abc4fb4cd28d", size = 135750, upload-time = "2026-01-28T10:17:04.19Z" }, ] [[package]] @@ -148,6 +160,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, ] +[[package]] +name = "antlr4-python3-runtime" +version = "4.9.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3e/38/7859ff46355f76f8d19459005ca000b6e7012f2f1ca597746cbcd1fbfe5e/antlr4-python3-runtime-4.9.3.tar.gz", hash = "sha256:f224469b4168294902bb1efa80a8bf7855f24c99aef99cbefc1bcd3cce77881b", size = 117034, upload-time = "2021-11-06T17:52:23.524Z" } + [[package]] name = "anyio" version = "4.11.0" @@ -899,6 +917,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ce/c6/c71e82e041c95ffe6a92ac707785500aa2a515a4339c2c7dd67e3c449249/cramjam-2.11.0-cp314-cp314t-win_amd64.whl", hash = "sha256:028400d699442d40dbda02f74158c73d05cb76587a12490d0bfedd958fd49188", size = 1713108, upload-time = "2025-07-27T21:24:10.147Z" }, ] +[[package]] +name = "cuda-bindings" +version = "12.9.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cuda-pathfinder", marker = "sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/c2/65bfd79292b8ff18be4dd7f7442cea37bcbc1a228c1886f1dea515c45b67/cuda_bindings-12.9.4-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash 
= "sha256:694ba35023846625ef471257e6b5a4bc8af690f961d197d77d34b1d1db393f56", size = 11760260, upload-time = "2025-10-21T14:51:40.79Z" }, + { url = "https://files.pythonhosted.org/packages/a9/c1/dabe88f52c3e3760d861401bb994df08f672ec893b8f7592dc91626adcf3/cuda_bindings-12.9.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fda147a344e8eaeca0c6ff113d2851ffca8f7dfc0a6c932374ee5c47caa649c8", size = 12151019, upload-time = "2025-10-21T14:51:43.167Z" }, + { url = "https://files.pythonhosted.org/packages/05/8b/b4b2d1c7775fa403b64333e720cfcfccef8dcb9cdeb99947061ca5a77628/cuda_bindings-12.9.4-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cf8bfaedc238f3b115d957d1fd6562b7e8435ba57f6d0e2f87d0e7149ccb2da5", size = 11570071, upload-time = "2025-10-21T14:51:47.472Z" }, + { url = "https://files.pythonhosted.org/packages/63/56/e465c31dc9111be3441a9ba7df1941fe98f4aa6e71e8788a3fb4534ce24d/cuda_bindings-12.9.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:32bdc5a76906be4c61eb98f546a6786c5773a881f3b166486449b5d141e4a39f", size = 11906628, upload-time = "2025-10-21T14:51:49.905Z" }, + { url = "https://files.pythonhosted.org/packages/ec/07/6aff13bc1e977e35aaa6b22f52b172e2890c608c6db22438cf7ed2bf43a6/cuda_bindings-12.9.4-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3adf4958dcf68ae7801a59b73fb00a8b37f8d0595060d66ceae111b1002de38d", size = 11566797, upload-time = "2025-10-21T14:51:54.581Z" }, + { url = "https://files.pythonhosted.org/packages/a3/84/1e6be415e37478070aeeee5884c2022713c1ecc735e6d82d744de0252eee/cuda_bindings-12.9.4-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:56e0043c457a99ac473ddc926fe0dc4046694d99caef633e92601ab52cbe17eb", size = 11925991, upload-time = "2025-10-21T14:51:56.535Z" }, + { url = 
"https://files.pythonhosted.org/packages/1e/b5/96a6696e20c4ffd2b327f54c7d0fde2259bdb998d045c25d5dedbbe30290/cuda_bindings-12.9.4-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1f53a7f453d4b2643d8663d036bafe29b5ba89eb904c133180f295df6dc151e5", size = 11624530, upload-time = "2025-10-21T14:52:01.539Z" }, + { url = "https://files.pythonhosted.org/packages/d1/af/6dfd8f2ed90b1d4719bc053ff8940e494640fe4212dc3dd72f383e4992da/cuda_bindings-12.9.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8b72ee72a9cc1b531db31eebaaee5c69a8ec3500e32c6933f2d3b15297b53686", size = 11922703, upload-time = "2025-10-21T14:52:03.585Z" }, + { url = "https://files.pythonhosted.org/packages/39/73/d2fc40c043bac699c3880bf88d3cebe9d88410cd043795382826c93a89f0/cuda_bindings-12.9.4-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:20f2699d61d724de3eb3f3369d57e2b245f93085cab44fd37c3bea036cea1a6f", size = 11565056, upload-time = "2025-10-21T14:52:08.338Z" }, + { url = "https://files.pythonhosted.org/packages/6c/19/90ac264acc00f6df8a49378eedec9fd2db3061bf9263bf9f39fd3d8377c3/cuda_bindings-12.9.4-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d80bffc357df9988dca279734bc9674c3934a654cab10cadeed27ce17d8635ee", size = 11924658, upload-time = "2025-10-21T14:52:10.411Z" }, +] + +[[package]] +name = "cuda-pathfinder" +version = "1.4.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/02/59a5bc738a09def0b49aea0e460bdf97f65206d0d041246147cf6207e69c/cuda_pathfinder-1.4.1-py3-none-any.whl", hash = "sha256:40793006082de88e0950753655e55558a446bed9a7d9d0bcb48b2506d50ed82a", size = 43903, upload-time = "2026-03-06T21:05:24.372Z" }, +] + [[package]] name = "cycler" version = "0.12.1" @@ -956,6 +1002,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, ] +[[package]] +name = "einops" +version = "0.8.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2c/77/850bef8d72ffb9219f0b1aac23fbc1bf7d038ee6ea666f331fa273031aa2/einops-0.8.2.tar.gz", hash = "sha256:609da665570e5e265e27283aab09e7f279ade90c4f01bcfca111f3d3e13f2827", size = 56261, upload-time = "2026-01-26T04:13:17.638Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl", hash = "sha256:54058201ac7087911181bfec4af6091bb59380360f069276601256a76af08193", size = 65638, upload-time = "2026-01-26T04:13:18.546Z" }, +] + [[package]] name = "energy-pandas" version = "0.4.1" @@ -1292,6 +1347,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/47/71/70db47e4f6ce3e5c37a607355f80da8860a33226be640226ac52cb05ef2e/fsspec-2025.9.0-py3-none-any.whl", hash = "sha256:530dc2a2af60a414a832059574df4a6e10cce927f6f4a78209390fe38955cfb7", size = 199289, upload-time = "2025-09-02T19:10:47.708Z" }, ] +[package.optional-dependencies] +http = [ + { name = "aiohttp" }, +] + [[package]] name = "future" version = "1.0.0" @@ -1379,6 +1439,16 @@ cli = [ { name = "click" }, { name = "xlsxwriter" }, ] +ml-gpu = [ + { name = "lightgbm" }, + { name = "numba" }, + { name = "pytorch-tabular" }, + { name = "tensorboard" }, + { name = "torch", version = "2.10.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.10.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform != 'darwin'" }, + { name = "wandb" }, + { name = 
"xgboost" }, +] visualization = [ { name = "bokeh" }, { name = "folium" }, @@ -1421,23 +1491,31 @@ requires-dist = [ { name = "folium", marker = "extra == 'visualization'", specifier = ">=0.15.0" }, { name = "geopandas", specifier = ">=0.14.0" }, { name = "ladybug-core", specifier = ">=0.44.29" }, + { name = "lightgbm", marker = "extra == 'ml-gpu'", specifier = ">=4.6.0" }, { name = "matplotlib", marker = "extra == 'visualization'", specifier = ">=3.8.0" }, + { name = "numba", marker = "extra == 'ml-gpu'", specifier = ">=0.63.1" }, { name = "numpy", specifier = ">=1.26.0" }, { name = "pandas", specifier = ">=2.1.0" }, { name = "playwright", marker = "extra == 'visualization'", specifier = ">=1.40.0" }, { name = "plotly", marker = "extra == 'visualization'", specifier = ">=5.18.0" }, { name = "pydantic", specifier = ">=2.11,<3" }, { name = "pyproj", specifier = ">=3.6.0" }, + { name = "pytorch-tabular", marker = "extra == 'ml-gpu'", specifier = ">=1.2.0" }, { name = "rasterio", marker = "extra == 'visualization'", specifier = ">=1.3.9" }, { name = "scikit-learn", specifier = ">=1.3.0" }, { name = "scipy", specifier = ">=1.11.0,<1.15" }, - { name = "scythe-engine", specifier = ">=0.1.2" }, + { name = "scythe-engine", git = "https://github.com/szvsw/scythe?branch=feature%2Fallow-versioning-workflows" }, { name = "seaborn", marker = "extra == 'visualization'", specifier = ">=0.13.0" }, { name = "shapely", specifier = ">=2.0.0" }, { name = "streamlit", marker = "extra == 'visualization'", specifier = ">=1.28.0" }, + { name = "tensorboard", marker = "extra == 'ml-gpu'", specifier = ">=2.20.0" }, + { name = "torch", marker = "sys_platform == 'darwin' and extra == 'ml-gpu'", specifier = ">=2.5.0", index = "https://pypi.org/simple", conflict = { package = "globi", extra = "ml-gpu" } }, + { name = "torch", marker = "sys_platform != 'darwin' and extra == 'ml-gpu'", specifier = ">=2.5.0", index = "https://download.pytorch.org/whl/cu128", conflict = { package = "globi", extra = 
"ml-gpu" } }, + { name = "wandb", marker = "extra == 'ml-gpu'", specifier = ">=0.25.0" }, + { name = "xgboost", marker = "extra == 'ml-gpu'", specifier = ">=3.2.0" }, { name = "xlsxwriter", marker = "extra == 'cli'", specifier = ">=3.2.9" }, ] -provides-extras = ["visualization", "cli"] +provides-extras = ["visualization", "ml-gpu", "cli"] [package.metadata.requires-dev] dev = [ @@ -1976,7 +2054,7 @@ dependencies = [ { name = "nbformat" }, { name = "packaging" }, { name = "prometheus-client" }, - { name = "pywinpty", marker = "os_name == 'nt'" }, + { name = "pywinpty", marker = "os_name == 'nt' and sys_platform != 'darwin' and sys_platform != 'linux'" }, { name = "pyzmq" }, { name = "send2trash" }, { name = "terminado" }, @@ -1994,7 +2072,7 @@ name = "jupyter-server-terminals" version = "0.5.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pywinpty", marker = "os_name == 'nt'" }, + { name = "pywinpty", marker = "os_name == 'nt' and sys_platform != 'darwin' and sys_platform != 'linux'" }, { name = "terminado" }, ] sdist = { url = "https://files.pythonhosted.org/packages/f4/a7/bcd0a9b0cbba88986fe944aaaf91bfda603e5a50bda8ed15123f381a3b2f/jupyter_server_terminals-0.5.4.tar.gz", hash = "sha256:bbda128ed41d0be9020349f9f1f2a4ab9952a73ed5f5ac9f1419794761fb87f5", size = 31770, upload-time = "2026-01-14T16:53:20.213Z" } @@ -2156,6 +2234,36 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d4/52/7b8421a8ace22a17ae77dd9a8367e916364ed8be72502cb744805f06d6ac/ladybug_geometry-1.34.14-py3-none-any.whl", hash = "sha256:af91ee9285333ca1ddfaf439530306dff7f0a891cae40d4dc5491f139fcf7d36", size = 198221, upload-time = "2025-11-07T04:16:46.986Z" }, ] +[[package]] +name = "lightgbm" +version = "4.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "scipy" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/68/0b/a2e9f5c5da7ef047cc60cef37f86185088845e8433e54d2e7ed439cce8a3/lightgbm-4.6.0.tar.gz", hash = "sha256:cb1c59720eb569389c0ba74d14f52351b573af489f230032a1c9f314f8bab7fe", size = 1703705, upload-time = "2025-02-15T04:03:03.111Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f2/75/cffc9962cca296bc5536896b7e65b4a7cdeb8db208e71b9c0133c08f8f7e/lightgbm-4.6.0-py3-none-macosx_10_15_x86_64.whl", hash = "sha256:b7a393de8a334d5c8e490df91270f0763f83f959574d504c7ccb9eee4aef70ed", size = 2010151, upload-time = "2025-02-15T04:02:50.961Z" }, + { url = "https://files.pythonhosted.org/packages/21/1b/550ee378512b78847930f5d74228ca1fdba2a7fbdeaac9aeccc085b0e257/lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl", hash = "sha256:2dafd98d4e02b844ceb0b61450a660681076b1ea6c7adb8c566dfd66832aafad", size = 1592172, upload-time = "2025-02-15T04:02:53.937Z" }, + { url = "https://files.pythonhosted.org/packages/64/41/4fbde2c3d29e25ee7c41d87df2f2e5eda65b431ee154d4d462c31041846c/lightgbm-4.6.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4d68712bbd2b57a0b14390cbf9376c1d5ed773fa2e71e099cac588703b590336", size = 3454567, upload-time = "2025-02-15T04:02:56.443Z" }, + { url = "https://files.pythonhosted.org/packages/42/86/dabda8fbcb1b00bcfb0003c3776e8ade1aa7b413dff0a2c08f457dace22f/lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:cb19b5afea55b5b61cbb2131095f50538bd608a00655f23ad5d25ae3e3bf1c8d", size = 3569831, upload-time = "2025-02-15T04:02:58.925Z" }, + { url = "https://files.pythonhosted.org/packages/5e/23/f8b28ca248bb629b9e08f877dd2965d1994e1674a03d67cd10c5246da248/lightgbm-4.6.0-py3-none-win_amd64.whl", hash = "sha256:37089ee95664b6550a7189d887dbf098e3eadab03537e411f52c63c121e3ba4b", size = 1451509, upload-time = "2025-02-15T04:03:01.515Z" }, +] + +[[package]] +name = "lightning-utilities" +version = "0.15.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, + { name 
= "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f1/45/7fa8f56b17dc0f0a41ec70dd307ecd6787254483549843bef4c30ab5adce/lightning_utilities-0.15.3.tar.gz", hash = "sha256:792ae0204c79f6859721ac7f386c237a33b0ed06ba775009cb894e010a842033", size = 33553, upload-time = "2026-02-22T14:48:53.348Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/25/f4/ead6e0e37209b07c9baa3e984ccdb0348ca370b77cea3aaea8ddbb097e00/lightning_utilities-0.15.3-py3-none-any.whl", hash = "sha256:6c55f1bee70084a1cbeaa41ada96e4b3a0fea5909e844dd335bd80f5a73c5f91", size = 31906, upload-time = "2026-02-22T14:48:52.488Z" }, +] + [[package]] name = "littleutils" version = "0.2.4" @@ -2236,6 +2344,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/59/1b/6ef961f543593969d25b2afe57a3564200280528caa9bd1082eecdd7b3bc/markdown-3.10.1-py3-none-any.whl", hash = "sha256:867d788939fe33e4b736426f5b9f651ad0c0ae0ecf89df0ca5d1176c70812fe3", size = 107684, upload-time = "2026-01-21T18:09:27.203Z" }, ] +[[package]] +name = "markdown-it-py" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, +] + [[package]] name = "markupsafe" version = "3.0.3" @@ -2365,6 +2485,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl", hash = "sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76", size = 9516, upload-time = "2025-10-23T09:00:20.675Z" }, ] +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, +] + [[package]] name = "mergedeep" version = "1.3.4" @@ -2515,6 +2644,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/98/5c/2597cef67b6947b15c47f8dba967a0baf19fbdfdc86f6e4a8ba7af8b581a/mkdocstrings_python-1.19.0-py3-none-any.whl", hash = "sha256:395c1032af8f005234170575cc0c5d4d20980846623b623b35594281be4a3059", size = 143417, upload-time = "2025-11-10T13:30:54.164Z" }, ] +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", 
size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, +] + [[package]] name = "msgpack" version = "1.1.2" @@ -2992,6 +3130,168 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/67/0e/35082d13c09c02c011cf21570543d202ad929d961c02a147493cb0c2bdf5/numpy-2.2.6-cp313-cp313t-win_amd64.whl", hash = "sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06", size = 12771374, upload-time = "2025-05-17T21:43:35.479Z" }, ] +[[package]] +name = "nvidia-cublas-cu12" +version = "12.8.4.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/29/99/db44d685f0e257ff0e213ade1964fc459b4a690a73293220e98feb3307cf/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:b86f6dd8935884615a0683b663891d43781b819ac4f2ba2b0c9604676af346d0", size = 590537124, upload-time = "2025-03-07T01:43:53.556Z" }, + { url = "https://files.pythonhosted.org/packages/dc/61/e24b560ab2e2eaeb3c839129175fb330dfcfc29e5203196e5541a4c44682/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142", size = 594346921, upload-time = "2025-03-07T01:44:31.254Z" }, +] + +[[package]] +name = "nvidia-cuda-cupti-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d5/1f/b3bd73445e5cb342727fd24fe1f7b748f690b460acadc27ea22f904502c8/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4412396548808ddfed3f17a467b104ba7751e6b58678a4b840675c56d21cf7ed", size = 9533318, upload-time = "2025-03-07T01:40:10.421Z" }, + { url = "https://files.pythonhosted.org/packages/f8/02/2adcaa145158bf1a8295d83591d22e4103dbfd821bcaf6f3f53151ca4ffa/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182", size = 10248621, upload-time = "2025-03-07T01:40:21.213Z" }, +] + +[[package]] +name = "nvidia-cuda-nvrtc-cu12" +version = "12.8.93" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/6b/32f747947df2da6994e999492ab306a903659555dddc0fbdeb9d71f75e52/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994", size = 88040029, upload-time = "2025-03-07T01:42:13.562Z" }, + { url = "https://files.pythonhosted.org/packages/eb/d1/e50d0acaab360482034b84b6e27ee83c6738f7d32182b987f9c7a4e32962/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc1fec1e1637854b4c0a65fb9a8346b51dd9ee69e61ebaccc82058441f15bce8", size = 43106076, upload-time = "2025-03-07T01:41:59.817Z" }, +] + +[[package]] +name = "nvidia-cuda-runtime-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/75/f865a3b236e4647605ea34cc450900854ba123834a5f1598e160b9530c3a/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:52bf7bbee900262ffefe5e9d5a2a69a30d97e2bc5bb6cc866688caa976966e3d", size = 965265, upload-time = "2025-03-07T01:39:43.533Z" }, + { url = "https://files.pythonhosted.org/packages/0d/9b/a997b638fcd068ad6e4d53b8551a7d30fe8b404d6f1804abf1df69838932/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90", size = 954765, upload-time = "2025-03-07T01:40:01.615Z" }, +] + +[[package]] +name = "nvidia-cudnn-cu12" +version = "9.10.2.21" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12", marker = 
"sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/41/e79269ce215c857c935fd86bcfe91a451a584dfc27f1e068f568b9ad1ab7/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c9132cc3f8958447b4910a1720036d9eff5928cc3179b0a51fb6d167c6cc87d8", size = 705026878, upload-time = "2025-06-06T21:52:51.348Z" }, + { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" }, +] + +[[package]] +name = "nvidia-cufft-cu12" +version = "11.3.3.83" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/60/bc/7771846d3a0272026c416fbb7e5f4c1f146d6d80704534d0b187dd6f4800/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:848ef7224d6305cdb2a4df928759dca7b1201874787083b6e7550dd6765ce69a", size = 193109211, upload-time = "2025-03-07T01:44:56.873Z" }, + { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" }, +] + +[[package]] +name = "nvidia-cufile-cu12" +version = "1.13.1.3" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/fe/1bcba1dfbfb8d01be8d93f07bfc502c93fa23afa6fd5ab3fc7c1df71038a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc", size = 1197834, upload-time = "2025-03-07T01:45:50.723Z" }, + { url = "https://files.pythonhosted.org/packages/1e/f5/5607710447a6fe9fd9b3283956fceeee8a06cda1d2f56ce31371f595db2a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:4beb6d4cce47c1a0f1013d72e02b0994730359e17801d395bdcbf20cfb3bb00a", size = 1120705, upload-time = "2025-03-07T01:45:41.434Z" }, +] + +[[package]] +name = "nvidia-curand-cu12" +version = "10.3.9.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/45/5e/92aa15eca622a388b80fbf8375d4760738df6285b1e92c43d37390a33a9a/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:dfab99248034673b779bc6decafdc3404a8a6f502462201f2f31f11354204acd", size = 63625754, upload-time = "2025-03-07T01:46:10.735Z" }, + { url = "https://files.pythonhosted.org/packages/fb/aa/6584b56dc84ebe9cf93226a5cde4d99080c8e90ab40f0c27bda7a0f29aa1/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9", size = 63619976, upload-time = "2025-03-07T01:46:23.323Z" }, +] + +[[package]] +name = "nvidia-cusolver-cu12" +version = "11.7.3.90" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/32/f7cd6ce8a7690544d084ea21c26e910a97e077c9b7f07bf5de623ee19981/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:db9ed69dbef9715071232caa9b69c52ac7de3a95773c2db65bdba85916e4e5c0", size = 267229841, upload-time = "2025-03-07T01:46:54.356Z" }, + { url = 
"https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" }, +] + +[[package]] +name = "nvidia-cusparse-cu12" +version = "12.5.8.93" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/f7/cd777c4109681367721b00a106f491e0d0d15cfa1fd59672ce580ce42a97/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b6c161cb130be1a07a27ea6923df8141f3c295852f4b260c65f18f3e0a091dc", size = 288117129, upload-time = "2025-03-07T01:47:40.407Z" }, + { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" }, +] + +[[package]] +name = "nvidia-cusparselt-cu12" +version = "0.7.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/73/b9/598f6ff36faaece4b3c50d26f50e38661499ff34346f00e057760b35cc9d/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8878dce784d0fac90131b6817b607e803c36e629ba34dc5b433471382196b6a5", size = 283835557, upload-time = "2025-02-26T00:16:54.265Z" }, + { url = "https://files.pythonhosted.org/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623", size = 287193691, 
upload-time = "2025-02-26T00:15:44.104Z" }, +] + +[[package]] +name = "nvidia-nccl-cu12" +version = "2.27.5" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/1c/857979db0ef194ca5e21478a0612bcdbbe59458d7694361882279947b349/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:31432ad4d1fb1004eb0c56203dc9bc2178a1ba69d1d9e02d64a6938ab5e40e7a", size = 322400625, upload-time = "2025-06-26T04:11:04.496Z" }, + { url = "https://files.pythonhosted.org/packages/6e/89/f7a07dc961b60645dbbf42e80f2bc85ade7feb9a491b11a1e973aa00071f/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457", size = 322348229, upload-time = "2025-06-26T04:11:28.385Z" }, +] + +[[package]] +name = "nvidia-nvjitlink-cu12" +version = "12.8.93" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/74/86a07f1d0f42998ca31312f998bd3b9a7eff7f52378f4f270c8679c77fb9/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88", size = 39254836, upload-time = "2025-03-07T01:49:55.661Z" }, + { url = "https://files.pythonhosted.org/packages/2a/a2/8cee5da30d13430e87bf99bb33455d2724d0a4a9cb5d7926d80ccb96d008/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:adccd7161ace7261e01bb91e44e88da350895c270d23f744f0820c818b7229e7", size = 38386204, upload-time = "2025-03-07T01:49:43.612Z" }, +] + +[[package]] +name = "nvidia-nvshmem-cu12" +version = "3.4.5" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/1d/6a/03aa43cc9bd3ad91553a88b5f6fb25ed6a3752ae86ce2180221962bc2aa5/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b48363fc6964dede448029434c6abed6c5e37f823cb43c3bcde7ecfc0457e15", size = 138936938, upload-time = "2025-09-06T00:32:05.589Z" }, + { url = "https://files.pythonhosted.org/packages/b5/09/6ea3ea725f82e1e76684f0708bbedd871fc96da89945adeba65c3835a64c/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:042f2500f24c021db8a06c5eec2539027d57460e1c1a762055a6554f72c369bd", size = 139103095, upload-time = "2025-09-06T00:32:31.266Z" }, +] + +[[package]] +name = "nvidia-nvtx-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/10/c0/1b303feea90d296f6176f32a2a70b5ef230f9bdeb3a72bddb0dc922dc137/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d7ad891da111ebafbf7e015d34879f7112832fc239ff0d7d776b6cb685274615", size = 91161, upload-time = "2025-03-07T01:42:23.922Z" }, + { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" }, +] + +[[package]] +name = "omegaconf" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "antlr4-python3-runtime" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/48/6388f1bb9da707110532cb70ec4d2822858ddfb44f1cdf1233c20a80ea4b/omegaconf-2.3.0.tar.gz", hash = "sha256:d5d4b6d29955cc50ad50c46dc269bcd92c6e00f5f90d23ab5fee7bfca4ba4cc7", size = 3298120, upload-time = "2022-12-08T20:59:22.753Z" } +wheels = [ + { url 
= "https://files.pythonhosted.org/packages/e3/94/1843518e420fa3ed6919835845df698c7e27e183cb997394e4a670973a65/omegaconf-2.3.0-py3-none-any.whl", hash = "sha256:7b4df175cdb08ba400f45cae3bdcae7ba8365db4d165fc65fd04b050ab63b46b", size = 79500, upload-time = "2022-12-08T20:59:19.686Z" }, +] + [[package]] name = "openpyxl" version = "3.1.5" @@ -3867,6 +4167,49 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/51/e5/fecf13f06e5e5f67e8837d777d1bc43fac0ed2b77a676804df5c34744727/python_json_logger-4.0.0-py3-none-any.whl", hash = "sha256:af09c9daf6a813aa4cc7180395f50f2a9e5fa056034c9953aec92e381c5ba1e2", size = 15548, upload-time = "2025-10-06T04:15:17.553Z" }, ] +[[package]] +name = "pytorch-lightning" +version = "2.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fsspec", extra = ["http"] }, + { name = "lightning-utilities" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "torch", version = "2.10.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.10.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform != 'darwin'" }, + { name = "torchmetrics" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8b/ac/ebd5f6f58691cbd4f73836e43e1727f3814311b960c41f88e259606ca2b2/pytorch_lightning-2.6.1.tar.gz", hash = "sha256:ba08f8901cf226fcca473046ad9346f414e99117762dc869c76e650d5b3d7bdc", size = 665563, upload-time = "2026-01-30T14:59:11.636Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0e/93/c8c361bf0a2fe50f828f32def460e8b8a14b93955d3fd302b1a9b63b19e4/pytorch_lightning-2.6.1-py3-none-any.whl", hash = "sha256:1f8118567ec829e3055f16cf1aa320883a86a47c836951bfd9dcfa34ec7ffd59", size = 857273, upload-time = "2026-01-30T14:59:10.141Z" }, +] + +[[package]] +name = "pytorch-tabular" +version = "1.2.0" +source = { registry = 
"https://pypi.org/simple" } +dependencies = [ + { name = "einops" }, + { name = "numpy" }, + { name = "omegaconf" }, + { name = "pandas" }, + { name = "pytorch-lightning" }, + { name = "rich" }, + { name = "scikit-base" }, + { name = "scikit-learn" }, + { name = "scipy" }, + { name = "torch", version = "2.10.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.10.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform != 'darwin'" }, + { name = "torchmetrics" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0b/f2/823de16d6a461504f4ed8e4a555d6ce356e5f81e6525d95e2b64895ec94f/pytorch_tabular-1.2.0.tar.gz", hash = "sha256:1b96b576eb3de443840b313d0b298293eaf83dcfdbba53ed8974b76d1351b821", size = 2312825, upload-time = "2026-01-26T21:48:22.577Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6f/c9/1e01c682e2ad7132bc1943d8d367c96f241bf85679e76d66eb0c4e4cbde9/pytorch_tabular-1.2.0-py3-none-any.whl", hash = "sha256:0a59f8a2304856b3d1e905f7b66153ebc65df1a6a017f2c8a13a29f62dc95b26", size = 165800, upload-time = "2026-01-26T21:48:21.195Z" }, +] + [[package]] name = "pytz" version = "2025.2" @@ -4045,6 +4388,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, ] +[[package]] +name = "rich" +version = "14.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582, 
upload-time = "2026-02-19T17:23:12.474Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458, upload-time = "2026-02-19T17:23:13.732Z" }, +] + [[package]] name = "rpds-py" version = "0.28.0" @@ -4164,6 +4520,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/48/f0/ae7ca09223a81a1d890b2557186ea015f6e0502e9b8cb8e1813f1d8cfa4e/s3transfer-0.14.0-py3-none-any.whl", hash = "sha256:ea3b790c7077558ed1f02a3072fb3cb992bbbd253392f4b6e9e8976941c7d456", size = 85712, upload-time = "2025-09-09T19:23:30.041Z" }, ] +[[package]] +name = "scikit-base" +version = "0.13.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/56/a8/610f99f01f326178b8a7347db2ede654b42548e9697b516480cc081e344d/scikit_base-0.13.1.tar.gz", hash = "sha256:169e5427233f7237b38c7d858bf07b8a86bbf59feccf0708e26dad4ac312c593", size = 134482, upload-time = "2026-01-25T11:31:38.814Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/55/c20d8319aab037e11f1d6403b6102d1041694abe24a3aa4a1e27f2cdb9f2/scikit_base-0.13.1-py3-none-any.whl", hash = "sha256:1aca86759435fd2d32d83a526ce11095119c0745e4e5dd91f2e5820023ca8e39", size = 159779, upload-time = "2026-01-25T11:31:36.759Z" }, +] + [[package]] name = "scikit-learn" version = "1.7.2" @@ -4228,7 +4593,7 @@ wheels = [ [[package]] name = "scythe-engine" version = "0.1.2" -source = { registry = "https://pypi.org/simple" } +source = { git = "https://github.com/szvsw/scythe?branch=feature%2Fallow-versioning-workflows#54e0668df5ab4741d05925c3b5dddff39ff4c9e6" } dependencies = [ { name = "boto3" }, { name = "fastparquet" }, @@ -4241,10 +4606,6 @@ dependencies = [ { name = "tables" }, { name = "tqdm" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/ac/00/061a3e1301b03b3b1c6817ea5db19cc62c0448c02c504e391d49273451c2/scythe_engine-0.1.2.tar.gz", hash = "sha256:a53c49a8a8700f1dfd7a61f4868898289c1d3751b42ca767369faf7a3c08dc5e", size = 225628, upload-time = "2026-02-12T15:53:02.416Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d3/69/1cfac0fe0aa049d335f2ff6a3aeef32cc7893551ffe831e4d78ccde50b7b/scythe_engine-0.1.2-py3-none-any.whl", hash = "sha256:b2dd6924c0b26a1dfe9a68e9f6b028b77a944263849c82d41a28e635baf899d8", size = 33195, upload-time = "2026-02-12T15:53:00.827Z" }, -] [[package]] name = "seaborn" @@ -4269,6 +4630,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1c/78/504fdd027da3b84ff1aecd9f6957e65f35134534ccc6da8628eb71e76d3f/send2trash-2.1.0-py3-none-any.whl", hash = "sha256:0da2f112e6d6bb22de6aa6daa7e144831a4febf2a87261451c4ad849fe9a873c", size = 17610, upload-time = "2026-01-14T06:27:35.218Z" }, ] +[[package]] +name = "sentry-sdk" +version = "2.54.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c8/e9/2e3a46c304e7fa21eaa70612f60354e32699c7102eb961f67448e222ad7c/sentry_sdk-2.54.0.tar.gz", hash = "sha256:2620c2575128d009b11b20f7feb81e4e4e8ae08ec1d36cbc845705060b45cc1b", size = 413813, upload-time = "2026-03-02T15:12:41.355Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/53/39/be412cc86bc6247b8f69e9383d7950711bd86f8d0a4a4b0fe8fad685bc21/sentry_sdk-2.54.0-py2.py3-none-any.whl", hash = "sha256:fd74e0e281dcda63afff095d23ebcd6e97006102cdc8e78a29f19ecdf796a0de", size = 439198, upload-time = "2026-03-02T15:12:39.546Z" }, +] + [[package]] name = "setuptools" version = "80.9.0" @@ -4429,6 +4803,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/39/60/868371b6482ccd9ef423c6f62650066cf8271fdb2ee84f192695ad6b7a96/streamlit-1.51.0-py3-none-any.whl", hash = 
"sha256:4008b029f71401ce54946bb09a6a3e36f4f7652cbb48db701224557738cfda38", size = 10171702, upload-time = "2025-10-29T17:07:35.97Z" }, ] +[[package]] +name = "sympy" +version = "1.14.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mpmath" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, +] + [[package]] name = "tables" version = "3.10.2" @@ -4473,13 +4859,43 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" }, ] +[[package]] +name = "tensorboard" +version = "2.20.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "absl-py" }, + { name = "grpcio" }, + { name = "markdown" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pillow" }, + { name = "protobuf" }, + { name = "setuptools" }, + { name = "tensorboard-data-server" }, + { name = "werkzeug" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/d9/a5db55f88f258ac669a92858b70a714bbbd5acd993820b41ec4a96a4d77f/tensorboard-2.20.0-py3-none-any.whl", hash = "sha256:9dc9f978cb84c0723acf9a345d96c184f0293d18f166bb8d59ee098e6cfaaba6", size = 5525680, upload-time = "2025-07-17T19:20:49.638Z" }, +] + +[[package]] +name = 
"tensorboard-data-server" +version = "0.7.2" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/13/e503968fefabd4c6b2650af21e110aa8466fe21432cd7c43a84577a89438/tensorboard_data_server-0.7.2-py3-none-any.whl", hash = "sha256:7e0610d205889588983836ec05dc098e80f97b7e7bbff7e994ebb78f578d0ddb", size = 2356, upload-time = "2023-10-23T21:23:32.16Z" }, + { url = "https://files.pythonhosted.org/packages/b7/85/dabeaf902892922777492e1d253bb7e1264cadce3cea932f7ff599e53fea/tensorboard_data_server-0.7.2-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:9fe5d24221b29625dbc7328b0436ca7fc1c23de4acf4d272f1180856e32f9f60", size = 4823598, upload-time = "2023-10-23T21:23:33.714Z" }, + { url = "https://files.pythonhosted.org/packages/73/c6/825dab04195756cf8ff2e12698f22513b3db2f64925bdd41671bfb33aaa5/tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:ef687163c24185ae9754ed5650eb5bc4d84ff257aabdc33f0cc6f74d8ba54530", size = 6590363, upload-time = "2023-10-23T21:23:35.583Z" }, +] + [[package]] name = "terminado" version = "0.18.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "ptyprocess", marker = "os_name != 'nt'" }, - { name = "pywinpty", marker = "os_name == 'nt'" }, + { name = "pywinpty", marker = "os_name == 'nt' and sys_platform != 'darwin' and sys_platform != 'linux'" }, { name = "tornado" }, ] sdist = { url = "https://files.pythonhosted.org/packages/8a/11/965c6fd8e5cc254f1fe142d547387da17a8ebfd75a3455f637c663fb38a0/terminado-0.18.1.tar.gz", hash = "sha256:de09f2c4b85de4765f7714688fff57d3e75bad1f909b589fde880460c753fd2e", size = 32701, upload-time = "2024-03-12T14:34:39.026Z" } @@ -4573,6 +4989,103 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bd/75/8539d011f6be8e29f339c42e633aae3cb73bffa95dd0f9adec09b9c58e85/tomlkit-0.13.3-py3-none-any.whl", hash = "sha256:c89c649d79ee40629a9fda55f8ace8c6a1b42deb912b2a8fd8d942ddadb606b0", size = 38901, 
upload-time = "2025-06-05T07:13:43.546Z" }, ] +[[package]] +name = "torch" +version = "2.10.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.14' and sys_platform == 'darwin'", + "python_full_version < '3.14' and sys_platform == 'darwin'", +] +dependencies = [ + { name = "filelock", marker = "sys_platform == 'darwin'" }, + { name = "fsspec", marker = "sys_platform == 'darwin'" }, + { name = "jinja2", marker = "sys_platform == 'darwin'" }, + { name = "networkx", marker = "sys_platform == 'darwin'" }, + { name = "setuptools", marker = "sys_platform == 'darwin'" }, + { name = "sympy", marker = "sys_platform == 'darwin'" }, + { name = "typing-extensions", marker = "sys_platform == 'darwin'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/d3/54/a2ba279afcca44bbd320d4e73675b282fcee3d81400ea1b53934efca6462/torch-2.10.0-2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:13ec4add8c3faaed8d13e0574f5cd4a323c11655546f91fbe6afa77b57423574", size = 79498202, upload-time = "2026-02-10T21:44:52.603Z" }, + { url = "https://files.pythonhosted.org/packages/ec/23/2c9fe0c9c27f7f6cb865abcea8a4568f29f00acaeadfc6a37f6801f84cb4/torch-2.10.0-2-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:e521c9f030a3774ed770a9c011751fb47c4d12029a3d6522116e48431f2ff89e", size = 79498254, upload-time = "2026-02-10T21:44:44.095Z" }, + { url = "https://files.pythonhosted.org/packages/c9/5c/dee910b87c4d5c0fcb41b50839ae04df87c1cfc663cf1b5fca7ea565eeaa/torch-2.10.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:6d3707a61863d1c4d6ebba7be4ca320f42b869ee657e9b2c21c736bf17000294", size = 79498198, upload-time = "2026-01-21T16:24:34.704Z" }, + { url = "https://files.pythonhosted.org/packages/1a/0b/39929b148f4824bc3ad6f9f72a29d4ad865bcf7ebfc2fa67584773e083d2/torch-2.10.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:3202429f58309b9fa96a614885eace4b7995729f44beb54d3e4a47773649d382", size = 79851305, upload-time = 
"2026-01-21T16:24:09.209Z" }, + { url = "https://files.pythonhosted.org/packages/0e/13/e76b4d9c160e89fff48bf16b449ea324bda84745d2ab30294c37c2434c0d/torch-2.10.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:cdf2a523d699b70d613243211ecaac14fe9c5df8a0b0a9c02add60fb2a413e0f", size = 79498248, upload-time = "2026-01-21T16:23:09.315Z" }, + { url = "https://files.pythonhosted.org/packages/4f/93/716b5ac0155f1be70ed81bacc21269c3ece8dba0c249b9994094110bfc51/torch-2.10.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:bf0d9ff448b0218e0433aeb198805192346c4fd659c852370d5cc245f602a06a", size = 79464992, upload-time = "2026-01-21T16:23:05.162Z" }, + { url = "https://files.pythonhosted.org/packages/d8/94/71994e7d0d5238393df9732fdab607e37e2b56d26a746cb59fdb415f8966/torch-2.10.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:f5ab4ba32383061be0fb74bda772d470140a12c1c3b58a0cfbf3dae94d164c28", size = 79850324, upload-time = "2026-01-21T16:22:09.494Z" }, +] + +[[package]] +name = "torch" +version = "2.10.0+cu128" +source = { registry = "https://download.pytorch.org/whl/cu128" } +resolution-markers = [ + "python_full_version >= '3.14' and sys_platform == 'linux'", + "python_full_version < '3.14' and sys_platform == 'linux'", + "python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux'", + "python_full_version < '3.14' and sys_platform != 'darwin' and sys_platform != 'linux'", +] +dependencies = [ + { name = "cuda-bindings", marker = "sys_platform == 'linux'" }, + { name = "filelock", marker = "sys_platform != 'darwin'" }, + { name = "fsspec", marker = "sys_platform != 'darwin'" }, + { name = "jinja2", marker = "sys_platform != 'darwin'" }, + { name = "networkx", marker = "sys_platform != 'darwin'" }, + { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "sys_platform == 'linux'" }, + { name = 
"nvidia-cuda-runtime-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cufile-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusolver-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvshmem-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", marker = "sys_platform == 'linux'" }, + { name = "setuptools", marker = "sys_platform != 'darwin'" }, + { name = "sympy", marker = "sys_platform != 'darwin'" }, + { name = "triton", marker = "sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "sys_platform != 'darwin'" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6f09cdf2415516be028ae82e6b985bcfc3eac37bc52ab401142689f6224516ca" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:628e89bd5110ced7debee2a57c69959725b7fbc64eab81a39dd70e46c7e28ba5" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp312-cp312-win_amd64.whl", hash = "sha256:fbde8f6a9ec8c76979a0d14df21c10b9e5cab6f0d106a73ca73e2179bc597cae" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:bdbcc703382f948e951c063448c9406bf38ce66c41dd698d9e2733fcf96c037a" }, + { url = 
"https://download.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:7b4bd23ed63de97456fcc81c26fea9f02ee02ce1112111c4dac0d8cfe574b23e" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp313-cp313-win_amd64.whl", hash = "sha256:4d1b0b49c54223c7c04050b49eac141d77b6edbc34aea1dfc74a6fdb661baa8c" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:f1f8b840c64b645a4bc61a393db48effb9c92b2dc26c8373873911f0750d1ea7" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:23f58258012bcf1c349cb22af387e33aadca7f83ea617b080e774eb41e4fe8ff" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp313-cp313t-win_amd64.whl", hash = "sha256:01b216e097b17a5277cfb47c383cdcacf06abeadcb0daca0c76b59e72854c3b6" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:c42377bc2607e3e1c60da71b792fb507c3938c87fd6edab8b21c59c91473c36d" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:37d71feea068776855686a1512058df3f19f6f040a151f055aa746601678744f" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp314-cp314-win_amd64.whl", hash = "sha256:c57017ca29e62271e362fdeee7d20070e254755a5148b30b553d8a10fc83c7ef" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:777461f50b2daf77e4bdd8e2ad34bdfc5a993bf1bdf2ab9ef39f5edfe4e9c12b" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:7bcba6a7c5f0987a13298b1ca843155dcceceac758fa3c7ccd5c7af4059a1080" }, + { url = 
"https://download.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp314-cp314t-win_amd64.whl", hash = "sha256:70d89143c956389d4806cb4e5fe0b1129fe0db280e1073288d17fa76c101cba4" }, +] + +[[package]] +name = "torchmetrics" +version = "1.8.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "lightning-utilities" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "torch", version = "2.10.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.10.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform != 'darwin'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/85/2e/48a887a59ecc4a10ce9e8b35b3e3c5cef29d902c4eac143378526e7485cb/torchmetrics-1.8.2.tar.gz", hash = "sha256:cf64a901036bf107f17a524009eea7781c9c5315d130713aeca5747a686fe7a5", size = 580679, upload-time = "2025-09-03T14:00:54.077Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/21/aa0f434434c48490f91b65962b1ce863fdcce63febc166ca9fe9d706c2b6/torchmetrics-1.8.2-py3-none-any.whl", hash = "sha256:08382fd96b923e39e904c4d570f3d49e2cc71ccabd2a94e0f895d1f0dac86242", size = 983161, upload-time = "2025-09-03T14:00:51.921Z" }, +] + [[package]] name = "tornado" version = "6.5.2" @@ -4625,6 +5138,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/61/7a/f38385f1b2d5f54221baf1db3d6371dc6eef8041d95abff39576c694e9d9/transforms3d-0.4.2-py3-none-any.whl", hash = "sha256:1c70399d9e9473ecc23311fd947f727f7c69ed0b063244828c383aa1aefa5941", size = 1376759, upload-time = "2024-06-20T11:09:19.43Z" }, ] +[[package]] +name = "triton" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/5d/08201db32823bdf77a0e2b9039540080b2e5c23a20706ddba942924ebcd6/triton-3.6.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:374f52c11a711fd062b4bfbb201fd9ac0a5febd28a96fb41b4a0f51dde3157f4", size = 176128243, upload-time = "2026-01-20T16:16:07.857Z" }, + { url = "https://files.pythonhosted.org/packages/ab/a8/cdf8b3e4c98132f965f88c2313a4b493266832ad47fb52f23d14d4f86bb5/triton-3.6.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74caf5e34b66d9f3a429af689c1c7128daba1d8208df60e81106b115c00d6fca", size = 188266850, upload-time = "2026-01-20T16:00:43.041Z" }, + { url = "https://files.pythonhosted.org/packages/3c/12/34d71b350e89a204c2c7777a9bba0dcf2f19a5bfdd70b57c4dbc5ffd7154/triton-3.6.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:448e02fe6dc898e9e5aa89cf0ee5c371e99df5aa5e8ad976a80b93334f3494fd", size = 176133521, upload-time = "2026-01-20T16:16:13.321Z" }, + { url = "https://files.pythonhosted.org/packages/f9/0b/37d991d8c130ce81a8728ae3c25b6e60935838e9be1b58791f5997b24a54/triton-3.6.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10c7f76c6e72d2ef08df639e3d0d30729112f47a56b0c81672edc05ee5116ac9", size = 188289450, upload-time = "2026-01-20T16:00:49.136Z" }, + { url = "https://files.pythonhosted.org/packages/ce/4e/41b0c8033b503fd3cfcd12392cdd256945026a91ff02452bef40ec34bee7/triton-3.6.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1722e172d34e32abc3eb7711d0025bb69d7959ebea84e3b7f7a341cd7ed694d6", size = 176276087, upload-time = "2026-01-20T16:16:18.989Z" }, + { url = "https://files.pythonhosted.org/packages/35/f8/9c66bfc55361ec6d0e4040a0337fb5924ceb23de4648b8a81ae9d33b2b38/triton-3.6.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d002e07d7180fd65e622134fbd980c9a3d4211fb85224b56a0a0efbd422ab72f", size = 188400296, upload-time = "2026-01-20T16:00:56.042Z" }, + { url = 
"https://files.pythonhosted.org/packages/49/55/5ecf0dcaa0f2fbbd4420f7ef227ee3cb172e91e5fede9d0ecaddc43363b4/triton-3.6.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ef5523241e7d1abca00f1d240949eebdd7c673b005edbbce0aca95b8191f1d43", size = 176138577, upload-time = "2026-01-20T16:16:25.426Z" }, + { url = "https://files.pythonhosted.org/packages/df/3d/9e7eee57b37c80cec63322c0231bb6da3cfe535a91d7a4d64896fcb89357/triton-3.6.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a17a5d5985f0ac494ed8a8e54568f092f7057ef60e1b0fa09d3fd1512064e803", size = 188273063, upload-time = "2026-01-20T16:01:07.278Z" }, + { url = "https://files.pythonhosted.org/packages/48/db/56ee649cab5eaff4757541325aca81f52d02d4a7cd3506776cad2451e060/triton-3.6.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b3a97e8ed304dfa9bd23bb41ca04cdf6b2e617d5e782a8653d616037a5d537d", size = 176274804, upload-time = "2026-01-20T16:16:31.528Z" }, + { url = "https://files.pythonhosted.org/packages/f6/56/6113c23ff46c00aae423333eb58b3e60bdfe9179d542781955a5e1514cb3/triton-3.6.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:46bd1c1af4b6704e554cad2eeb3b0a6513a980d470ccfa63189737340c7746a7", size = 188397994, upload-time = "2026-01-20T16:01:14.236Z" }, +] + [[package]] name = "tsam" version = "2.3.9" @@ -4726,6 +5256,35 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6a/2a/dc2228b2888f51192c7dc766106cd475f1b768c10caaf9727659726f7391/virtualenv-20.36.1-py3-none-any.whl", hash = "sha256:575a8d6b124ef88f6f51d56d656132389f961062a9177016a50e4f507bbcc19f", size = 6008258, upload-time = "2026-01-09T18:20:59.425Z" }, ] +[[package]] +name = "wandb" +version = "0.25.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "gitpython" }, + { name = "packaging" }, + { name = "platformdirs" }, + { name = "protobuf" }, + { name = "pydantic" }, + { 
name = "pyyaml" }, + { name = "requests" }, + { name = "sentry-sdk" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fd/60/d94952549920469524b689479c864c692ca47eca4b8c2fe3389b64a58778/wandb-0.25.0.tar.gz", hash = "sha256:45840495a288e34245d69d07b5a0b449220fbc5b032e6b51c4f92ec9026d2ad1", size = 43951335, upload-time = "2026-02-13T00:17:45.515Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/7d/0c131db3ec9deaabbd32263d90863cbfbe07659527e11c35a5c738cecdc5/wandb-0.25.0-py3-none-macosx_12_0_arm64.whl", hash = "sha256:5eecb3c7b5e60d1acfa4b056bfbaa0b79a482566a9db58c9f99724b3862bc8e5", size = 23287536, upload-time = "2026-02-13T00:17:20.265Z" }, + { url = "https://files.pythonhosted.org/packages/c3/95/31bb7f76a966ec87495e5a72ac7570685be162494c41757ac871768dbc4f/wandb-0.25.0-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:daeedaadb183dc466e634fba90ab2bab1d4e93000912be0dee95065a0624a3fd", size = 25196062, upload-time = "2026-02-13T00:17:23.356Z" }, + { url = "https://files.pythonhosted.org/packages/d9/a1/258cdedbf30cebc692198a774cf0ef945b7ed98ee64bdaf62621281c95d8/wandb-0.25.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:5e0127dbcef13eea48f4b84268da7004d34d3120ebc7b2fa9cefb72b49dbb825", size = 22799744, upload-time = "2026-02-13T00:17:26.437Z" }, + { url = "https://files.pythonhosted.org/packages/de/91/ec9465d014cfd199c5b2083d271d31b3c2aedeae66f3d8a0712f7f54bdf3/wandb-0.25.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:6c4c38077836f9b7569a35b0e1dcf1f0c43616fcd936d182f475edbfea063665", size = 25262839, upload-time = "2026-02-13T00:17:28.8Z" }, + { url = "https://files.pythonhosted.org/packages/c7/95/cb2d1c7143f534544147fb53fe87944508b8cb9a058bc5b6f8a94adbee15/wandb-0.25.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:6edd8948d305cb73745bf564b807bd73da2ccbd47c548196b8a362f7df40aed8", size = 22853714, upload-time = "2026-02-13T00:17:31.68Z" }, + { url = 
"https://files.pythonhosted.org/packages/d7/94/68163f70c1669edcf130822aaaea782d8198b5df74443eca0085ec596774/wandb-0.25.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:ada6f08629bb014ad6e0a19d5dec478cdaa116431baa3f0a4bf4ab8d9893611f", size = 25358037, upload-time = "2026-02-13T00:17:34.676Z" }, + { url = "https://files.pythonhosted.org/packages/cc/fb/9578eed2c01b2fc6c8b693da110aa9c73a33d7bb556480f5cfc42e48c94e/wandb-0.25.0-py3-none-win32.whl", hash = "sha256:020b42ca4d76e347709d65f59b30d4623a115edc28f462af1c92681cb17eae7c", size = 24604118, upload-time = "2026-02-13T00:17:37.641Z" }, + { url = "https://files.pythonhosted.org/packages/25/97/460f6cb738aaa39b4eb2e6b4c630b2ae4321cdd70a79d5955ea75a878981/wandb-0.25.0-py3-none-win_amd64.whl", hash = "sha256:78307ac0b328f2dc334c8607bec772851215584b62c439eb320c4af4fb077a00", size = 24604122, upload-time = "2026-02-13T00:17:39.991Z" }, + { url = "https://files.pythonhosted.org/packages/27/6c/5847b4dda1dfd52630dac08711d4348c69ed657f0698fc2d949c7f7a6622/wandb-0.25.0-py3-none-win_arm64.whl", hash = "sha256:c6174401fd6fb726295e98d57b4231c100eca96bd17de51bfc64038a57230aaf", size = 21785298, upload-time = "2026-02-13T00:17:42.475Z" }, +] + [[package]] name = "watchdog" version = "6.0.0" @@ -4777,6 +5336,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/34/db/b10e48aa8fff7407e67470363eac595018441cf32d5e1001567a7aeba5d2/websocket_client-1.9.0-py3-none-any.whl", hash = "sha256:af248a825037ef591efbf6ed20cc5faa03d3b47b9e5a2230a529eeee1c1fc3ef", size = 82616, upload-time = "2025-10-07T21:16:34.951Z" }, ] +[[package]] +name = "werkzeug" +version = "3.1.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/61/f1/ee81806690a87dab5f5653c1f146c92bc066d7f4cebc603ef88eb9e13957/werkzeug-3.1.6.tar.gz", hash = "sha256:210c6bede5a420a913956b4791a7f4d6843a43b6fcee4dfa08a65e93007d0d25", size = 864736, upload-time = 
"2026-02-19T15:17:18.884Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/ec/d58832f89ede95652fd01f4f24236af7d32b70cab2196dfcc2d2fd13c5c2/werkzeug-3.1.6-py3-none-any.whl", hash = "sha256:7ddf3357bb9564e407607f988f683d72038551200c704012bb9a4c523d42f131", size = 225166, upload-time = "2026-02-19T15:17:17.475Z" }, +] + [[package]] name = "widgetsnbextension" version = "4.0.15" @@ -4786,6 +5357,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3f/0e/fa3b193432cfc60c93b42f3be03365f5f909d2b3ea410295cf36df739e31/widgetsnbextension-4.0.15-py3-none-any.whl", hash = "sha256:8156704e4346a571d9ce73b84bee86a29906c9abfd7223b7228a28899ccf3366", size = 2196503, upload-time = "2025-11-01T21:15:53.565Z" }, ] +[[package]] +name = "xgboost" +version = "3.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "nvidia-nccl-cu12", marker = "sys_platform == 'linux'" }, + { name = "scipy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/91/bb/1eb0242409d22db725d7a88088e6cfd6556829fb0736f9ff69aa9f1e9455/xgboost-3.2.0.tar.gz", hash = "sha256:99b0e9a2a64896cdaf509c5e46372d336c692406646d20f2af505003c0c5d70d", size = 1263936, upload-time = "2026-02-10T11:03:05.542Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2d/49/6e4cdd877c24adf56cb3586bc96d93d4dcd780b5ea1efb32e1ee0de08bae/xgboost-3.2.0-py3-none-macosx_10_15_x86_64.whl", hash = "sha256:2f661966d3e322536d9c448090a870fcba1e32ee5760c10b7c46bac7a342079a", size = 2507014, upload-time = "2026-02-10T10:50:57.44Z" }, + { url = "https://files.pythonhosted.org/packages/93/f1/c09ef1add609453aa3ba5bafcd0d1c1a805c1263c0b60138ec968f8ec296/xgboost-3.2.0-py3-none-macosx_12_0_arm64.whl", hash = "sha256:eabbd40d474b8dbf6cb3536325f9150b9e6f0db32d18de9914fb3227d0bef5b7", size = 2328527, upload-time = "2026-02-10T10:51:17.502Z" }, + { url = 
"https://files.pythonhosted.org/packages/96/9f/d9914a7b8df842832850b1a18e5f47aaa071c217cdd1da2ae9deb291018b/xgboost-3.2.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:852eabc6d3b3702a59bf78dbfdcd1cb9c4d3a3b6e5ed1f8781d8b9512354fdd2", size = 131100954, upload-time = "2026-02-10T11:02:42.704Z" }, + { url = "https://files.pythonhosted.org/packages/79/98/679de17c2caa4fd3b0b4386ecf7377301702cb0afb22930a07c142fcb1d8/xgboost-3.2.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:99b4a6bbcb47212fec5cf5fbe12347215f073c08967431b0122cfbd1ee70312c", size = 131748579, upload-time = "2026-02-10T10:54:40.424Z" }, + { url = "https://files.pythonhosted.org/packages/1f/3d/1661dd114a914a67e3f7ab66fa1382e7599c2a8c340f314ad30a3e2b4d08/xgboost-3.2.0-py3-none-win_amd64.whl", hash = "sha256:0d169736fd836fc13646c7ab787167b3a8110351c2c6bc770c755ee1618f0442", size = 101681668, upload-time = "2026-02-10T10:59:31.202Z" }, +] + [[package]] name = "xlsxwriter" version = "3.2.9"