-
Notifications
You must be signed in to change notification settings - Fork 0
Description
Describe the bug
SageMaker Local lets you train models using data either on your local machine or S3. When using ModelTrainer, these artifacts are stored in a local directory under local_container_root, where for each channel, a TemporaryDirectory is created and files are copied there.
However, I've noticed that if your input data is a single S3 file, the temporary directory is not created, so the download fails with a "file or directory not found" error.
I found that adding an `os.makedirs(local_dir, exist_ok=True)` call after this line fixes the problem:
https://github.com/aws/sagemaker-python-sdk/blob/96e49ba6292c25e8569303d2a003812bbc9d7ba1/src/sagemaker/modules/local_core/local_container.py#L551
but there's probably a cleaner way.
To reproduce
trainer = ModelTrainer(
training_mode=Mode.LOCAL_CONTAINER,
local_directory_root='.smlocal',
...
)
trainer.train(
input_data_config=[
InputData(channel_name="input", data_source="s3://my-bucket/my-single-file.csv")
],
)
Expected behavior
The files are downloaded locally
Screenshots or logs
See stack trace below:
Stack trace (click to expand)
│ ❱ 153 │ trainer = start_training_job( │
│ 154 │ │ num=num, │
│ 155 │ │ data_config=input_data, │
│ 156 │ │ config=config_dict, │
│ │
│ /Users/aigars/workspace/capture-v2/sm/main.py:76 in start_training_job │
│ │
│ 73 │ │ # instance_count=1, │
│ 74 │ │ # metric_definitions=METRIC_DEFINITIONS, │
│ 75 │ ) │
│ ❱ 76 │ trainer.train( │
│ 77 │ │ input_data_config=data_config, │
│ 78 │ │ wait=not run_in_background, # whether to wait for job to finish │
│ 79 │ │ logs=not run_in_background, # whether to log job logs to stdout │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/sagemaker/te │
│ lemetry/telemetry_logging.py:175 in wrapper │
│ │
│ 172 │ │ │ │ │ "sagemaker_session is not provided or not valid.", │
│ 173 │ │ │ │ │ func_name, │
│ 174 │ │ │ │ ) │
│ ❱ 175 │ │ │ │ return func(*args, **kwargs) │
│ 176 │ │ │
│ 177 │ │ return wrapper │
│ 178 │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/pydantic/val │
│ idate_call_decorator.py:60 in wrapper_function │
│ │
│ 57 │ │ │
│ 58 │ │ @functools.wraps(function) │
│ 59 │ │ def wrapper_function(*args, **kwargs): │
│ ❱ 60 │ │ │ return validate_call_wrapper(*args, **kwargs) │
│ 61 │ │ │
│ 62 │ │ wrapper_function.raw_function = function # type: ignore │
│ 63 │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/pydantic/_in │
│ ternal/_validate_call.py:96 in __call__ │
│ │
│ 93 │ │ │ self.__return_pydantic_validator__ = None │
│ 94 │ │
│ 95 │ def __call__(self, *args: Any, **kwargs: Any) -> Any: │
│ ❱ 96 │ │ res = self.__pydantic_validator__.validate_python(pydantic_core.ArgsKwargs(args, │
│ 97 │ │ if self.__return_pydantic_validator__: │
│ 98 │ │ │ return self.__return_pydantic_validator__(res) │
│ 99 │ │ return res │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/sagemaker/mo │
│ dules/train/model_trainer.py:649 in train │
│ │
│ 646 │ │ │ │ hyper_parameters=string_hyper_parameters, │
│ 647 │ │ │ │ environment=self.environment, │
│ 648 │ │ │ ) │
│ ❱ 649 │ │ │ local_container.train(wait) │
│ 650 │ │
│ 651 │ def create_input_data_channel( │
│ 652 │ │ self, channel_name: str, data_source: DataSourceType, key_prefix: Optional[str] │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/sagemaker/mo │
│ dules/local_core/local_container.py:166 in train │
│ │
│ 163 │ │ │
│ 164 │ │ data_dir = os.path.join(self.container_root, "input", "data") │
│ 165 │ │ os.makedirs(data_dir, exist_ok=True) │
│ ❱ 166 │ │ volumes = self._prepare_training_volumes( │
│ 167 │ │ │ data_dir, self.input_data_config, self.hyper_parameters │
│ 168 │ │ ) │
│ 169 │ │ # If local, source directory needs to be updated to mounted /opt/ml/code path │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/sagemaker/mo │
│ dules/local_core/local_container.py:519 in _prepare_training_volumes │
│ │
│ 516 │ │ │ channel_dir = os.path.join(data_dir, channel_name) │
│ 517 │ │ │ os.makedirs(channel_dir, exist_ok=True) │
│ 518 │ │ │ │
│ ❱ 519 │ │ │ data_source_local_path = self._get_data_source_local_path(channel.data_sourc │
│ 520 │ │ │ volumes.append(_Volume(data_source_local_path, channel=channel_name).map) │
│ 521 │ │ │
│ 522 │ │ # If there is a training script directory and it is a local directory, │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/sagemaker/mo │
│ dules/local_core/local_container.py:555 in _get_data_source_local_path │
│ │
│ 552 │ │ │ # make sure local_dir exists │
│ 553 │ │ │ # os.makedirs(local_dir, exist_ok=True) │
│ 554 │ │ │ self._temporary_folders.append(local_dir) │
│ ❱ 555 │ │ │ download_folder(parsed_uri.netloc, parsed_uri.path, local_dir, self.sagemake │
│ 556 │ │ │ return local_dir │
│ 557 │ │ else: │
│ 558 │ │ │ return os.path.abspath(data_source.file_system_data_source.directory_path) │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/sagemaker/ut │
│ ils.py:410 in download_folder │
│ │
│ 407 │ if not prefix.endswith("/"): │
│ 408 │ │ try: │
│ 409 │ │ │ file_destination = os.path.join(target, os.path.basename(prefix)) │
│ ❱ 410 │ │ │ s3.Object(bucket_name, prefix).download_file(file_destination) │
│ 411 │ │ │ return │
│ 412 │ │ except botocore.exceptions.ClientError as e: │
│ 413 │ │ │ err_info = e.response["Error"] │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/boto3/s3/inj │
│ ect.py:361 in object_download_file │
│ │
│ 358 │ :param Config: The transfer configuration to be used when performing the │
│ 359 │ │ transfer. │
│ 360 │ """ │
│ ❱ 361 │ return self.meta.client.download_file( │
│ 362 │ │ Bucket=self.bucket_name, │
│ 363 │ │ Key=self.key, │
│ 364 │ │ Filename=Filename, │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/boto3/s3/inj │
│ ect.py:192 in download_file │
│ │
│ 189 │ │ transfer. │
│ 190 │ """ │
│ 191 │ with S3Transfer(self, Config) as transfer: │
│ ❱ 192 │ │ return transfer.download_file( │
│ 193 │ │ │ bucket=Bucket, │
│ 194 │ │ │ key=Key, │
│ 195 │ │ │ filename=Filename, │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/boto3/s3/tra │
│ nsfer.py:406 in download_file │
│ │
│ 403 │ │ │ bucket, key, filename, extra_args, subscribers │
│ 404 │ │ ) │
│ 405 │ │ try: │
│ ❱ 406 │ │ │ future.result() │
│ 407 │ │ # This is for backwards compatibility where when retries are │
│ 408 │ │ # exceeded we need to throw the same error from boto3 instead of │
│ 409 │ │ # s3transfer's built in RetriesExceededError as current users are │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/s3transfer/f │
│ utures.py:103 in result │
│ │
│ 100 │ │ │ # Usually the result() method blocks until the transfer is done, │
│ 101 │ │ │ # however if a KeyboardInterrupt is raised we want want to exit │
│ 102 │ │ │ # out of this and propagate the exception. │
│ ❱ 103 │ │ │ return self._coordinator.result() │
│ 104 │ │ except KeyboardInterrupt as e: │
│ 105 │ │ │ self.cancel() │
│ 106 │ │ │ raise e │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/s3transfer/f │
│ utures.py:264 in result │
│ │
│ 261 │ │ # Once done waiting, raise an exception if present or return the │
│ 262 │ │ # final result. │
│ 263 │ │ if self._exception: │
│ ❱ 264 │ │ │ raise self._exception │
│ 265 │ │ return self._result │
│ 266 │ │
│ 267 │ def cancel(self, msg='', exc_type=CancelledError): │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/s3transfer/t │
│ asks.py:135 in __call__ │
│ │
│ 132 │ │ │ # task to the TransferFuture had failed) then execute the task's │
│ 133 │ │ │ # main() method. │
│ 134 │ │ │ if not self._transfer_coordinator.done(): │
│ ❱ 135 │ │ │ │ return self._execute_main(kwargs) │
│ 136 │ │ except Exception as e: │
│ 137 │ │ │ self._log_and_set_exception(e) │
│ 138 │ │ finally: │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/s3transfer/t │
│ asks.py:158 in _execute_main │
│ │
│ 155 │ │ # Log what is about to be executed. │
│ 156 │ │ logger.debug(f"Executing task {self} with kwargs {kwargs_to_display}") │
│ 157 │ │ │
│ ❱ 158 │ │ return_value = self._main(**kwargs) │
│ 159 │ │ # If the task is the final task, then set the TransferFuture's │
│ 160 │ │ # value to the return value from main(). │
│ 161 │ │ if self._is_final: │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/s3transfer/d │
│ ownload.py:640 in _main │
│ │
│ 637 │ │ :param data: The data to write │
│ 638 │ │ :param offset: The offset to write the data to. │
│ 639 │ │ """ │
│ ❱ 640 │ │ fileobj.seek(offset) │
│ 641 │ │ fileobj.write(data) │
│ 642 │
│ 643 │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/s3transfer/u │
│ tils.py:393 in seek │
│ │
│ 390 │ │ self._fileobj.write(data) │
│ 391 │ │
│ 392 │ def seek(self, where, whence=0): │
│ ❱ 393 │ │ self._open_if_needed() │
│ 394 │ │ self._fileobj.seek(where, whence) │
│ 395 │ │
│ 396 │ def tell(self): │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/s3transfer/u │
│ tils.py:376 in _open_if_needed │
│ │
│ 373 │ │
│ 374 │ def _open_if_needed(self): │
│ 375 │ │ if self._fileobj is None: │
│ ❱ 376 │ │ │ self._fileobj = self._open_function(self._filename, self._mode) │
│ 377 │ │ │ if self._start_byte != 0: │
│ 378 │ │ │ │ self._fileobj.seek(self._start_byte) │
│ 379 │
│ │
│ /opt/homebrew/Caskroom/mambaforge/base/envs/capture-v2/lib/python3.12/site-packages/s3transfer/u │
│ tils.py:287 in open │
│ │
│ 284 │ │ ) │
│ 285 │ │
│ 286 │ def open(self, filename, mode): │
│ ❱ 287 │ │ return open(filename, mode) │
│ 288 │ │
│ 289 │ def remove_file(self, filename): │
│ 290 │ │ """Remove a file, noop if file does not exist."""
System information
A description of your system. Please provide:
- SageMaker Python SDK version: 2.239.0
- Framework name (e.g. PyTorch) or algorithm (e.g. KMeans): n/a
- Framework version: n/a
- Python version: 3.12
- CPU or GPU: CPU
- Custom Docker image (Y/N): Y
Additional context
Add any other context about the problem here.