[data] support for specifying a dataset in cloud storage (#7567)

* add support for loading datasets from s3/gcs

* add comments to readme

* run linter and address comments

* add option to pass in kwargs to ray init (i.e. runtime env)

* address comment

* revert mixed up changes
This commit is contained in:
Eric Tang
2025-04-09 20:31:35 -07:00
committed by GitHub
parent bb8d79bae2
commit a8caf09c7f
5 changed files with 63 additions and 6 deletions

View File

@@ -141,6 +141,8 @@ def get_dataset_list(dataset_names: Optional[list[str]], dataset_dir: str) -> li
dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"])
elif "script_url" in dataset_info[name]:
dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"])
elif "cloud_file_name" in dataset_info[name]:
dataset_attr = DatasetAttr("cloud_file", dataset_name=dataset_info[name]["cloud_file_name"])
else:
dataset_attr = DatasetAttr("file", dataset_name=dataset_info[name]["file_name"])