fix modelscope data hub
Former-commit-id: 5b63e8c22538a4788e4b6c8df50e6e6be93ceeac
This commit is contained in:
@@ -2,17 +2,19 @@ import os
|
||||
import json
|
||||
from typing import List, Literal, Optional
|
||||
from dataclasses import dataclass, field
|
||||
from llmtuner.extras.logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
DATA_CONFIG = "dataset_info.json"
|
||||
|
||||
|
||||
def use_modelscope() -> bool:
|
||||
return bool(int(os.environ.get("USE_MODELSCOPE_HUB", "0")))
|
||||
|
||||
|
||||
@dataclass
|
||||
class DatasetAttr:
|
||||
|
||||
load_from: str
|
||||
load_from: Literal["hf_hub", "ms_hub", "script", "file"]
|
||||
dataset_name: Optional[str] = None
|
||||
dataset_sha1: Optional[str] = None
|
||||
system_prompt: Optional[str] = None
|
||||
@@ -155,19 +157,25 @@ class DataArguments:
|
||||
if name not in dataset_info:
|
||||
raise ValueError("Undefined dataset {} in {}.".format(name, DATA_CONFIG))
|
||||
|
||||
if "hf_hub_url" in dataset_info[name] or 'ms_hub_url' in dataset_info[name]:
|
||||
url_key_name = "hf_hub_url"
|
||||
if int(os.environ.get('USE_MODELSCOPE_HUB', '0')):
|
||||
if 'ms_hub_url' in dataset_info[name]:
|
||||
url_key_name = 'ms_hub_url'
|
||||
else:
|
||||
logger.warning('You are using ModelScope Hub, but the specified dataset '
|
||||
'has no `ms_hub_url` key, so `hf_hub_url` will be used instead.')
|
||||
has_hf_url = "hf_hub_url" in dataset_info[name]
|
||||
has_ms_url = "ms_hub_url" in dataset_info[name]
|
||||
|
||||
dataset_attr = DatasetAttr(url_key_name[:url_key_name.index('_url')],
|
||||
dataset_name=dataset_info[name][url_key_name])
|
||||
if has_hf_url or has_ms_url:
|
||||
if (use_modelscope() and has_ms_url) or (not has_hf_url):
|
||||
dataset_attr = DatasetAttr(
|
||||
"ms_hub",
|
||||
dataset_name=dataset_info[name]["ms_hub_url"]
|
||||
)
|
||||
else:
|
||||
dataset_attr = DatasetAttr(
|
||||
"hf_hub",
|
||||
dataset_name=dataset_info[name]["hf_hub_url"]
|
||||
)
|
||||
elif "script_url" in dataset_info[name]:
|
||||
dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"])
|
||||
dataset_attr = DatasetAttr(
|
||||
"script",
|
||||
dataset_name=dataset_info[name]["script_url"]
|
||||
)
|
||||
else:
|
||||
dataset_attr = DatasetAttr(
|
||||
"file",
|
||||
|
||||
Reference in New Issue
Block a user