Former-commit-id: 819cc1353599e5fa45658bc56dd0dbe4b258b197
@@ -1,40 +1,50 @@
 import os
 import hashlib
-from typing import List
+from typing import TYPE_CHECKING, List, Optional

-from datasets import Dataset, concatenate_datasets, load_dataset
+from datasets import concatenate_datasets, interleave_datasets, load_dataset

 from llmtuner.extras.logging import get_logger
-from llmtuner.hparams import ModelArguments, DataArguments
+
+if TYPE_CHECKING:
+    from datasets import Dataset
+    from llmtuner.hparams import ModelArguments, DataArguments


 logger = get_logger(__name__)


+EXT2TYPE = {
+    "csv": "csv",
+    "json": "json",
+    "jsonl": "json",
+    "txt": "text"
+}
+
+
+def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None:
+    if file_sha1 is None:
+        logger.warning("Checksum failed: missing SHA-1 hash value in dataset_info.json.")
+        return
+
+    if len(data_files) != 1:
+        logger.warning("Checksum failed: too many files.")
+        return
+
+    with open(data_files[0], "rb") as f:
+        sha1 = hashlib.sha1(f.read()).hexdigest()
+        if sha1 != file_sha1:
+            logger.warning("Checksum failed: mismatched SHA-1 hash value at {}.".format(data_files[0]))
+
+
 def get_dataset(
-    model_args: ModelArguments,
-    data_args: DataArguments
-) -> Dataset:
-
-    def checksum(file_path, hash):
-        with open(file_path, "rb") as datafile:
-            binary_data = datafile.read()
-        sha1 = hashlib.sha1(binary_data).hexdigest()
-        if sha1 != hash:
-            logger.warning("Checksum failed for {}. It may vary depending on the platform.".format(file_path))
-
-    ext2type = {
-        "csv": "csv",
-        "json": "json",
-        "jsonl": "json",
-        "txt": "text"
-    }
-
+    model_args: "ModelArguments",
+    data_args: "DataArguments"
+) -> "Dataset":
     max_samples = data_args.max_samples
-    all_datasets: List[Dataset] = [] # support multiple datasets
+    all_datasets: List["Dataset"] = [] # support multiple datasets

     for dataset_attr in data_args.dataset_list:

         logger.info("Loading dataset {}...".format(dataset_attr))

         if dataset_attr.load_from == "hf_hub":
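Note: the SHA-1 check moves out of get_dataset and becomes a module-level checksum helper that only warns, never raises, when the hash is missing, when several files are given, or when the digest differs. A minimal standalone sketch of the same idea, with a hypothetical helper name and file path:

    import hashlib
    from typing import List, Optional

    def verify_sha1(data_files: List[str], expected_sha1: Optional[str] = None) -> bool:
        # Mirror the new behaviour: warn-and-continue rather than fail hard.
        if expected_sha1 is None or len(data_files) != 1:
            print("skipping checksum: missing hash or multiple files")
            return False
        with open(data_files[0], "rb") as f:
            actual = hashlib.sha1(f.read()).hexdigest()
        if actual != expected_sha1:
            print("checksum mismatch for {}".format(data_files[0]))
            return False
        return True

    # verify_sha1(["data/alpaca_data.json"], "0123abcd...")  # hypothetical usage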
@@ -47,60 +57,56 @@ def get_dataset(
             data_path = None
             data_files: List[str] = []

-            if os.path.isdir(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)):
+            if os.path.isdir(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # directory
                 for file_name in os.listdir(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)):
                     data_files.append(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name, file_name))

                     if data_path is None:
-                        data_path = ext2type.get(data_files[0].split(".")[-1], None)
+                        data_path = EXT2TYPE.get(file_name.split(".")[-1], None)
                     else:
-                        assert data_path == ext2type.get(data_files[-1].split(".")[-1], None), "file type does not match."
-            elif os.path.isfile(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)):
+                        assert data_path == EXT2TYPE.get(file_name.split(".")[-1], None), "file type does not match."
+            elif os.path.isfile(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # single file
                 data_files.append(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name))
-                data_path = ext2type.get(data_files[0].split(".")[-1], None)
+                data_path = EXT2TYPE.get(dataset_attr.dataset_name.split(".")[-1], None)
             else:
                 raise ValueError("File not found.")

             assert data_path, "File extension must be txt, csv, json or jsonl."

-            if len(data_files) == 1 and dataset_attr.dataset_sha1 is not None:
-                checksum(data_files[0], dataset_attr.dataset_sha1)
-            else:
-                logger.warning("Checksum failed: missing SHA-1 hash value in dataset_info.json or too many files.")
+            checksum(data_files, dataset_attr.dataset_sha1)
         else:
             raise NotImplementedError

-        raw_datasets = load_dataset(
+        dataset = load_dataset(
             data_path,
             data_files=data_files,
+            split=data_args.split,
             cache_dir=model_args.cache_dir,
+            streaming=data_args.streaming,
             use_auth_token=True if model_args.use_auth_token else None
         )
-        dataset = raw_datasets[data_args.split]

         if max_samples is not None:
             max_samples_temp = min(len(dataset), max_samples)
             dataset = dataset.select(range(max_samples_temp))

-        dummy_data = [None] * len(dataset)
-        prefix_data = [dataset_attr.source_prefix] * len(dataset)
-        for column_name, target_name in [
-            ("prompt_column", "prompt"),
-            ("query_column", "query"),
-            ("response_column", "response"),
-            ("history_column", "history")
-        ]: # every dataset will have 4 columns same as each other
-            if getattr(dataset_attr, column_name) != target_name:
-                if getattr(dataset_attr, column_name):
-                    dataset = dataset.rename_column(getattr(dataset_attr, column_name), target_name)
-                else: # None or empty string
-                    dataset = dataset.add_column(target_name, dummy_data)
-        dataset = dataset.add_column("prefix", prefix_data)
+        for column_name in ["prompt", "query", "response", "history"]: # align datasets
+            if getattr(dataset_attr, column_name) and getattr(dataset_attr, column_name) != column_name:
+                dataset = dataset.rename_column(getattr(dataset_attr, column_name), column_name)
+
+        if dataset_attr.source_prefix: # add prefix
+            dataset = dataset.map(lambda _: {"prefix": dataset_attr.source_prefix})

         all_datasets.append(dataset)

     if len(data_args.dataset_list) == 1:
-        all_datasets = all_datasets[0]
+        return all_datasets[0]
+    elif data_args.mix_strategy == "concat":
+        if data_args.streaming:
+            logger.warning("The samples between different datasets will not be mixed in streaming mode.")
+        return concatenate_datasets(all_datasets)
+    elif data_args.mix_strategy.startswith("interleave"):
+        if not data_args.streaming:
+            logger.warning("We recommend using `mix_strategy=concat` in non-streaming mode.")
+        stopping_strategy = "first_exhausted" if data_args.mix_strategy.endswith("under") else "all_exhausted"
+        return interleave_datasets(all_datasets, stopping_strategy=stopping_strategy)
     else:
-        all_datasets = concatenate_datasets(all_datasets)
-
-    return all_datasets
+        raise ValueError("Unknown mixing strategy.")
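Note: the tail of get_dataset now returns through one of three mixing paths: a single dataset as-is, mix_strategy="concat", or the "interleave_over"/"interleave_under" strategies mapped onto the stopping strategies of datasets.interleave_datasets. A rough sketch of those calls on toy data (not the project's loader):

    from datasets import Dataset, concatenate_datasets, interleave_datasets

    ds_a = Dataset.from_dict({"prompt": ["a1", "a2", "a3"], "response": ["x", "y", "z"]})
    ds_b = Dataset.from_dict({"prompt": ["b1"], "response": ["w"]})

    # mix_strategy == "concat": simple concatenation, no sample mixing
    concat = concatenate_datasets([ds_a, ds_b])

    # "interleave_under": stop as soon as the smallest dataset runs out
    under = interleave_datasets([ds_a, ds_b], stopping_strategy="first_exhausted")

    # "interleave_over": oversample the smaller datasets until the largest runs out
    over = interleave_datasets([ds_a, ds_b], stopping_strategy="all_exhausted")

    print(len(concat), len(under), len(over))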
@@ -1,65 +1,63 @@
-from typing import Literal
+from typing import TYPE_CHECKING, Any, Dict, Generator, List, Literal
 from itertools import chain
-from transformers import Seq2SeqTrainingArguments
-from transformers.tokenization_utils import PreTrainedTokenizer
-
-from datasets import Dataset

 from llmtuner.extras.constants import IGNORE_INDEX
 from llmtuner.extras.template import get_template
-from llmtuner.hparams import DataArguments
+
+if TYPE_CHECKING:
+    from datasets import Dataset
+    from transformers import Seq2SeqTrainingArguments
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from llmtuner.hparams import DataArguments


 def preprocess_dataset(
-    dataset: Dataset,
-    tokenizer: PreTrainedTokenizer,
-    data_args: DataArguments,
-    training_args: Seq2SeqTrainingArguments,
+    dataset: "Dataset",
+    tokenizer: "PreTrainedTokenizer",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
     stage: Literal["pt", "sft", "rm", "ppo"]
-) -> Dataset:
-
-    column_names = list(dataset.column_names)
-    prompt_template = get_template(data_args.prompt_template)
+) -> "Dataset":
+    column_names = list(dataset.column_names or [])
+    template = get_template(data_args.template)

-    # support question with a single answer or multiple answers
-    def get_dialog(examples):
+    def construct_example(examples: Dict[str, List[Any]]) -> Generator[Any, None, None]:
         for i in range(len(examples["prompt"])):
-            if examples["prompt"][i] and examples["response"][i]:
-                query, answer = examples["prompt"][i], examples["response"][i]
-                query = query + "\n" + examples["query"][i] if examples["query"][i] else query
-                prefix = examples["prefix"][i] if examples["prefix"][i] else ""
-                dialog = prompt_template.get_dialog(query, answer, examples["history"][i], prefix)
-                yield dialog
+            query, response = examples["prompt"][i], examples["response"][i]
+            query = query + "\n" + examples["query"][i] if "query" in examples and examples["query"][i] else query
+            history = examples["history"][i] if "history" in examples and examples["history"][i] else []
+            prefix = examples["prefix"][i] if "prefix" in examples and examples["prefix"][i] else ""
+            yield query, response, history, prefix

-    def preprocess_pretrain_dataset(examples):
+    def preprocess_pretrain_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
         # build grouped texts with format `<bos> X1 X2 X3 ...` (without <eos>)
-        text_ids = tokenizer(examples["prompt"], add_special_tokens=False)["input_ids"]
-        concatenated_ids = list(chain(*text_ids))
-        total_length = len(concatenated_ids)
-        block_size = data_args.max_source_length - 1
+        tokenized_examples = tokenizer(examples["prompt"], add_special_tokens=False)
+        concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()}
+        total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]])
+        block_size = data_args.max_source_length
         # we drop the small remainder, and if the total_length < block_size, we exclude this batch
         total_length = (total_length // block_size) * block_size
         # split by chunks of max_source_length
-        result = [[tokenizer.bos_token_id] + concatenated_ids[i: i + block_size]
-                  for i in range(0, total_length, block_size)]
-        return {
-            "input_ids": result,
-            "labels": result.copy()
+        result = {
+            k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
+            for k, t in concatenated_examples.items()
         }
+        result["labels"] = result["input_ids"].copy()
+        return result

-    def preprocess_supervised_dataset(examples):
+    def preprocess_supervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
         # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
         # for input with history, we build multiple input-label pairs just like:
         # https://github.com/lm-sys/FastChat/blob/f17c092f64840fa6354ed52789dccb2daa793d0b/fastchat/train/train.py#L112
-        model_inputs = {"input_ids": [], "labels": []}
+        model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
         max_length = data_args.max_source_length + data_args.max_target_length

-        for dialog in get_dialog(examples):
+        for query, response, history, prefix in construct_example(examples):
             input_ids, labels = [], []

-            for i in range(len(dialog) // 2):
-                source_ids = tokenizer.encode(text=dialog[2*i], add_special_tokens=(i == 0))
-                target_ids = tokenizer.encode(text=dialog[2*i+1], add_special_tokens=False)
+            for i, (query_i, resp_i) in enumerate(template.get_dialog(query, response, history, prefix)):
+                source_ids = tokenizer.encode(text=query_i, add_special_tokens=(i == 0))
+                target_ids = tokenizer.encode(text=resp_i, add_special_tokens=False)

                 if len(source_ids) > data_args.max_source_length:
                     source_ids = source_ids[:data_args.max_source_length]
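Note: preprocess_pretrain_dataset now packs every tokenizer output column (input_ids, attention_mask, ...) into aligned fixed-size blocks instead of only input_ids, and labels become a copy of input_ids. A self-contained sketch of that grouping pattern, with plain lists standing in for tokenizer output:

    from itertools import chain
    from typing import Any, Dict, List

    def group_into_blocks(tokenized: Dict[str, List[List[int]]], block_size: int) -> Dict[str, Any]:
        # Concatenate each column, drop the small remainder, then cut aligned blocks.
        concatenated = {k: list(chain(*v)) for k, v in tokenized.items()}
        total_length = len(next(iter(concatenated.values())))
        total_length = (total_length // block_size) * block_size
        result = {
            k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated.items()
        }
        result["labels"] = result["input_ids"].copy()  # causal LM: labels mirror inputs
        return result

    # Toy example: two "documents" of token ids, block_size 4
    fake = {"input_ids": [[1, 2, 3], [4, 5, 6, 7, 8]], "attention_mask": [[1, 1, 1], [1, 1, 1, 1, 1]]}
    print(group_into_blocks(fake, block_size=4))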
@@ -73,19 +71,20 @@ def preprocess_dataset(
                 labels += [IGNORE_INDEX] * len(source_ids) + target_ids + [tokenizer.eos_token_id]

             model_inputs["input_ids"].append(input_ids)
+            model_inputs["attention_mask"].append([1] * len(input_ids))
             model_inputs["labels"].append(labels)

         return model_inputs

-    def preprocess_unsupervised_dataset(examples):
+    def preprocess_unsupervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
         # build inputs with format `<bos> X` and labels with format `<bos> Y`
-        model_inputs = {"input_ids": [], "labels": []}
+        model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}

-        for dialog in get_dialog(examples):
-            prompt, answer = "".join(dialog[:-1]), dialog[-1]
+        for query, response, history, prefix in construct_example(examples):
+            prompt = template.get_prompt(query, history, prefix, tokenizer.eos_token)

             source_ids = tokenizer.encode(text=prompt, add_special_tokens=True)
-            target_ids = tokenizer.encode(text=answer, add_special_tokens=True)
+            target_ids = tokenizer.encode(text=response, add_special_tokens=True)

             if len(source_ids) > data_args.max_source_length:
                 source_ids = source_ids[:data_args.max_source_length]
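Note: the supervised and unsupervised branches now also emit an attention_mask next to input_ids. For the supervised case the label layout stays: IGNORE_INDEX over the prompt tokens, the response tokens kept, then eos. A toy illustration (IGNORE_INDEX is assumed to be -100, and the token ids are made up):

    IGNORE_INDEX = -100  # assumed value of llmtuner.extras.constants.IGNORE_INDEX

    def build_supervised_pair(source_ids, target_ids, eos_token_id):
        # inputs: <prompt> <response> <eos>; labels: ignore the prompt, learn the response + eos
        input_ids = source_ids + target_ids + [eos_token_id]
        labels = [IGNORE_INDEX] * len(source_ids) + target_ids + [eos_token_id]
        attention_mask = [1] * len(input_ids)
        return input_ids, attention_mask, labels

    print(build_supervised_pair([10, 11, 12], [20, 21], eos_token_id=2))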
@@ -93,6 +92,7 @@ def preprocess_dataset(
                 target_ids = target_ids[:data_args.max_target_length]

             model_inputs["input_ids"].append(source_ids)
+            model_inputs["attention_mask"].append([1] * len(source_ids))
             model_inputs["labels"].append(target_ids)

         return model_inputs
@@ -100,12 +100,12 @@ def preprocess_dataset(
     def preprocess_pairwise_dataset(examples):
         # build input pairs with format `<bos> X Y1 <eos>` and `<bos> X Y2 <eos>`
         model_inputs = {"accept_ids": [], "reject_ids": []}
-        for dialog in get_dialog(examples):
-            prompt, answer = "".join(dialog[:-1]), dialog[-1]
+        for query, response, history, prefix in construct_example(examples):
+            prompt = template.get_prompt(query, history, prefix, tokenizer.eos_token)

             source_ids = tokenizer.encode(text=prompt, add_special_tokens=True)
-            accept_ids = tokenizer.encode(text=answer[0], add_special_tokens=False)
-            reject_ids = tokenizer.encode(text=answer[1], add_special_tokens=False)
+            accept_ids = tokenizer.encode(text=response[0], add_special_tokens=False)
+            reject_ids = tokenizer.encode(text=response[1], add_special_tokens=False)

             if len(source_ids) > data_args.max_source_length:
                 source_ids = source_ids[:data_args.max_source_length]
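Note: for the reward-modelling stage the response column is expected to hold two candidates, with the preferred answer first; both are encoded against the same truncated prompt. A small sketch of that pairing with made-up token ids (how the pair is later consumed is outside this hunk):

    def build_pairwise_example(source_ids, accept_ids, reject_ids, max_source_length=512):
        # One shared prompt encoding, two continuations: accepted first, rejected second.
        source_ids = source_ids[:max_source_length]
        return {"accept_ids": source_ids + accept_ids, "reject_ids": source_ids + reject_ids}

    print(build_pairwise_example([1, 2, 3], accept_ids=[10, 11], reject_ids=[20]))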
@@ -141,34 +141,44 @@ def preprocess_dataset(
         print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False)))

     if stage == "pt":
+        dataset = dataset.filter(lambda example: example["prompt"])
         preprocess_function = preprocess_pretrain_dataset
-    elif stage == "sft":
-        if not training_args.predict_with_generate:
-            preprocess_function = preprocess_supervised_dataset
-        else:
-            preprocess_function = preprocess_unsupervised_dataset
+    elif stage == "sft" and not training_args.predict_with_generate:
+        dataset = dataset.filter(lambda example: example["prompt"] and example["response"])
+        preprocess_function = preprocess_supervised_dataset
     elif stage == "rm":
+        dataset = dataset.filter(lambda example: example["prompt"] and len(example["response"]) > 1)
         preprocess_function = preprocess_pairwise_dataset
-    elif stage == "ppo":
+    else:
+        dataset = dataset.filter(lambda example: example["prompt"])
         preprocess_function = preprocess_unsupervised_dataset

     with training_args.main_process_first(desc="dataset map pre-processing"):
+        kwargs = {}
+        if not data_args.streaming:
+            kwargs = dict(
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on dataset"
+            )
+
         dataset = dataset.map(
             preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
+            batched=True,
             remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on dataset"
+            **kwargs
         )
+
+        if data_args.streaming:
+            dataset = dataset.shuffle(buffer_size=data_args.buffer_size)

         if stage == "pt":
-            print_unsupervised_dataset_example(dataset[0])
+            print_unsupervised_dataset_example(next(iter(dataset)))
         elif stage == "sft":
-            print_supervised_dataset_example(dataset[0])
+            print_supervised_dataset_example(next(iter(dataset)))
         elif stage == "rm":
-            print_pairwise_dataset_example(dataset[0])
+            print_pairwise_dataset_example(next(iter(dataset)))
         elif stage == "ppo":
-            print_unsupervised_dataset_example(dataset[0])
+            print_unsupervised_dataset_example(next(iter(dataset)))

         return dataset
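Note: the map call now assembles its keyword arguments conditionally because num_proc, load_from_cache_file and desc only apply to map-style (non-streaming) datasets, and a streamed dataset is shuffled through a buffer instead of being indexed. A sketch of that pattern against the datasets API, with a toy preprocessing function standing in for preprocess_function:

    from datasets import Dataset

    def add_length(batch):
        # toy batched preprocessing step
        batch["length"] = [len(p) for p in batch["prompt"]]
        return batch

    dataset = Dataset.from_dict({"prompt": ["hello", "hi there"]})
    streaming = False  # would come from data_args.streaming

    kwargs = {}
    if not streaming:
        kwargs = dict(num_proc=1, load_from_cache_file=False, desc="Running tokenizer on dataset")

    dataset = dataset.map(add_length, batched=True, **kwargs)

    if streaming:
        dataset = dataset.shuffle(buffer_size=1000)  # buffer-based shuffle for iterable datasets

    print(dataset[0])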
@@ -1,13 +1,12 @@
-from typing import Dict
-from datasets import Dataset
+from typing import TYPE_CHECKING, Dict
+
+if TYPE_CHECKING:
+    from datasets import Dataset


-def split_dataset(
-    dataset: Dataset, dev_ratio: float, do_train: bool
-) -> Dict[str, Dataset]:
-    # Split the dataset
+def split_dataset(dataset: "Dataset", dev_ratio: float, do_train: bool) -> Dict[str, "Dataset"]:
     if do_train:
-        if dev_ratio > 1e-6:
+        if dev_ratio > 1e-6: # Split the dataset
             dataset = dataset.train_test_split(test_size=dev_ratio)
             return {"train_dataset": dataset["train"], "eval_dataset": dataset["test"]}
         else:
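Note: split_dataset keeps its behaviour but only splits during training and only when dev_ratio is meaningfully above zero. A quick sketch of what the train_test_split branch produces, on toy data with a hypothetical ratio:

    from datasets import Dataset

    dataset = Dataset.from_dict({"prompt": [str(i) for i in range(10)]})
    dev_ratio = 0.2  # hypothetical

    splits = dataset.train_test_split(test_size=dev_ratio)
    trainer_kwargs = {"train_dataset": splits["train"], "eval_dataset": splits["test"]}
    print(len(trainer_kwargs["train_dataset"]), len(trainer_kwargs["eval_dataset"]))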