# 预训练数据集处理过程 (Pre-training dataset processing)
class PretrainDataset(Dataset):
    """Next-token-prediction dataset backed by a JSONL file.

    Every non-blank line of the file is parsed as one JSON object. The
    object's ``'text'`` field is the training document, e.g.
    ``{'text': '<|im_start|>abc。def。<|im_end|> <|im_start|>esdw。rwqe。<|im_end|>'}``.
    """

    def __init__(self, data_path, tokenizer, max_length=512):
        """Load all samples from ``data_path`` into memory.

        Args:
            data_path: Path to a UTF-8 encoded JSONL file.
            tokenizer: Callable tokenizer exposing ``pad_token_id``. It is
                called with ``max_length``, ``padding``, ``truncation`` and
                ``return_tensors`` keyword arguments and must return an
                object with an ``input_ids`` tensor.
            max_length: Length every sample is padded / truncated to.
        """
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = self.load_data(data_path)

    def load_data(self, path):
        """Read a JSONL file and return its records as a list.

        Blank and whitespace-only lines are skipped so that a trailing
        newline or an accidental empty line does not abort loading.

        Args:
            path: Path to a UTF-8 encoded JSONL file.

        Returns:
            A list with one decoded JSON value per non-blank line.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        samples = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                samples.append(json.loads(line))
        return samples

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.samples)

    def __getitem__(self, index):
        """Tokenize one sample and build shifted input/target tensors.

        Args:
            index: Position of the sample in ``self.samples``.

        Returns:
            A tuple ``(X, Y, loss_mask)`` of ``torch.long`` tensors, each one
            element shorter than the tokenized sequence:

            * ``X`` - token ids without the last position (model input).
            * ``Y`` - token ids without the first position (targets).
            * ``loss_mask`` - 1 where the target is not the pad token,
              0 where it is padding; aligned with ``Y``.
        """
        sample = self.samples[index]
        encoding = self.tokenizer(
            str(sample['text']),
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse a length-1 sequence into a 0-dim tensor that cannot be
        # sliced below.
        input_ids = encoding.input_ids.squeeze(0).to(torch.long)

        # Tensor slicing + clone() copies the data without going through
        # torch.tensor(), which warns when handed an existing tensor.
        X = input_ids[:-1].clone()
        Y = input_ids[1:].clone()
        # Only non-padding target positions contribute to the loss.
        loss_mask = (input_ids[1:] != self.tokenizer.pad_token_id).to(torch.long)
        return X, Y, loss_mask
# 微调数据集处理过程 (Fine-tuning (SFT) dataset processing)
class SFTDataset(Dataset):
    """Supervised fine-tuning dataset over chat conversations in a JSONL file.

    Every non-blank line of the file is parsed as one JSON object with a
    ``'conversations'`` list, e.g.
    ``{'conversations': [{'role': 'user', 'content': 'Requirements。'},
    {'role': 'assistant', 'content': 'feedback。'}]}``.
    Only tokens belonging to assistant replies are marked in the loss mask.
    """

    def __init__(self, jsonl_path, tokenizer, max_length=1024):
        """Load all conversations and cache the turn-delimiter token ids.

        Args:
            jsonl_path: Path to a UTF-8 encoded JSONL file.
            tokenizer: Callable tokenizer exposing ``pad_token_id`` and
                ``apply_chat_template``; calling it must return an object
                with an ``input_ids`` list.
            max_length: Length every sample is padded / truncated to.
        """
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = self.load_data(jsonl_path)
        # Token ids that open an assistant turn (e.g. [1, 1078, 538, 501]).
        self.bos_id = tokenizer('<|im_start|>assistant', add_special_tokens=False).input_ids
        # Token ids that close a turn (e.g. [2]).
        self.eos_id = tokenizer('<|im_end|>', add_special_tokens=False).input_ids

    def __len__(self):
        """Return the number of loaded conversations."""
        return len(self.samples)

    def load_data(self, path):
        """Read a JSONL file and return its records as a list.

        Blank and whitespace-only lines are skipped so that a trailing
        newline or an accidental empty line does not abort loading.

        Args:
            path: Path to a UTF-8 encoded JSONL file.

        Returns:
            A list with one decoded JSON value per non-blank line.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        samples = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                samples.append(json.loads(line))
        return samples

    def _create_chat_prompt(self, conversations):
        """Render a conversation as chat-template text (not tokenized).

        Args:
            conversations: List of turn dicts, each with a ``'content'`` key.

        Returns:
            The string produced by ``tokenizer.apply_chat_template`` with
            ``tokenize=False`` and ``add_generation_prompt=False``, i.e. the
            full dialogue including the assistant replies and no trailing
            open assistant header.
        """
        # NOTE(review): roles are assigned purely by position (even index ->
        # user, odd index -> assistant); each turn's own 'role' field is
        # ignored. Confirm the data never starts with a system turn.
        messages = [
            {"role": 'user' if i % 2 == 0 else 'assistant', "content": turn['content']}
            for i, turn in enumerate(conversations)
        ]
        return self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        )

    def _generate_loss_mask(self, input_ids):
        """Mark the positions that belong to assistant replies.

        Scans ``input_ids`` for every occurrence of ``self.bos_id`` (the
        assistant-turn opener) and, for each one, finds the first following
        ``self.eos_id``. With ``start`` the index right after the opener and
        ``end`` the index where the closer begins, positions
        ``start + 1 .. end + len(eos_id)`` (inclusive) are set to 1.

        Args:
            input_ids: List of token ids.

        Returns:
            A list of 0/1 ints with the same length as ``input_ids``.
        """
        total = len(input_ids)
        bos, eos = self.bos_id, self.eos_id
        bos_len, eos_len = len(bos), len(eos)
        loss_mask = [0] * total

        i = 0
        while i < total:
            if input_ids[i:i + bos_len] != bos:
                i += 1
                continue

            start = i + bos_len
            # Advance to the first turn closer; end == total if there is none
            # (e.g. the reply was cut off by truncation).
            end = start
            while end < total and input_ids[end:end + eos_len] != eos:
                end += 1

            # Skip the token right after the opener and include the closer
            # plus one token after it.
            # NOTE(review): presumably those two tokens are the newlines the
            # chat template puts after 'assistant' and '<|im_end|>' - confirm
            # against the template in use.
            # Clamp to len(input_ids) as well as max_length so that an
            # unterminated reply in a short sequence cannot index past the
            # end of the mask.
            stop = min(end + eos_len + 1, self.max_length, total)
            for j in range(start + 1, stop):
                loss_mask[j] = 1

            # Resume scanning after the closer, or stop if none was found.
            i = end + eos_len if end < total else total
        return loss_mask

    def __getitem__(self, index):
        """Tokenize one conversation and build shifted input/target tensors.

        Args:
            index: Position of the sample in ``self.samples``.

        Returns:
            A tuple ``(X, Y, loss_mask)`` of ``torch.long`` tensors of length
            ``max_length - 1``:

            * ``X`` - token ids without the last position (model input).
            * ``Y`` - token ids without the first position (targets).
            * ``loss_mask`` - assistant-reply mask shifted to align with ``Y``.
        """
        sample = self.samples[index]
        prompt = self._create_chat_prompt(sample['conversations'])

        # Truncate to max_length, then right-pad up to max_length.
        input_ids = self.tokenizer(prompt).input_ids[:self.max_length]
        pad_count = self.max_length - len(input_ids)
        input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_count

        # Padding never matches an assistant span, so it stays masked out.
        loss_mask = self._generate_loss_mask(input_ids)

        X = torch.tensor(input_ids[:-1], dtype=torch.long)
        Y = torch.tensor(input_ids[1:], dtype=torch.long)
        # Drop the first mask entry so the mask lines up with the targets.
        loss_mask = torch.tensor(loss_mask[1:], dtype=torch.long)
        return X, Y, loss_mask