🤗 Transformers provides APIs to quickly download and use pretrained models on a given text, fine-tune them on your own datasets, and then share them with the community on the model hub. At the same time, every Python module it defines is fully standalone, which makes it easy to modify and to run quick research experiments.
https://github.com/huggingface/transformers
In short, this library lets us fine-tune a model quickly and easily; most of the plumbing is already wrapped up and ready to use.
Preparation
Step: download the base model. Going through hf-mirror is faster; if your rented environment has working proxy access, you can also pull directly from Hugging Face.
```bash
$ cd /tmp
$ git lfs install
$ git clone https://hf-mirror.com/Qwen/Qwen2.5-7B-Instruct
$ cd Qwen2.5-7B-Instruct
```
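If cloning over git LFS is slow, another option (a minimal sketch, assuming huggingface_hub is installed and that hf-mirror serves this repo) is to download through the Python API and point it at the mirror via the HF_ENDPOINT environment variable:

```python
# Alternative download path: huggingface_hub + hf-mirror.
# HF_ENDPOINT must be set before huggingface_hub is imported.
import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

from huggingface_hub import snapshot_download

# Downloads all model files into /tmp/Qwen2.5-7B-Instruct
snapshot_download(
    repo_id="Qwen/Qwen2.5-7B-Instruct",
    local_dir="/tmp/Qwen2.5-7B-Instruct",
)
```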
Fine-tuning code
Dataset processing: data_preprocess.py
This part has to be adapted to whatever format your data is actually in. The key is simply to read your existing data and convert it into the dataset format required for training.
```python
import json

from torch.utils.data import Dataset


class MyDataset(Dataset):
    def __init__(self, data, tokenizer, args):
        super(MyDataset, self).__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.prompt_column = args.prompt_column
        self.response_column = args.response_column
        self.max_source_length = args.max_source_length
        self.max_target_length = args.max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        item = self.data[i]
        messages = item["messages"]
        system_message = messages[0]
        user_message = messages[1]
        assistant_message = messages[2]

        prompt = build_prompt(system_message, user_message)
        response = build_response(assistant_message)

        # truncation=True so that max_length is actually enforced
        context = self.tokenizer(
            prompt,
            max_length=self.max_source_length,
            truncation=True,
            add_special_tokens=False
        )
        response_encoding = self.tokenizer(
            response,
            max_length=self.max_target_length,
            truncation=True,
            add_special_tokens=False
        )

        input_ids = context["input_ids"] + response_encoding["input_ids"]
        attention_mask = context["attention_mask"] + response_encoding["attention_mask"]
        # Mask the prompt with -100 so the loss is only computed on the response tokens
        labels = [-100] * len(context["input_ids"]) + response_encoding["input_ids"]

        assert len(input_ids) == len(labels), f"length mismatch: {len(input_ids)} vs {len(labels)}"

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }


def build_prompt(system_message, user_message):
    """Render the system and user messages into the Qwen chat format, appending any tools."""
    system_content = system_message["content"]
    tools = system_message.get("tools", [])
    user_content = user_message["content"]

    prompt = f"<|im_start|>system\n{system_content}"
    if tools:
        tool_str = json.dumps(tools, ensure_ascii=False)
        prompt += f"\n\nTools: {tool_str}"
    prompt += f"\n<|im_end|>\n<|im_start|>user\n{user_content}\n<|im_end|>\n<|im_start|>assistant\n"
    return prompt


def build_response(assistant_message):
    """The assistant answer followed by the end-of-turn marker."""
    assistant_content = assistant_message["content"]
    response = f"{assistant_content}\n<|im_end|>"
    return response
```
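As a tiny sanity check of the two helpers above, here is a made-up record in the shape `__getitem__` expects: a `messages` list with a system, a user, and an assistant entry, in that order. Printing it reproduces exactly the prompt/response format shown in the next section.

```python
from data_preprocess import build_prompt, build_response

# One raw training record as MyDataset expects it (content is illustrative only)
sample = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "The capital of France is Paris."},
    ]
}

print(build_prompt(sample["messages"][0], sample["messages"][1]))
print(build_response(sample["messages"][2]))
```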
If reading the code makes your head spin, just look at the data example. First, this is roughly what the prompt text looks like: we need to convert the original dataset into this chat-model training format, so the original format does not matter — anything works as long as you can end up with this.
```
<|im_start|>system
You are a helpful assistant.
<|im_end|>
<|im_start|>user
What is the capital of France?
<|im_end|>
<|im_start|>assistant
The capital of France is Paris.
<|im_end|>
```
After tokenization, each sample looks roughly like this (the token ids below are schematic): input_ids is the prompt followed by the response, and every prompt position in labels is set to -100 so that the loss is only computed on the response tokens.

```python
{
    'input_ids':      tensor([151644,  8948,   198,  ..., 151645]),  # prompt tokens + response tokens
    'attention_mask': tensor([     1,     1,     1,  ...,      1]),
    'labels':         tensor([  -100,  -100,  -100,  ..., 151645])   # prompt positions masked with -100
}
```
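To verify the masking, you can decode a sample back (a sketch, assuming a MyDataset instance named train_dataset and the matching tokenizer are already in scope): input_ids should decode to the full prompt plus response, while labels with the -100 positions dropped should decode to only the response.

```python
# Sanity check: the loss should only be computed on the response tokens
sample = train_dataset[0]

full_text = tokenizer.decode(sample["input_ids"])
supervised_text = tokenizer.decode([t for t in sample["labels"] if t != -100])

print(full_text)        # system + user + assistant text
print(supervised_text)  # assistant response + <|im_end|> only
```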
Fine-tuning: finetune.py
The script below is just a reference. There are plenty of similar examples online, so I won't list them one by one; you can simply ask GPT to write one, since transformers already wraps almost everything. What really matters is how you set the arguments.
```python
import json

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, TaskType, get_peft_model

from arguments import ModelArguments, DataTrainingArguments, PeftArguments
from data_preprocess import MyDataset


def main():
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, PeftArguments, TrainingArguments))
    model_args, data_args, peft_args, training_args = parser.parse_args_into_dataclasses()

    model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, torch_dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)

    # LoRA on the attention projections only
    lora_config = LoraConfig(
        inference_mode=False,
        task_type=TaskType.CAUSAL_LM,
        target_modules=["q_proj", "k_proj", "v_proj"],
        r=peft_args.lora_rank,
        lora_alpha=peft_args.lora_alpha,
        lora_dropout=peft_args.lora_dropout
    )
    model = get_peft_model(model, lora_config).to("cuda")
    model.print_trainable_parameters()

    # Pads input_ids/attention_mask and pads labels with -100 by default
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        padding=True
    )

    train_dataset = None
    eval_dataset = None  # load a validation set here if you pass --do_eval
    if training_args.do_train:
        with open(data_args.train_file, "r", encoding="utf-8") as f:
            train_data = [json.loads(line) for line in f]
        train_dataset = MyDataset(train_data, tokenizer, data_args)

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=data_collator,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
    )

    if training_args.do_train:
        model.gradient_checkpointing_enable()
        model.enable_input_require_grads()
        trainer.train()


if __name__ == "__main__":
    main()
```
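finetune.py imports ModelArguments, DataTrainingArguments, and PeftArguments from a local arguments.py that isn't shown here. A minimal sketch of what it could look like, with field names inferred from the code above and the flags in train.sh below, is:

```python
# arguments.py -- a minimal sketch; field names are inferred, adjust as needed
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ModelArguments:
    model_name_or_path: str = field(metadata={"help": "Path to the base model"})


@dataclass
class DataTrainingArguments:
    train_file: Optional[str] = field(default=None, metadata={"help": "Training data (jsonl)"})
    prompt_column: str = field(default="prompt", metadata={"help": "Unused here, kept for compatibility"})
    response_column: str = field(default="response", metadata={"help": "Unused here, kept for compatibility"})
    max_source_length: int = field(default=2048, metadata={"help": "Max prompt tokens"})
    max_target_length: int = field(default=1024, metadata={"help": "Max response tokens"})


@dataclass
class PeftArguments:
    lora_rank: int = field(default=8)
    lora_alpha: int = field(default=32)
    lora_dropout: float = field(default=0.1)
```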
Launch script: train.sh

```bash
#!/usr/bin/env bash
set -ex

LR=2e-4
DATESTR=`date +%Y%m%d-%H%M%S`
RUN_NAME=ft_linkinstar_qwen2
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}
mkdir -p $OUTPUT_DIR

MODEL_PATH="/tmp/Qwen2.5-7B-Instruct"

CUDA_VISIBLE_DEVICES=0 python finetune.py \
    --do_train \
    --train_file ../data/train.jsonl \
    --model_name_or_path "${MODEL_PATH}" \
    --output_dir $OUTPUT_DIR \
    --max_source_length 2048 \
    --max_target_length 1024 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --evaluation_strategy steps \
    --eval_steps 300 \
    --num_train_epochs 3 \
    --logging_steps 30 \
    --logging_dir $OUTPUT_DIR/logs \
    --save_steps 200 \
    --learning_rate $LR \
    --lora_rank 8 \
    --lora_alpha 32 \
    --lora_dropout 0.1 2>&1 | tee ${OUTPUT_DIR}/train.log
```
A few notes on these arguments: if you have rented more (or larger) GPUs, you can raise per_device_train_batch_size and per_device_eval_batch_size to 2, 4, and so on. LR is the learning rate. response_column and prompt_column are meant for selecting dataset fields; you can ignore them here because they are not actually used — we convert the data into the required format ourselves. gradient_accumulation_steps can be reduced if you have enough GPU memory (48 GB or more).
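As a quick sanity check on those knobs, the effective batch size is the per-device batch size times the gradient accumulation steps times the number of GPUs, so the train.sh settings above train with an effective batch of 4:

```python
# Effective batch size implied by the train.sh settings above (single GPU)
per_device_train_batch_size = 1
gradient_accumulation_steps = 4
num_gpus = 1

effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_gpus
print(effective_batch_size)  # -> 4
```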
Usage

```python
import argparse

import torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM


def load_model(model_path, checkpoint_path):
    # Load the base model in bf16, then attach the LoRA checkpoint on top of it
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)
    model = PeftModel.from_pretrained(model, model_id=checkpoint_path).to("cuda").eval()
    return tokenizer, model


parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default=None, required=True, help="main model weights")
parser.add_argument("--ckpt", type=str, default=None, required=True, help="The checkpoint path")
args = parser.parse_args()

tokenizer, model = load_model(args.model, args.ckpt)


def get_completion(prompt):
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=1024)
    # Decode only the newly generated tokens, skipping the prompt
    response = tokenizer.decode(outputs[:, inputs['input_ids'].shape[1]:][0], skip_special_tokens=True)
    return response
```
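With that in place, querying the fine-tuned model is just a matter of building the prompt the same way as during training. A small sketch reusing build_prompt from data_preprocess.py (the message content is illustrative):

```python
from data_preprocess import build_prompt

system_message = {"role": "system", "content": "You are a helpful assistant."}
user_message = {"role": "user", "content": "What is the capital of France?"}

prompt = build_prompt(system_message, user_message)
print(get_completion(prompt))
```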
Summary
After this round of experimenting and practicing, my takeaways are:
- For a targeted, narrow scenario, a small model is good enough.
- Training the model is cheap; testing and evaluating it is the expensive part.
- The dataset matters more than anything else.
- Fine-tuning with transformers is already very easy to use.