LLM FAQ: Continued Pretraining on Chinese Data

算法无限

1. Why is continued pretraining needed?

We have added some new Chinese tokens to the vocabulary, but these tokens have never been trained, so we need a pretraining pass before instruction fine-tuning. The pretraining objective is the usual one: simply put, predict the next token from the preceding context. Here we directly use the IDEA-CCNL/Wenzhong2.0-GPT2-110M-BertTokenizer-chinese model, together with its bundled tokenizer.
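For intuition, here is a minimal sketch (not the training script used later) of the objective being optimized: a causal LM is given labels equal to its input ids and is trained to predict each next token.

# Minimal sketch of the causal-LM (next-token prediction) objective; the actual
# continued pretraining is run with the script in section 3.
from transformers import AutoTokenizer, AutoModelForCausalLM

name = "IDEA-CCNL/Wenzhong2.0-GPT2-110M-BertTokenizer-chinese"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name)

batch = tokenizer("今天天气不错", return_tensors="pt")
# For causal-LM training the labels are simply the input ids; the model shifts them
# internally and computes the cross-entropy of each next-token prediction.
out = model(input_ids=batch["input_ids"], labels=batch["input_ids"])
print(out.loss)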

2. How is the data for continued pretraining preprocessed?

First, call the tokenizer() function to obtain the model inputs. Note that special tokens such as bos_token_id and eos_token_id may be added at the beginning and end of the text, and exactly which ones differ from one model's tokenizer to another. Here the ids 21134 and 21133 are added before and after the input_ids.
Then the input_ids, attention_mask and token_type_ids of all texts are each concatenated (flattened and joined into one long sequence, not stacked as a 2-D array), and a maximum length block_size is chosen to cut them into the final fixed-length inputs.
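A quick way to verify which special ids a particular tokenizer adds (a small sketch; the values 21134 and 21133 are the ones reported above for this tokenizer, and the full preprocessing script follows below):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("IDEA-CCNL/Wenzhong2.0-GPT2-110M-BertTokenizer-chinese")
ids = tokenizer("今天天气不错")["input_ids"]
print(ids[0], ids[-1])                                     # expected: 21134 21133, as described above
print(tokenizer.convert_ids_to_tokens([ids[0], ids[-1]]))  # the corresponding special tokens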

import os
import logging
import datasets
import transformers

from pprint import pprint
from itertools import chain
from datasets import load_dataset, concatenate_datasets
from transformers.testing_utils import CaptureLogger
from transformers import AutoTokenizer, LlamaTokenizer


tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

logger = logging.getLogger(__name__)

lm_datasets = []
files = ["data/test_corpus.txt"]
data_cache_dir = "./cache_data"
preprocessing_num_workers = 1

# tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-bert-wwm-ext")
# tokenizer = LlamaTokenizer.from_pretrained("ziqingyang/chinese-llama-lora-7b")
tokenizer = AutoTokenizer.from_pretrained("IDEA-CCNL/Wenzhong2.0-GPT2-110M-BertTokenizer-chinese")

def print_dict(adict):
  for k,v in adict.items():
    print(k, v)

def tokenize_function(examples):
    with CaptureLogger(tok_logger) as cl:
        output = tokenizer(examples["text"])
    # clm input could be much much longer than block_size
    if "Token indices sequence length is longer than the" in cl.out:
        tok_logger.warning(
            "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
            " before being passed to the model."
        )
    return output

block_size = 128

# Concatenate all texts and split them into blocks of block_size tokens.
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

for idx, file in enumerate(files):
    data_file = file
    filename = ''.join(file.split(".")[:-1])

    cache_path = os.path.join(data_cache_dir, filename)
    os.makedirs(cache_path, exist_ok=True)
    try:
        processed_dataset = datasets.load_from_disk(cache_path, keep_in_memory=False)
        print(f'training datasets-{filename} has been loaded from disk')
    except Exception:
        cache_dir = os.path.join(data_cache_dir, filename + "_text")
        os.makedirs(cache_dir, exist_ok=True)

        raw_dataset = load_dataset("text", data_files=data_file, cache_dir=cache_dir, keep_in_memory=False)
        print_dict(raw_dataset["train"][0])
        # Tokenize directly; note that only the bos_token needs to be added at the start of each sentence.
        tokenized_dataset = raw_dataset.map(
            tokenize_function,
            batched=True,
            num_proc=preprocessing_num_workers,
            remove_columns="text",
            load_from_cache_file=True,
            keep_in_memory=False,
            cache_file_names={k: os.path.join(cache_dir, f'tokenized.arrow') for k in raw_dataset},
            desc="Running tokenizer on dataset",
        )

        print_dict(tokenized_dataset["train"][0])

        grouped_datasets = tokenized_dataset.map(
            group_texts,
            batched=True,
            num_proc=preprocessing_num_workers,
            load_from_cache_file=True,
            keep_in_memory=False,
            cache_file_names={k: os.path.join(cache_dir, f'grouped.arrow') for k in tokenized_dataset},
            desc=f"Grouping texts in chunks of {block_size}",
        )
        processed_dataset = grouped_datasets

        print_dict(processed_dataset["train"][0])
        processed_dataset.save_to_disk(cache_path)
    if idx == 0:
        lm_datasets = processed_dataset['train']
    else:
        assert lm_datasets.features.type == processed_dataset["train"].features.type
        lm_datasets = concatenate_datasets([lm_datasets, processed_dataset["train"]])

lm_datasets = lm_datasets.train_test_split(test_size=0.1)

print_dict(lm_datasets["train"][0])
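
At this point lm_datasets is a DatasetDict with "train" and "test" splits, and every example is a fixed-length block of block_size token ids whose labels equal the input ids. A quick check:

# Sanity check on the processed dataset.
print(lm_datasets)                                 # DatasetDict with "train" and "test" splits
print(len(lm_datasets["train"][0]["input_ids"]))   # == block_size (128 here)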

3. How do we build the model?

from transformers import BertTokenizer,GPT2LMHeadModel, AutoModelForCausalLM
hf_model_path = 'IDEA-CCNL/Wenzhong2.0-GPT2-110M-BertTokenizer-chinese'
tokenizer = BertTokenizer.from_pretrained(hf_model_path)
model = AutoModelForCausalLM.from_pretrained(hf_model_path)

def generate_word_level(input_text,n_return=5, max_length=128,top_p=0.9):
    inputs = tokenizer(input_text,return_tensors='pt', add_special_tokens=False).to(model.device)
    gen = model.generate(
                            inputs=inputs['input_ids'],
                            max_length=max_length,
                            do_sample=True,
                            top_p=top_p,
                            eos_token_id=21133,
                            pad_token_id=0,
                            num_return_sequences=n_return)

    sentences = tokenizer.batch_decode(gen)
    for idx,sentence in enumerate(sentences):
        print(f'sentence {idx}: {sentence}')
        print('*'*20)
    return gen
# quick sanity check: generate a few continuations from the base model
outputs = generate_word_level('你好,世界!',n_return=5,max_length=128)
print(outputs)

Next, we continue pretraining this model on our own data. A few points to note:

  1. If we use our own custom tokenizer, the model's embedding layer and lm_head must be resized to the new vocabulary size:
model_vocab_size = model.get_output_embeddings().weight.size(0)
model.resize_token_embeddings(len(tokenizer))
  2. We fine-tune with the parameter-efficient method LoRA, so the extra modules to save must be specified: transformer.wte and lm_head. These names can be obtained with find_lora_names.py (see the sketch after this list).
  3. The original chinese-llama-alpaca code had a problem when saving LoRA parameters; it was modified here so that only a single copy of the LoRA weights is saved.
  4. When using test_pretrained_model.py, also remember to reset vocab_size first.
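
For reference, a rough sketch of what this LoRA setup corresponds to in the peft library, mirroring the flags of the training command below (lora_rank 8, lora_alpha 32, trainable c_attn, modules_to_save transformer.wte,lm_head); the actual configuration is built inside run_clm_pt_with_peft.py:

# Sketch only: a peft LoraConfig mirroring the flags passed to run_clm_pt_with_peft.py.
from peft import LoraConfig, TaskType, get_peft_model

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                                             # --lora_rank
    lora_alpha=32,                                   # --lora_alpha
    lora_dropout=0.05,                               # --lora_dropout
    target_modules=["c_attn"],                       # --trainable: GPT-2's fused attention projection
    modules_to_save=["transformer.wte", "lm_head"],  # --modules_to_save: trained in full and saved
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()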

Training command:

torchrun --nnodes 1 --nproc_per_node 1 run_clm_pt_with_peft.py \
--deepspeed ds_zero2_no_offload.json \
--model_name_or_path IDEA-CCNL/Wenzhong2.0-GPT2-110M-BertTokenizer-chinese \
--tokenizer_name_or_path IDEA-CCNL/Wenzhong2.0-GPT2-110M-BertTokenizer-chinese \
--dataset_dir data \
--data_cache_dir temp_data_cache_dir \
--validation_split_percentage 0.001 \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 16 \
--do_train --seed $RANDOM \
--fp16 \
--max_steps 2500 \
--lr_scheduler_type cosine \
--learning_rate 2e-4 \
--warmup_ratio 0.05 \
--weight_decay 0.01 \
--logging_strategy steps \
--logging_steps 10 \
--save_strategy steps \
--save_total_limit 3 \
--save_steps 50 \
--gradient_accumulation_steps 1 \
--preprocessing_num_workers 8 \
--block_size 512 \
--output_dir output_dir \
--overwrite_output_dir \
--ddp_timeout 30000 \
--logging_first_step True \
--lora_rank 8 \
--lora_alpha 32 \
--trainable c_attn \
--modules_to_save transformer.wte,lm_head \
--lora_dropout 0.05 \
--torch_dtype float16 \
--gradient_checkpointing \
--ddp_find_unused_parameters False

4. How do we use the model?

import os
import torch
from transformers import BertTokenizer,GPT2LMHeadModel, AutoModelForCausalLM
from peft import PeftModel
hf_model_path = 'IDEA-CCNL/Wenzhong2.0-GPT2-110M-BertTokenizer-chinese'
tokenizer = BertTokenizer.from_pretrained(hf_model_path)
model = AutoModelForCausalLM.from_pretrained(hf_model_path)

model_vocab_size = model.get_output_embeddings().weight.size(0)
model.resize_token_embeddings(len(tokenizer))

model = PeftModel.from_pretrained(model, os.path.join("output_dir", "adapter_model"), torch_dtype=torch.float32)
model.cuda()
model.eval()

def generate_word_level(input_text,n_return=5,max_length=128,top_p=0.9):
    inputs = tokenizer(input_text,return_tensors='pt',add_special_tokens=False).to(model.device)
    gen = model.generate(
                            inputs=inputs['input_ids'],
                            max_length=max_length,
                            do_sample=True,
                            top_p=top_p,
                            eos_token_id=21133,
                            pad_token_id=0,
                            num_return_sequences=n_return)

    sentences = tokenizer.batch_decode(gen)
    for idx,sentence in enumerate(sentences):
        print(f'sentence {idx}: {sentence}')
        print('*'*20)
    return gen

outputs = generate_word_level('宫廷玉液酒',n_return=5,max_length=128)
print(outputs)
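
Optionally, the LoRA weights can be merged back into the base model so that inference no longer needs the peft wrapper; merge_and_unload is the standard peft call for LoRA adapters (the output path below is just an example):

# Optional: merge the LoRA weights into the base model for standalone inference.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("output_dir/merged_model")   # example output path
tokenizer.save_pretrained("output_dir/merged_model")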

