Keyformer Environment Setup and Analysis

References:
https://github.com/d-matrix-ai/keyformer-llm
https://arxiv.org/pdf/2403.09054

1 Software environment preparation

# install Anaconda
wget https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Linux-x86_64.sh
sudo chmod a+x ./Anaconda3-2024.06-1-Linux-x86_64.sh
./Anaconda3-2024.06-1-Linux-x86_64.sh
# install CUDA 12.5 toolkit and driver (Ubuntu 24.04)
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-ubuntu2404.pin
sudo mv cuda-ubuntu2404.pin /etc/apt/preferences.d/cuda-repository-pin-600
wget https://developer.download.nvidia.com/compute/cuda/12.5.1/local_installers/cuda-repo-ubuntu2404-12-5-local_12.5.1-555.42.06-1_amd64.deb
sudo dpkg -i cuda-repo-ubuntu2404-12-5-local_12.5.1-555.42.06-1_amd64.deb
sudo cp /var/cuda-repo-ubuntu2404-12-5-local/cuda-*-keyring.gpg /usr/share/keyrings/
sudo apt-get update
sudo apt-get -y install cuda-toolkit-12-5
sudo apt-get install -y nvidia-driver-555-open
sudo apt-get install -y cuda-drivers-555
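After the driver and toolkit are in place (and once PyTorch is installed in step 2), a quick sanity check can confirm that the GPU is visible from Python; a minimal sketch:

# minimal check that PyTorch sees the GPU (run inside the keyformer-env environment)
import torch

print(torch.__version__)           # PyTorch build
print(torch.version.cuda)          # CUDA version PyTorch was built against
print(torch.cuda.is_available())   # should print True after a successful install
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))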

2 Repository initialization

git clone https://github.com/d-matrix-ai/keyformer-llm.git
conda env create --file=conda-env.yml
conda activate keyformer-env
pip install torch flash_attn accelerate==0.32.1
pip install --upgrade transformers
# clone the model repos (LFS weights skipped via GIT_LFS_SKIP_SMUDGE)
cd models
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/mosaicml/mpt-7b mpt-7b-keyformer
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/cerebras/Cerebras-GPT-6.7B cerebras-6.7b-keyformer
cd ..
cd models/model_download
# this step may need a proxy to reach huggingface.co and requires CUDA (> 12.0)
# if you hit errors, try changing the download_model function to pass model_name, trust_remote_code=True, torchscript=True
python3 download_model.py --model_name mosaicml/mpt-7b
python3 download_model.py --model_name cerebras/Cerebras-GPT-6.7B

cd ../../
# note: if both downloads land in the same model/ directory, move each model's files before downloading the next
mv models/model_download/model/* models/mpt-7b-keyformer/
mv models/model_download/model/* models/cerebras-6.7b-keyformer/
cp -r models/mpt-keyformer-lib/* models/mpt-7b-keyformer
cp -r models/cerebras-keyformer-lib/* models/cerebras-6.7b-keyformer
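If download_model.py errors out, the adjustment suggested in the comment above amounts to loading with remote code trusted and torchscript enabled; a hedged sketch of that kind of call (argument names mirror the comment, not the script's actual code):

# illustrative download with the suggested settings; paths and names are examples
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "mosaicml/mpt-7b"   # or "cerebras/Cerebras-GPT-6.7B"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,      # MPT ships custom modeling code on the Hub
    torchscript=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")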

3 Run summarization

cd summarization/dataset_download
pip install -U datasets
# if this step hangs during the Mapping phase, try removing the cache_dir setting on line 23
# the HF_ENDPOINT setting only takes effect when http_proxy and https_proxy are unset
HF_ENDPOINT=https://hf-mirror.com python download_cnndm.py
cd ../
cp ../models/mpt-7b-keyformer/fc.py /home/nmhn/.cache/huggingface/modules/transformers_modules/mpt-7b-keyformer/  # adjust the cache path to your own user
vim cerebras-6.7b-keyformer
# change line 18 to: from typing import Dict, Any, List, Mapping, Optional
vim modeling_gpt2.py

# line 629:
self.req_tokens = attn_outputs[7]
self.itr_count = attn_outputs[6]
# line 919:
self.keyformer = config.keyformer_config["keyformer"]
self.kv_cache = config.keyformer_config["kv_cache"]
self.recent = config.keyformer_config["recent"]
self.tau_init = config.keyformer_config["tau_init"]
self.tau_delta = config.keyformer_config["tau_delta"]

chmod a+x run_summarization_task.sh
./run_summarization_task.sh
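The line-919 edit reads its settings from config.keyformer_config. Where the repo builds that dict is not shown here, but a minimal sketch of attaching one to a Hugging Face config (key names taken from the edit above, values copied from the run-script flags; the tau_delta value is only illustrative):

# illustrative only: the shape of the keyformer_config dict read at line 919
from transformers import AutoConfig

config = AutoConfig.from_pretrained("../models/cerebras-6.7b-keyformer")
config.keyformer_config = {
    "keyformer": True,    # --keyformer
    "kv_cache": 60,       # --kv_cache 60
    "recent": 30,         # --recent 30
    "tau_init": 1.0,      # --tau_init 1
    "tau_delta": 0.01,    # assumed: per-step temperature step toward --tau_end
}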

4 Run conversation

cd conversation/dataset_download
HF_ENDPOINT=https://hf-mirror.com python download_soda.py
chmod a+x ./run_conversation_task.sh
vim ./run_conversation_task.sh # add HF_ENDPOINT=https://hf-mirror.com before the python command
# modify line 413 to:
                    if dialogue.dim() > 1:
                        dialogue = dialogue[:, input_ids.shape[-1]:].cpu()
                    else:
                        dialogue = dialogue.cpu()
./run_conversation_task.sh

--model_name mosaicml/mpt-7b \
--dataset_path ./data/soda_eval.json \
--save_path ./out_model.conversation \
--score_path ./out_model.score \
--model_path ../models/mpt-7b-keyformer \
--attentions_path ./out_model.attention \
--device cuda \
--task summarization \
--bs 1 \
--dtype float16 \
--causal_lm \
--early_stopping \
--output_summaries_only \
--output_sequence_scores \
--save_attentions \
--save_prompt_attentions \
--padding_side left \
--beam 4 \
--model_parallelize \
--keyformer \
--kv_cache 60 \
--recent 30 \
--tau_init 1 \
--tau_end 2 \
--no_repeat_ngram_size 0 \
--repetition_penalty 1 \
--max_tokenizer_length 1920 \
--max_new_tokens 128 \
--min_gen_length 30 \
--num_return_sequences 1 \
--seed 12345 \
--n_obs 1000
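For reference, many of these flags correspond to standard Hugging Face generation arguments; a hedged sketch of the equivalent generate() call (the script may wire them differently, and the mappings marked as assumed are guesses):

# rough mapping of the run-script flags onto transformers generation arguments
import torch
from transformers import PreTrainedModel

def generate_like_run_script(model: PreTrainedModel, input_ids: torch.Tensor):
    return model.generate(
        input_ids,
        num_beams=4,                 # --beam 4
        max_new_tokens=128,          # --max_new_tokens 128
        min_new_tokens=30,           # --min_gen_length 30 (assumed mapping)
        early_stopping=True,         # --early_stopping
        no_repeat_ngram_size=0,      # --no_repeat_ngram_size 0
        repetition_penalty=1.0,      # --repetition_penalty 1
        num_return_sequences=1,      # --num_return_sequences 1
        output_scores=True,          # --output_sequence_scores (assumed mapping)
        return_dict_in_generate=True,
    )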

Important functions

# models/[model_name]/attention_llm_eval_harness.py
keyformer_mask
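keyformer_mask is where the KV-cache eviction decision is made. The repository's implementation is not reproduced here, but the core idea from the paper (add Gumbel noise to the attention score, take a temperature softmax, then keep the recent window plus the highest-scoring older tokens) can be sketched roughly as follows; function and argument names are mine, not the repo's:

# paraphrased sketch of Keyformer-style KV-cache token selection (not the repo's code)
import torch
import torch.nn.functional as F

def keyformer_style_keep_mask(attn_logits: torch.Tensor, kv_budget: int, recent: int, tau: float) -> torch.Tensor:
    """Return a boolean mask of tokens to keep in the KV cache.

    attn_logits: (seq_len,) unnormalized attention scores for one head
    kv_budget:   total number of tokens to keep
    recent:      size of the always-kept recent window
    tau:         Gumbel-softmax temperature
    """
    seq_len = attn_logits.shape[-1]
    # Gumbel noise regularizes the score so eviction is not purely greedy
    gumbel = -torch.log(-torch.log(torch.rand(seq_len) + 1e-9) + 1e-9)
    score = F.softmax((attn_logits + gumbel) / tau, dim=-1)

    keep = torch.zeros(seq_len, dtype=torch.bool)
    keep[seq_len - recent:] = True                    # always keep the recent window
    older = score.masked_fill(keep, float("-inf"))    # exclude the window from top-k
    k = min(max(kv_budget - recent, 0), seq_len - recent)
    keep[torch.topk(older, k).indices] = True         # keep top-scoring older tokens
    return keep

In the paper the score is accumulated across decoding iterations and tau is annealed from tau_init toward tau_end, which is presumably what the tau_delta field above tracks.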

TODO: analysis
