Source code of LED: Lexicon-Enlightened Dense Retriever for Large-Scale Retrieval (WWW 2023)
All of the following steps must be run inside a Conda environment.
# Create and activate the Conda environment used by every step below.
conda create -n trans22 python=3.7
conda activate trans22

# PyTorch 1.9.1 with the CUDA 11.1 toolkit.
conda install pytorch=1.9.1 torchvision torchaudio cudatoolkit=11.1 -c pytorch -c nvidia
# Alternative (no explicit cudatoolkit / nvidia channel):
# conda install pytorch=1.9.1 torchvision torchaudio -c pytorch

# Python dependencies. `scikit-learn` is the real package name; the `sklearn`
# alias on PyPI is deprecated and no longer installs.
pip install transformers==4.11.3
pip install gpustat ipython jupyter datasets accelerate scikit-learn tensorboard nltk
# Alternative (without gpustat):
# pip install ipython jupyter datasets accelerate scikit-learn tensorboard nltk

# FAISS with GPU support.
conda install -c conda-forge faiss-gpu
$DATA_DIR/
--msmarco/
----collection.tsv
----collection.tsv.title.tsv (titles, copied from https://github.com/texttron/tevatron)
----passage_ranking/
------train.query.txt [502939 lines]
------qrels.train.tsv [532761 lines]
------train.negatives.tsv [400782 lines] (BM25 negatives, copied from tevatron)
------dev.query.txt [6980 lines]
------qrels.dev.tsv [7437 lines]
------top1000.dev [6668967 lines]
------test2019.query.txt [200 lines]
------qrels.test2019.tsv [9260 lines]
------top1000.test2019 [189877 lines]
Unless otherwise specified, download each file from the official MS MARCO site, https://microsoft.github.io/msmarco/Datasets, and then rename it to match the layout above.
Please refer to Anserini for setting up LEX inference (needed for evaluation and for self hard-negative mining).
- NVIDIA A100 80G
Note: you may need to re-tune the hyperparameters when using multiple GPUs.
# First-stage dense retriever with the best recipe we found.
export DATA_DIR=[DATA_DIR]  # replace [DATA_DIR] with the dataset root described above

# The training log is redirected into logs/, so make sure it exists.
mkdir -p logs

# CUDA_VISIBLE_DEVICES is given as a per-command prefix (no `;`) so that the
# python process actually receives it.
CUDA_VISIBLE_DEVICES=7 python -m proj_dense.train_dense_retriever \
  --do_train --do_eval --do_prediction --negs_sources official --num_negs_per_system 1000 \
  --model_name_or_path Luyu/co-condenser-marco --overwrite_output_dir \
  --warmup_proportion 0.01 --weight_decay 0.0 --max_grad_norm 1. --seed 42 \
  --data_dir "${DATA_DIR}" \
  --output_dir ./output/dense/single-gpu-co-condenser-bs16nn7-lr1e5-ep3-qrylen32 \
  --data_load_type memory --num_proc 6 --max_length 128 --eval_steps 10000 --logging_steps 100 \
  --learning_rate 1e-5 --num_train_epochs 3 --eval_batch_size 32 --num_dev 1000 \
  --train_batch_size 16 --gradient_accumulation_steps 1 --num_negatives 7 \
  > logs/single-gpu-co-condenser-bs16nn7-lr1e5-ep3-qrylen32.log

# Do prediction (two processes, one per visible GPU).
CUDA_VISIBLE_DEVICES=6,7 python -m torch.distributed.run --master_port 47784 --nproc_per_node=2 \
  -m proj_dense.train_dense_retriever --do_prediction \
  --model_name_or_path ./output/dense/single-gpu-co-condenser-bs16nn7-lr1e5-ep3-qrylen32 \
  --overwrite_output_dir --data_dir "${DATA_DIR}" \
  --output_dir ./output/dense/single-gpu-co-condenser-bs16nn7-lr1e5-ep3-qrylen32 --eval_batch_size 64
# First-stage lexicon-aware retriever with the best recipe we found.
export DATA_DIR=[DATA_DIR]  # replace [DATA_DIR] with the dataset root described above

# The training log is redirected into logs/, so make sure it exists.
mkdir -p logs

# "official" here indicates BM25 negatives.
python3 -m proj_sparse.train_splade_retriever \
  --do_train --do_eval --encoder distilbert \
  --model_name_or_path distilbert-base-uncased --overwrite_output_dir \
  --warmup_proportion 0.01 --weight_decay 0.01 --max_grad_norm 1. --seed 42 \
  --data_dir "${DATA_DIR}" \
  --output_dir ./output/sparse/splade-distilbert-bs48nn5-NSoff1000-lr3e5-ep3-baseline \
  --data_load_type memory --num_proc 2 --max_length 128 --eval_steps 10000 --logging_steps 100 \
  --learning_rate 3e-5 --num_train_epochs 3 --eval_batch_size 48 --num_dev 1000 \
  --train_batch_size 48 --gradient_accumulation_steps 1 --num_negatives 5 \
  --num_negs_per_system 1000 --negs_sources official \
  > logs/splade-distilbert-bs48nn5-NSoff1000-lr3e5-ep3-baseline.log

# Run --do_prediction separately if using DDP.
python3 -m proj_sparse.train_splade_retriever \
  --do_prediction --encoder distilbert \
  --model_name_or_path ./output/sparse/splade-distilbert-bs48nn5-NSoff1000-lr3e5-ep3-baseline --overwrite_output_dir \
  --data_dir "${DATA_DIR}" \
  --output_dir ./output/sparse/splade-distilbert-bs48nn5-NSoff1000-lr3e5-ep3-baseline \
  --data_load_type memory --num_proc 2 --max_length 128 --eval_batch_size 48
# Generate hard negatives with a dense retriever checkpoint; the --output_dir
# below is what the second-stage dense training reads via
# --static_hard_negatives_path.
export DATA_DIR=[DATA_DIR]  # replace [DATA_DIR] with the dataset root described above

# NOTE(review): --model_name_or_path carries a "-stage2" suffix, while the
# first-stage dense recipe writes to
# ./output/dense/single-gpu-co-condenser-bs16nn7-lr1e5-ep3-qrylen32 (no suffix).
# Presumably the checkpoint directory was renamed; confirm the path before running.
python -m torch.distributed.run --master_port 47778 --nproc_per_node=4 \
  -m proj_dense.generate_dense_hard_negs \
  --num_negatives 200 --overwrite_save_path \
  --model_name_or_path ./output/dense/single-gpu-co-condenser-bs16nn7-lr1e5-ep3-qrylen32-stage2 \
  --data_dir "${DATA_DIR}" \
  --output_dir ./output/dense/2stage/co-condenser-bs16nn7-lr1e5-ep3-qrylen32-stage2 \
  --data_load_type memory --num_proc 6 --eval_batch_size 256
# Second-stage dense retriever with the best recipe we found.
export DATA_DIR=[DATA_DIR]  # replace [DATA_DIR] with the dataset root described above

# --static_hard_negatives_path points at the output directory of the dense
# hard-negative generation step.
# NOTE(review): --model_name_or_path carries a "-stage2" suffix that the
# first-stage dense output directory does not have; confirm the checkpoint path.
python -m proj_dense.train_dense_retriever_continue \
  --static_hard_negatives_path ./output/dense/2stage/co-condenser-bs16nn7-lr1e5-ep3-qrylen32-stage2 \
  --do_train --do_eval --do_prediction --num_negs_per_system 200 \
  --model_name_or_path ./output/dense/single-gpu-co-condenser-bs16nn7-lr1e5-ep3-qrylen32-stage2 --overwrite_output_dir \
  --warmup_proportion 0.01 --weight_decay 0.0 --max_grad_norm 1. --seed 42 \
  --data_dir "${DATA_DIR}" \
  --output_dir ./output/dense/2stage/single-gpu-co-condenser-bs16nn7-lr5e6-ep3-qrylen32-stage2 \
  --data_load_type memory --num_proc 6 --max_length 128 --eval_steps 10000 --logging_steps 100 \
  --learning_rate 5e-6 --num_train_epochs 3 --eval_batch_size 32 --num_dev 1000 \
  --train_batch_size 16 --gradient_accumulation_steps 1 --num_negatives 7
# Generate hard negatives with the first-stage lexicon-aware retriever; the
# --output_dir below is what the second-stage lexicon-aware training reads via
# --static_hard_negatives_path.
export DATA_DIR=[DATA_DIR]  # replace [DATA_DIR] with the dataset root described above

python -m proj_sparse.generate_sparse_hard_negs \
  --data_dir "${DATA_DIR}" --eval_batch_size 64 \
  --output_dir ./output/sparse/splade-distilbert-bs48nn5-NSoff1000-lr3e5-ep3-baseline \
  --model_name_or_path ./output/sparse/splade-distilbert-bs48nn5-NSoff1000-lr3e5-ep3-baseline \
  --num_negatives 200 --encoder distilbert --data_load_type memory --overwrite_save_path

# Note: the train-query search is done in batches because Anserini becomes very
# slow after about 200k searches. This was done with several runs, so the code
# may possibly have bugs, but the current file is fine.
# Second-stage lexicon-aware retriever with the best recipe we found.
export DATA_DIR=[DATA_DIR]  # replace [DATA_DIR] with the dataset root described above

# Both the hard negatives and the initial checkpoint come from the first-stage
# lexicon-aware output directory.
python -m proj_sparse.train_splade_retriever_continue \
  --static_hard_negatives_path ./output/sparse/splade-distilbert-bs48nn5-NSoff1000-lr3e5-ep3-baseline \
  --do_train --do_eval --do_prediction --encoder distilbert \
  --model_name_or_path ./output/sparse/splade-distilbert-bs48nn5-NSoff1000-lr3e5-ep3-baseline --overwrite_output_dir \
  --warmup_proportion 0.01 --weight_decay 0.01 --max_grad_norm 1. --seed 42 \
  --data_dir "${DATA_DIR}" \
  --output_dir ./output/sparse/2stage/splade-distilbert-bs48nn5-NSoff1000-lr2e5-ep3-baseline \
  --data_load_type memory --num_proc 2 --max_length 128 --eval_steps 10000 --logging_steps 100 \
  --learning_rate 2e-5 --num_train_epochs 3 --eval_batch_size 48 --num_dev 1000 \
  --train_batch_size 48 --gradient_accumulation_steps 1 --num_negatives 5 \
  --num_negs_per_system 200
# LED retriever with the best recipe we found.
# In practice, we found NN=30 is even better.
# Shell assignments must not have spaces around `=`.
export DATA_DIR=[DATA_DIR]            # replace with the dataset root described above
export CUSTOM_HN_DIR=[CUSTOM_HN_DIR]  # replace with the custom hard-negative directory
export NN=32
export DSTW=1.0
export DSTM=rank
export NEGS=co-stg1,sp-stg1,sp-stg2

python -m proj_dense.train_LED_retriever \
  --tch_model_path ./output/sp-stg2 \
  --tch_encoder distilbert \
  --with_pos \
  --apply_dst \
  --tch_no_drop \
  --dst_method "${DSTM}" \
  --dst_loss_weight "${DSTW}" \
  --negs_sources "custom/${NEGS}" \
  --do_train --do_eval --do_prediction --num_negs_per_system 200 \
  --model_name_or_path ./output/co-stg1 \
  --overwrite_output_dir \
  --warmup_proportion 0.01 \
  --weight_decay 0.0 \
  --max_grad_norm 1. \
  --seed 42 \
  --data_dir "${DATA_DIR}" \
  --custom_hn_dir "${CUSTOM_HN_DIR}" \
  --output_dir "./output/entangle/stg1-co-bs16nn${NN}-lr5e6-ep3-sp-stg2-dst-${DSTM}-with-pos-${DSTW}-${NEGS}-tch-no-drop" \
  --data_load_type memory --num_proc 6 --max_length 128 --eval_steps 10000 --logging_steps 100 \
  --learning_rate 5e-6 --num_train_epochs 3 --eval_batch_size 32 --num_dev 1000 \
  --train_batch_size 16 --gradient_accumulation_steps 1 --num_negatives "${NN}"
# LED training via proj_dense.train_LED_w_RT, which takes an additional "xe"
# teacher (presumably a cross-encoder, judging by the flag and output names;
# TODO confirm).
# Shell assignments must not have spaces around `=`.
export DATA_DIR=[DATA_DIR]            # replace with the dataset root described above
export CUSTOM_HN_DIR=[CUSTOM_HN_DIR]  # replace with the custom hard-negative directory
export TCH_PATH=[TCH_PATH]            # replace with the path of the "xe" teacher model
export DSTM=rank
export DSTW=1.2
export TEMP=3.0
export XDSTW=1.5
export NN=32
export NEGS=co-stg1,sp-stg1,sp-stg2

# sp-stg2 is the LEX teacher, co-stg1 is the DEN after warming up; both are
# obtained from the previous steps.
# --xe_tch_model_path originally had no value and TCH_PATH was otherwise unused,
# so TCH_PATH is passed here (NOTE(review): confirm this is the intended flag).
# --data_dir originally used the undefined ${CORPUS_DIR}; every other step
# passes ${DATA_DIR}, which is the variable exported above.
python -m proj_dense.train_LED_w_RT \
  --sp_tch_model_path ./output/sp-stg2 \
  --sp_tch_encoder distilbert \
  --apply_dst \
  --with_ce_loss \
  --dst_method "${DSTM}" \
  --xe_tch_temp "${TEMP}" \
  --tch_no_drop \
  --dst_loss_weight "${DSTW}" \
  --xe_tch_model_path "${TCH_PATH}" \
  --xe_tch_dst_loss_weight "${XDSTW}" \
  --negs_sources "custom/${NEGS}" \
  --num_negs_per_system 200 \
  --do_train --do_eval --do_prediction \
  --model_name_or_path ./output/co-stg1 \
  --overwrite_output_dir \
  --warmup_proportion 0.01 \
  --weight_decay 0.0 \
  --max_grad_norm 1. \
  --seed 42 \
  --data_dir "${DATA_DIR}" \
  --custom_hn_dir "${CUSTOM_HN_DIR}" \
  --output_dir "./output/entangle/stg1-co-bs16nn${NN}-lr5e6-ep3-sp-stg2-dst-${DSTM}-${DSTW}-${NEGS}-xencoder-kl-${XDSTW}-tempv2-${TEMP}-tch-no-drop" \
  --data_load_type memory --num_proc 6 --max_length 128 --eval_steps 10000 --logging_steps 100 \
  --learning_rate 5e-6 --num_train_epochs 3 --eval_batch_size 32 --num_dev 1000 \
  --train_batch_size 16 --gradient_accumulation_steps 1 --num_negatives "${NN}"
If you find our code helpful, please cite our paper: (the url is not available yet)
@inproceedings{Zhang2023LED,
title={LED: Lexicon-Enlightened Dense Retriever for Large-Scale Retrieval},
author={Kai Zhang and Chongyang Tao and Tao Shen and Can Xu and Xiubo Geng and Binxing Jiao and Daxin Jiang},
booktitle={Proceedings of WWW},
url ={https://doi.org/10.1145/3543507.3583294},
doi ={10.1145/3543507.3583294},
year={2023}
}
If you have any questions, please feel free to contact Kai Zhang (drogozhang@gmail.com).