
Commit

Update README.md
Added more clarification for reproduction
ritaranx authored Jun 24, 2024
1 parent ad783bd commit f7a2aa8
Showing 1 changed file with 13 additions and 7 deletions.
README.md
@@ -4,6 +4,7 @@ This is the model and data for the paper **BMRetriever: Tuning Large Language Mo

See the paper [link](https://arxiv.org/abs/2404.18443) for details.

+**Update on 06/24/24**: We have added a script `eval.py` for reproducing our experimental results on the BEIR benchmark, and provided the prompts (instructions) used in our experiments in `prompt.py`.

## Model Specifications

@@ -51,17 +52,18 @@ from transformers import AutoTokenizer, AutoModel

def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
+    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
-        return last_hidden_states[:, -1]
+        embedding = last_hidden[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
-        batch_size = last_hidden_states.shape[0]
-        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
+        batch_size = last_hidden.shape[0]
+        embedding = last_hidden[torch.arange(batch_size, device=last_hidden.device), sequence_lengths]
+    return embedding

def get_detailed_instruct_query(task_description: str, query: str) -> str:
-    return f'Instruct: {task_description}\nQuery: {query}'
+    return f'{task_description}\nQuery: {query}'

def get_detailed_instruct_passage(passage: str) -> str:
    return f'Represent this passage\npassage: {passage}'
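For illustration (not part of the commit), here is a minimal sanity check of the patched pooling. With right padding, position -1 of the hidden states holds a pad token, so the function must index each sequence's true final token; the new `masked_fill` additionally zeroes every padded position so that no branch can ever return a pad embedding. The toy tensors and assertions below are ours.

```python
import torch
from torch import Tensor

def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    # Zero out hidden states at padded positions.
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        embedding = last_hidden[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden.shape[0]
        embedding = last_hidden[torch.arange(batch_size, device=last_hidden.device), sequence_lengths]
    return embedding

hidden = torch.randn(2, 5, 8)           # (batch, seq_len, hidden_dim)
mask = torch.tensor([[1, 1, 1, 0, 0],   # length-3 sequence, right-padded
                     [1, 1, 1, 1, 1]])  # full-length sequence
pooled = last_token_pool(hidden, mask)
assert torch.equal(pooled[0], hidden[0, 2])  # last real token, not position -1
assert torch.equal(pooled[1], hidden[1, 4])
```

With left padding, the final attention-mask column is all ones, which triggers the fast path: the pooled vector is simply the hidden state at position -1.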
@@ -83,7 +85,11 @@ input_texts = queries + documents
max_length = 512

# Tokenize the input texts
-batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt')
+batch_dict = tokenizer(input_texts, max_length=max_length-1, padding=False, truncation=True, return_attention_mask=False)
+
+# Important! Append the EOS token to every sequence before padding
+batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
+batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt').to("cuda")

model.eval()
with torch.no_grad():
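Reading the hunk in isolation can obscure the ordering constraint it introduces, so here is a self-contained sketch of the updated flow. It is ours, not part of the commit; the checkpoint name and task prompt are placeholders for illustration (the repository publishes its actual prompts in `prompt.py`).

```python
from transformers import AutoTokenizer

def get_detailed_instruct_query(task_description: str, query: str) -> str:
    return f'{task_description}\nQuery: {query}'

def get_detailed_instruct_passage(passage: str) -> str:
    return f'Represent this passage\npassage: {passage}'

# Placeholder task prompt and checkpoint; substitute your own.
task = 'Given a scientific claim, retrieve documents that support or refute the claim'
queries = [get_detailed_instruct_query(task, 'Aspirin inhibits platelet aggregation.')]
documents = [get_detailed_instruct_passage('Aspirin irreversibly acetylates cyclooxygenase-1.')]
input_texts = queries + documents
max_length = 512

tokenizer = AutoTokenizer.from_pretrained('BMRetriever/BMRetriever-410M')

# Truncate to max_length - 1 so that appending EOS never exceeds max_length.
batch_dict = tokenizer(input_texts, max_length=max_length - 1,
                       padding=False, truncation=True, return_attention_mask=False)

# Append EOS while input_ids are still plain Python lists, i.e. before padding,
# so EOS ends up as each sequence's final real token: the one last_token_pool selects.
batch_dict['input_ids'] = [ids + [tokenizer.eos_token_id] for ids in batch_dict['input_ids']]
batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt')
```

Tokenizing with `padding=False` first matters: if the sequences were padded before the EOS append, the EOS id would land after the pad tokens and `tokenizer.pad` could no longer build a correct attention mask.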
