gte-Qwen2-1.5B-instruct
Model Information
Model Download
# Install ModelScope
pip install modelscope
pip install sentence_transformers
# SDK model download
from modelscope import snapshot_download
model_dir = snapshot_download('iic/gte_Qwen2-1.5B-instruct')
# Git model download
git clone https://www.modelscope.cn/iic/gte_Qwen2-1.5B-instruct.git
Requirements
transformers>=4.39.2
flash_attn>=2.5.6
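If these packages are not already installed, a minimal install sketch (assuming pip is available; flash_attn builds CUDA extensions, so installation details can vary by environment):

pip install "transformers>=4.39.2"
pip install "flash_attn>=2.5.6"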
Usage
Sentence Transformers
from sentence_transformers import SentenceTransformer
from modelscope import snapshot_download

model_dir = snapshot_download("iic/gte_Qwen2-1.5B-instruct")
model = SentenceTransformer(model_dir, trust_remote_code=True)
# In case you want to reduce the maximum length:
model.max_seq_length = 8192

queries = [
    "how much protein should a female eat",
    "summit define",
]
documents = [
    "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
]

query_embeddings = model.encode(queries, prompt_name="query")
document_embeddings = model.encode(documents)

scores = (query_embeddings @ document_embeddings.T) * 100
print(scores.tolist())
# [[70.00668334960938, 8.184843063354492], [14.62419319152832, 77.71407318115234]]
Observe the config_sentence_transformers.json to see all pre-built prompt names. Otherwise, you can use model.encode(queries, prompt="Instruct: ...\nQuery: ") to use a custom prompt of your choice.
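For example, a minimal sketch of passing a custom prompt (the instruction string below is illustrative and reuses the retrieval task description from the Transformers example further down; any one-sentence task description works):

# Illustrative custom instruction prompt; substitute your own task description.
custom_prompt = (
    "Instruct: Given a web search query, retrieve relevant passages that answer the query\n"
    "Query: "
)
query_embeddings = model.encode(queries, prompt=custom_prompt)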
Transformers
import torch
import torch.nn.functional as F
from torch import Tensor
from modelscope import AutoTokenizer, AutoModel

def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'

# Each query must come with a one-sentence instruction that describes the task
task = 'Given a web search query, retrieve relevant passages that answer the query'
queries = [
    get_detailed_instruct(task, 'how much protein should a female eat'),
    get_detailed_instruct(task, 'summit define')
]
# No need to add instruction for retrieval documents
documents = [
    "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments."
]
input_texts = queries + documents

tokenizer = AutoTokenizer.from_pretrained('iic/gte_Qwen2-1.5B-instruct', trust_remote_code=True)
model = AutoModel.from_pretrained('iic/gte_Qwen2-1.5B-instruct', trust_remote_code=True)

max_length = 8192

# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt')
outputs = model(**batch_dict)
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
scores = (embeddings[:2] @ embeddings[2:].T) * 100
print(scores.tolist())
# [[70.00666809082031, 8.184867858886719], [14.62420654296875, 77.71405792236328]]
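flash_attn only runs on GPU, so in practice the snippet above is usually executed on a CUDA device. A minimal sketch (assuming a GPU with sufficient memory is available; half precision is optional):

# Move the model and tokenized inputs to the GPU and run inference in half precision.
model = model.half().to('cuda')
batch_dict = {k: v.to('cuda') for k, v in batch_dict.items()}
with torch.no_grad():
    outputs = model(**batch_dict)
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])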
Evaluation
MTEB & C-MTEB
You can use the scripts/eval_mteb.py to reproduce the following result of gte-Qwen2-1.5B-instruct:
Model Name                     | MTEB(56) | C-MTEB(35)
bge-base-en-1.5                | 64.23    | -
bge-large-en-1.5               | 63.55    | -
gte-large-en-v1.5              | 65.39    | -
gte-base-en-v1.5               | 64.11    | -
mxbai-embed-large-v1           | 64.68    | -
acge_text_embedding            | -        | 69.07
stella-mrl-large-zh-v3.5-1792d | -        | 68.55
gte-large-zh                   | -        | 66.72
multilingual-e5-base           | 59.45    | 56.21
multilingual-e5-large          | 61.50    | 58.81
e5-mistral-7b-instruct         | 66.63    | 60.81
gte-Qwen1.5-7B-instruct        | 67.34    | 69.52
Citation
If you find our paper or models helpful, please consider citing:
@article{li2023towards,
  title={Towards general text embeddings with multi-stage contrastive learning},
  author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
  journal={arXiv preprint arXiv:2308.03281},
  year={2023}
}