acge model
acge is a general-purpose text embedding model. The following models are currently available:
Model Name | Model Size (GB) | Dimension | Sequence Length | Language | Need instruction for retrieval? |
---|---|---|---|---|---|
acge-large-zh | 0.65 | 1024 | 1024 | Chinese | NO |
acge-base-zh | To be continued | | | Chinese | No
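A minimal sanity check of the properties listed above (a hedged sketch; it assumes acge-large-zh has been downloaded locally and is loadable with sentence-transformers, as in the Usage section below):
# Hedged sketch: check the embedding dimension and maximum sequence length of acge-large-zh.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('acge-large-zh')
print(model.max_seq_length)                                    # expected: 1024
emb = model.encode(["一段测试文本"], normalize_embeddings=True)  # illustrative input
print(emb.shape)                                               # expected: (1, 1024)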
Metric
C-MTEB leaderboard (Chinese)
Model Name | Model Size (GB) | Dimension | Sequence Length | Average (35) | Classification (9) | Clustering (4) | Pair Classification (2) | Reranking (4) | Retrieval (8) | STS (8) |
---|---|---|---|---|---|---|---|---|---|---|
acge-large-zh | 0.65 | 1024 | 1024 | 67.0 | 73.39 | 55.89 | 81.38 | 66.59 | 70.96 | 58.02 |
Reproduce our results
C-MTEB:
import torch
import argparse
import functools
import numpy as np
from C_MTEB.tasks import *
from typing import List, Dict
from sentence_transformers import SentenceTransformer
from mteb import MTEB, DRESModel


class RetrievalModel(DRESModel):
    """Wraps a SentenceTransformer encoder for the MTEB dense-retrieval interface."""

    def __init__(self, encoder, **kwargs):
        self.encoder = encoder

    def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:
        # No instruction prefix is added to queries (see the table above).
        input_texts = ['{}'.format(q) for q in queries]
        return self._do_encode(input_texts)

    def encode_corpus(self, corpus: List[Dict[str, str]], **kwargs) -> np.ndarray:
        input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
        input_texts = ['{}'.format(t) for t in input_texts]
        return self._do_encode(input_texts)

    @torch.no_grad()
    def _do_encode(self, input_texts: List[str]) -> np.ndarray:
        return self.encoder.encode(
            sentences=input_texts,
            batch_size=512,
            normalize_embeddings=True,
            convert_to_numpy=True
        )


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', default="acge-large-zh", type=str)
    parser.add_argument('--task_type', default=None, type=str)
    parser.add_argument('--pooling_method', default='cls', type=str)
    parser.add_argument('--output_dir', default='zh_results',
                        type=str, help='output directory')
    parser.add_argument('--max_len', default=1024, type=int, help='max length')
    return parser.parse_args()


if __name__ == '__main__':
    args = get_args()
    encoder = SentenceTransformer(args.model_name_or_path).half()
    encoder.encode = functools.partial(encoder.encode, normalize_embeddings=True)
    encoder.max_seq_length = int(args.max_len)

    task_names = [t.description["name"] for t in MTEB(task_types=args.task_type,
                                                      task_langs=['zh', 'zh-CN']).tasks]
    TASKS_WITH_PROMPTS = ["T2Retrieval", "MMarcoRetrieval", "DuRetrieval", "CovidRetrieval", "CmedqaRetrieval",
                          "EcomRetrieval", "MedicalRetrieval", "VideoRetrieval"]

    for task in task_names:
        evaluation = MTEB(tasks=[task], task_langs=['zh', 'zh-CN'])
        if task in TASKS_WITH_PROMPTS:
            # Retrieval tasks go through the DRES wrapper defined above.
            evaluation.run(RetrievalModel(encoder), output_folder=args.output_dir, overwrite_results=False)
        else:
            evaluation.run(encoder, output_folder=args.output_dir, overwrite_results=False)
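The script above depends on the C_MTEB package from the FlagEmbedding repository, and MTEB writes one result file per task into the output directory. A hedged sketch for inspecting those files afterwards (the directory name matches the default --output_dir above; the exact JSON layout may vary by mteb version):
# Hedged sketch: list the per-task result files written by MTEB.
import os
import json

for fname in sorted(os.listdir('zh_results')):
    if not fname.endswith('.json'):
        continue
    with open(os.path.join('zh_results', fname)) as f:
        result = json.load(f)
    print(fname, list(result.keys()))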
Usage
acge Chinese models
Usage with the sentence-transformers library:
from sentence_transformers import SentenceTransformer
sentences = ["数据1", "数据2"]
model = SentenceTransformer('acge-large-zh')
print(model.max_seq_length)
embeddings_1 = model.encode(sentences, normalize_embeddings=True)
embeddings_2 = model.encode(sentences, normalize_embeddings=True)
similarity = embeddings_1 @ embeddings_2.T
print(similarity)
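Because the embeddings above are L2-normalized, the matrix product is a cosine-similarity matrix. A hedged sketch of retrieval-style usage (the query and documents are illustrative; per the table above, no instruction prefix is needed for retrieval):
# Hedged sketch: cosine-similarity retrieval with acge-large-zh.
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer('acge-large-zh')
query = ["如何更换手机电池"]                       # illustrative query
docs = ["手机电池更换教程", "今天的天气很好"]        # illustrative corpus
q_emb = model.encode(query, normalize_embeddings=True)
d_emb = model.encode(docs, normalize_embeddings=True)
scores = q_emb @ d_emb.T                           # cosine similarity (vectors are normalized)
print(docs[int(np.argmax(scores))])                # most similar document to the query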
Using the modelscope library directly:
from modelscope import AutoModel, AutoTokenizer
from sklearn.preprocessing import normalize
import torch

model = AutoModel.from_pretrained('AI-ModelScope/acge-large-zh')
tokenizer = AutoTokenizer.from_pretrained('AI-ModelScope/acge-large-zh')
sentences = ["数据1", "数据ABCDEFGH"]
batch_data = tokenizer(
    batch_text_or_text_pairs=sentences,
    padding="longest",
    return_tensors="pt",
    max_length=1024,
    truncation=True,
)
attention_mask = batch_data["attention_mask"]
with torch.no_grad():
    model_output = model(**batch_data)
# Masked mean pooling over the last hidden states, then L2 normalization
last_hidden = model_output.last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
vectors = normalize(vectors, norm="l2", axis=1)
print(vectors.shape)  # (2, 1024)
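The block above performs masked mean pooling over the last hidden states and then L2-normalizes the result with scikit-learn. A minimal, hedged equivalent in pure torch, reusing model and batch_data from above:
# Hedged alternative: same masked mean pooling + L2 normalization, in pure torch.
import torch
import torch.nn.functional as F

with torch.no_grad():
    model_output = model(**batch_data)
mask = batch_data["attention_mask"].unsqueeze(-1).float()
summed = (model_output.last_hidden_state * mask).sum(dim=1)
vectors = summed / mask.sum(dim=1)
vectors = F.normalize(vectors, p=2, dim=1)
print(vectors.shape)  # torch.Size([2, 1024])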
acge models for English
Using Sentence-Transformers:
from sentence_transformers import SentenceTransformer
sentences = ["one car come", "one car go"]
model = SentenceTransformer('acge-large-zh')
print(model.max_seq_length)
embeddings_1 = model.encode(sentences, normalize_embeddings=True)
embeddings_2 = model.encode(sentences, normalize_embeddings=True)
similarity = embeddings_1 @ embeddings_2.T
print(similarity)
Using modelscope:
from modelscope import AutoModel, AutoTokenizer
from sklearn.preprocessing import normalize
import torch

model = AutoModel.from_pretrained('AI-ModelScope/acge-large-zh')
tokenizer = AutoTokenizer.from_pretrained('AI-ModelScope/acge-large-zh')
sentences = ["one car come", "one car go"]
batch_data = tokenizer(
    batch_text_or_text_pairs=sentences,
    padding="longest",
    return_tensors="pt",
    max_length=1024,
    truncation=True,
)
attention_mask = batch_data["attention_mask"]
with torch.no_grad():
    model_output = model(**batch_data)
last_hidden = model_output.last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
vectors = normalize(vectors, norm="l2", axis=1)
print(vectors.shape)  # (2, 1024)