@@ -1,27 +1,29 @@
-import os
-from concurrent.futures import ThreadPoolExecutor
-
-import numpy as np
-import openai
 import torch.nn as nn
-from more_itertools import chunked
-from openai.openai_object import OpenAIObject
 from sentence_transformers import SentenceTransformer, models
 from src.sts import STSEvaluation
 from transformers import AutoModel, BertModel
 
-openai.api_key = os.environ["OPENAI_API_KEY"]
-
 # MODEL_PATH = "cl-nagoya/sup-simcse-ja-large"
 # MODEL_PATH = "cl-nagoya/sup-simcse-ja-base"
 # MODEL_PATH = "MU-Kindai/Japanese-SimCSE-BERT-large-sup"
 # MODEL_PATH = "colorfulscoop/sbert-base-ja"
-MODEL_PATH = "pkshatech/GLuCoSE-base-ja"
+# MODEL_PATH = "pkshatech/GLuCoSE-base-ja"
+# MODEL_PATH = "oshizo/sbert-jsnli-luke-japanese-base-lite"
+MODEL_PATH = "intfloat/multilingual-e5-large"
+
+
+sts = STSEvaluation(sts_dir="./datasets/sts")
+
 
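+# each evaluate_* function scores an encode callable on the STS dev set, then the full benchmark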
+def evaluate():
+    model = SentenceTransformer(MODEL_PATH).eval().cuda()
+    print(sts.dev(encode=model.encode))
+    print(sts(encode=model.encode))
 
-def load_jcse(model_name: str):
-    backbone = models.Transformer(model_name)
-    pretrained_model: BertModel = AutoModel.from_pretrained(model_name)
+
+def evaluate_jcse():
+    backbone = models.Transformer(MODEL_PATH)
+    pretrained_model: BertModel = AutoModel.from_pretrained(MODEL_PATH)
     hidden_size = pretrained_model.config.hidden_size
 
     # load weights of Transformer layers
@@ -31,7 +33,7 @@ def load_jcse(model_name: str):
         pooling_mode="cls",
     )
 
-    if "unsup" in model_name:
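+    # supervised SimCSE checkpoints carry an extra MLP head; unsupervised ones use CLS pooling alone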
+    if "unsup" in MODEL_PATH:
         model = SentenceTransformer(modules=[backbone, pooling]).eval().cuda()
 
     else:
@@ -49,44 +51,64 @@ def load_jcse(model_name: str):
         mlp.load_state_dict(mlp_state_dict)
         model = SentenceTransformer(modules=[backbone, pooling, mlp]).eval().cuda()
 
-    return model
+    print(sts.dev(encode=model.encode))
+    print(sts(encode=model.encode))
 
 
-def load_vanilla(model_name: str):
-    backbone = models.Transformer(model_name)
+def evaluate_vanilla():
+    backbone = models.Transformer(MODEL_PATH)
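+    # "vanilla" = raw pretrained backbone with CLS pooling, no contrastive fine-tuning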
     pooling = models.Pooling(
         word_embedding_dimension=backbone.auto_model.config.hidden_size,
         pooling_mode="cls",
     )
-    return SentenceTransformer(modules=[backbone, pooling]).eval().cuda()
+    model = SentenceTransformer(modules=[backbone, pooling]).eval().cuda()
+    print(sts.dev(encode=model.encode))
+    print(sts(encode=model.encode))
 
 
-sts = STSEvaluation(sts_dir="./datasets/sts")
+def evaluate_openai():
+    import os
+    import openai
+    import numpy as np
+    from concurrent.futures import ThreadPoolExecutor
+    from more_itertools import chunked
+    from openai.openai_object import OpenAIObject
+
+    openai.api_key = os.environ["OPENAI_API_KEY"]
+
+    def encode_openai(batch: list[str]):
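+        # ada-002 returns one embedding per input sentence, in request order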
+        res: OpenAIObject = openai.Embedding.create(
+            model="text-embedding-ada-002",
+            input=batch,
+        )
+        return [d.embedding for d in res.data]
+
+    def encode(sentences: list[str], batch_size: int = 128):
+        embs = []
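+        # fan batches out over a thread pool; the API calls are I/O-bound, so threads suffice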
+        with ThreadPoolExecutor(max_workers=32) as executor:
+            batches = chunked(list(sentences), batch_size)
+            for emb in executor.map(encode_openai, batches):
+                embs += emb
+        embs = np.array(embs)
+        return embs
 
-# model = load_jcse(MODEL_PATH)
-# model = load_vanilla("cl-tohoku/bert-base-japanese-v3")
-model = SentenceTransformer(MODEL_PATH).eval().cuda()
-print(sts.dev(encode=model.encode))
-print(sts(encode=model.encode))
+    print(sts.dev(encode=encode))
+    print(sts(encode=encode))
 
 
-# def encode_openai(batch: list[str]):
-#     res: OpenAIObject = openai.Embedding.create(
-#         model="text-embedding-ada-002",
-#         input=batch,
-#     )
-#     return [d.embedding for d in res.data]
+def evaluate_e5():
+    model = SentenceTransformer(MODEL_PATH).eval().cuda()
 
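+    # E5 checkpoints expect a "query: " (or "passage: ") prefix on every input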
+    def encode(sentences: list[str]):
+        sentences = [f"query: {s}" for s in sentences]
+        return model.encode(sentences)
 
-# def encode(sentences: list[str], batch_size: int = 128):
-#     embs = []
-#     with ThreadPoolExecutor(max_workers=32) as executor:
-#         batches = chunked(list(sentences), batch_size)
-#         for emb in executor.map(encode_openai, batches):
-#             embs += emb
-#     embs = np.array(embs)
-#     return embs
+    print(sts.dev(encode=encode))
+    print(sts(encode=encode))
 
 
-# print(sts.dev(encode=encode))
-# print(sts(encode=encode))
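+# uncomment the entry point that matches the current MODEL_PATH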
+if __name__ == "__main__":
+    # evaluate()
+    # evaluate_vanilla()
+    # evaluate_openai()
+    evaluate_e5()