!pip install sentence_transformers
!pip install unidecode
!pip install langchain
!pip install faiss-cpu

from pathlib import Path
import json
import logging
import math

from torch.utils.data import DataLoader
from unidecode import unidecode
from sentence_transformers import SentenceTransformer, losses, InputExample
model="paraphrase-multilingual-mpnet-base-v2"
|
|
model = SentenceTransformer(model)
|
|
batch_size = 32
|
|
num_epochs = 50
|
|
train_path = Path("/content/train.json")
|
|
with open(train_path, 'r', encoding='utf-8') as file:
    queries_Categoricos = json.load(file)
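
# Assumed structure of train.json (inferred from the pair-building loop below):
# a mapping from category name to its list of queries, e.g.
# {"category A": ["query 1", "query 2"], "category B": ["query 3"]}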

train_loss = losses.MultipleNegativesRankingLoss(model=model)

train_examples = []
for category in queries_Categoricos.keys():
    # Normalize the category once, instead of re-normalizing it on every inner iteration
    category_text = unidecode(category).strip().lower()
    for query in queries_Categoricos[category]:
        query_text = unidecode(query).strip().lower()
        # Each (category, query) pair is a positive example; note that
        # MultipleNegativesRankingLoss ignores the label
        train_examples.append(InputExample(texts=[category_text, query_text], label=1.0))

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
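
# With MultipleNegativesRankingLoss, the other examples in each batch act as
# negatives, so batch_size also controls how many negatives each positive pair
# is contrasted against.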
# Note: CECorrelationEvaluator is a CrossEncoder evaluator and would not work
# with this SentenceTransformer model; a bi-encoder evaluator would be needed.
#evaluator = CECorrelationEvaluator.from_input_examples(dev_samples, name='sts-dev')

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of training steps for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the bi-encoder (this is a SentenceTransformer, not a cross-encoder)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          #evaluator=evaluator,
          epochs=num_epochs,
          #evaluation_steps=1000,
          warmup_steps=warmup_steps)

save_path = f"./{num_epochs}pasos/paraphrase-multilingual-mpnet-base-v2/model/"
model.save(save_path)
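
# A minimal retrieval sketch (not part of the original script): load the
# fine-tuned model and search category embeddings with the faiss-cpu package
# installed above. The query string is an illustrative assumption.
import faiss
import numpy as np

trained = SentenceTransformer(save_path)
# Normalize category names the same way the training pairs were normalized
categories = [unidecode(c).strip().lower() for c in queries_Categoricos.keys()]
cat_emb = trained.encode(categories, normalize_embeddings=True)  # unit-norm vectors
index = faiss.IndexFlatIP(cat_emb.shape[1])  # inner product == cosine on unit vectors
index.add(np.asarray(cat_emb, dtype="float32"))
q_emb = trained.encode(["example query"], normalize_embeddings=True)
k = min(3, len(categories))
scores, ids = index.search(np.asarray(q_emb, dtype="float32"), k)
print([categories[i] for i in ids[0]])  # top-k closest categories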

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Shell magics do not apply Python %-formatting; use IPython's {variable}
# interpolation instead, and -r to zip the directory recursively
!zip -r "./{num_epochs}p.zip" "/content/{num_epochs}pasos"