!pip install sentence_transformers
!pip install unidecode
!pip install langchain
!pip install faiss-cpu

from torch.utils.data import DataLoader
import math
import logging
from unidecode import unidecode
from pathlib import Path
import json
from sentence_transformers import SentenceTransformer, losses, InputExample

model_name = "paraphrase-multilingual-mpnet-base-v2"
model = SentenceTransformer(model_name)
batch_size = 32
num_epochs = 50

train_path = Path("/content/train.json")
with open(train_path, 'r', encoding='utf-8') as file:
    queries_Categoricos = json.load(file)

train_loss = losses.MultipleNegativesRankingLoss(model=model)

# Build (category, query) training pairs, normalizing accents, whitespace, and case.
train_examples = []
for category, queries in queries_Categoricos.items():
    category = unidecode(category).strip().lower()
    for query in queries:
        query = unidecode(query).strip().lower()
        # MultipleNegativesRankingLoss treats each pair as a positive (label 1.0)
        # and uses the other pairs in the batch as negatives.
        train_examples.append(InputExample(texts=[category, query], label=1.0))

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
#evaluator = CECorrelationEvaluator.from_input_examples(dev_samples, name='sts-dev')

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of total training steps for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the bi-encoder
model.fit(train_objectives=[(train_dataloader, train_loss)],
          #evaluator=evaluator,
          epochs=num_epochs,
          #evaluation_steps=1000,
          warmup_steps=warmup_steps)

save_path = f"./{num_epochs}pasos/paraphrase-multilingual-mpnet-base-v2/model/"
model.save(save_path)

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Colab shell magics interpolate Python values with {...}, not % formatting,
# and zip needs -r to recurse into the output directory.
!zip -r "./{num_epochs}p.zip" "/content/{num_epochs}pasos"
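
# --- Optional: using the fine-tuned model for retrieval ---
# A minimal sketch (not part of the original script) showing how the saved model
# could be paired with faiss-cpu, which is installed above but never used.
# The category labels and the query below are hypothetical placeholders.
import faiss
from sentence_transformers import SentenceTransformer

tuned_model = SentenceTransformer(f"./{num_epochs}pasos/paraphrase-multilingual-mpnet-base-v2/model/")

categories = ["electronica", "ropa deportiva", "hogar y cocina"]  # hypothetical labels
embeddings = tuned_model.encode(categories, convert_to_numpy=True)
faiss.normalize_L2(embeddings)  # normalize so inner product equals cosine similarity

index = faiss.IndexFlatIP(embeddings.shape[1])  # exact inner-product index
index.add(embeddings)

query_vec = tuned_model.encode(["zapatillas para correr"], convert_to_numpy=True)
faiss.normalize_L2(query_vec)
scores, ids = index.search(query_vec, k=1)
print(categories[ids[0][0]], scores[0][0])  # nearest category and its cosine score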