LLm2Node/finetrainCollabversion.py


!pip install sentence_transformers
!pip install unidecode
!pip install langchain
!pip install faiss-cpu
from torch.utils.data import DataLoader
import math
import logging
from unidecode import unidecode
from pathlib import Path
import json
from sentence_transformers import SentenceTransformer, losses, InputExample
model="paraphrase-multilingual-mpnet-base-v2"
model = SentenceTransformer(model)
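# The base model is a multilingual MPNet bi-encoder from sentence-transformers;
# it maps sentences to dense vectors and is fine-tuned below on category/query pairs.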
batch_size = 32
num_epochs = 50
train_path = Path("/content/train.json")
with open(train_path, "r", encoding="utf-8") as file:
    queries_Categoricos = json.load(file)
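# Expected layout of train.json, as implied by the loop below: each key is a
# category name mapped to a list of query paraphrases. Illustrative example
# (the content shown is hypothetical, not from the source):
# {"tarifa nocturna": ["cuanto cuesta la luz de noche", "precio nocturno"]}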
train_loss = losses.MultipleNegativesRankingLoss(model=model)
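# MultipleNegativesRankingLoss only needs positive pairs: within each batch,
# every other example's second text serves as an implicit negative, and the
# label attached to each InputExample below is ignored by this loss.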
train_examples = []
for category, queries in queries_Categoricos.items():
    # Normalize once per category instead of on every inner iteration
    category = unidecode(category).strip().lower()
    for query in queries:
        query = unidecode(query).strip().lower()
        train_examples.append(InputExample(texts=[category, query], label=1.0))
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
#evaluator = CECorrelationEvaluator.from_input_examples(dev_samples, name='sts-dev')
# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of training steps for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the bi-encoder model (a SentenceTransformer, not a cross-encoder)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          #evaluator=evaluator,
          epochs=num_epochs,
          #evaluation_steps=1000,
          warmup_steps=warmup_steps)
save_path = "./%spasos/paraphrase-multilingual-mpnet-base-v2/model/"%(str(num_epochs))
model.save(save_path)
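# Reloading the fine-tuned model later (a minimal usage sketch; the example
# input string is hypothetical):
#   model = SentenceTransformer(save_path)
#   embeddings = model.encode(["tarifa nocturna"])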
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
!zip "./%sp.zip"%(str(num_epochs)) "/content/%spasos"%(str(num_epochs))