feat: Ciditel option, Google Colab training script, BotCidit

Mario Gil 2024-01-11 09:26:48 -05:00
parent 502cf96292
commit 32066fa9fe
3 changed files with 58 additions and 10 deletions


@@ -7,7 +7,7 @@ from sentence_transformers import SentenceTransformer
 from pathlib import Path
 import json
 #"paraphrase-multilingual-mpnet-base-v2",'hackathon-pln-es/paraphrase-spanish-distilroberta'
-nameModel="Modelo_embedding_Mexico_Puebla_hiiamasid"
+nameModel="Modelo_embedding_CIDITEL"
 def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",relPath="./conf/experiment_config.json",dataOut="train_dataset_pos"):
     configPath=Path(relPath)
     with open(configPath, 'r', encoding='utf-8') as file:
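The file above is evidently general.py, since main.py below imports extractConfig from general. For context, a minimal sketch of how extractConfig and the config file it reads might fit together; the JSON layout and the values shown are assumptions inferred from the calls visible in this commit, not confirmed by the diff:

from pathlib import Path
import json

# Hypothetical conf/experiment_config.json consistent with the calls in this
# commit (layout assumed): each model name maps to a section of named values.
# {
#   "Modelo_embedding_CIDITEL": {"path_model": "./models/ciditel", "pathsqlite": "./data/copys.db"},
#   "SystemData": {"keyantrophics": "<api-key>"}
# }
def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",
                  relPath="./conf/experiment_config.json",
                  dataOut="train_dataset_pos"):
    configPath = Path(relPath)
    with open(configPath, 'r', encoding='utf-8') as file:
        config = json.load(file)
    return config[nameModel][dataOut]  # assumed lookup shape

print(extractConfig(nameModel="Modelo_embedding_CIDITEL", dataOut="path_model") + "/model")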

finetrainCollabversion.py (new file, 53 lines)

@@ -0,0 +1,53 @@
!pip install sentence_transformers
!pip install unidecode
!pip install langchain
!pip install faiss-cpu
from torch.utils.data import DataLoader
import math
import logging
from unidecode import unidecode
from pathlib import Path
import json
from sentence_transformers import SentenceTransformer, losses, InputExample

model_name = "paraphrase-multilingual-mpnet-base-v2"
model = SentenceTransformer(model_name)
batch_size = 32
num_epochs = 50

# train.json maps each base text to a list of paraphrases:
# {"texto base": ["parafrasis 1", "parafrasis 2", ...], ...}
train_path = Path("/content/train.json")
with open(train_path, 'r', encoding='utf-8') as file:
    queries_Categoricos = json.load(file)

train_loss = losses.MultipleNegativesRankingLoss(model=model)
train_examples = []
for i in queries_Categoricos.keys():
    for j in queries_Categoricos[i]:
        # Normalize both sides: strip accents, trim whitespace, lowercase.
        i = unidecode(i).strip().lower()
        j = unidecode(j).strip().lower()
        score = 1.0
        train_examples.append(InputExample(texts=[i, j], label=score))
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
#evaluator = CECorrelationEvaluator.from_input_examples(dev_samples, name='sts-dev')

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the bi-encoder model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          #evaluator=evaluator,
          epochs=num_epochs,
          #evaluation_steps=1000,
          warmup_steps=warmup_steps)
save_path = "./%spasos/paraphrase-multilingual-mpnet-base-v2/model/" % (str(num_epochs))
model.save(save_path)

from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Colab shell lines do not expand Python %-formatting; use {}-interpolation,
# and pass -r since the target is a directory.
!zip -r "./{num_epochs}p.zip" "/content/{num_epochs}pasos"
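Once training finishes, the saved directory can be reloaded for a quick sanity check; a minimal sketch (the example sentences are illustrative, and the path assumes num_epochs = 50 as above):

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("./50pasos/paraphrase-multilingual-mpnet-base-v2/model/")
emb = model.encode(["donde queda el ciditel", "ubicacion del ciditel"],
                   convert_to_tensor=True)
print(util.cos_sim(emb[0], emb[1]))  # should be high for paraphrase pairs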

main.py (13 lines changed)

@@ -17,28 +17,22 @@ from unidecode import unidecode
 from nltk.corpus import stopwords
 from langchain.schema.embeddings import Embeddings
 from langchain.document_loaders import DataFrameLoader
+from general import FinderDbs,loadCopysAndData,loadmodelEmb,makeFaissdb,extractConfig,Response,remove_unwanted
 #from langchain import PromptTemplate
 # from langchain.document_loaders import TextLoader
 # from langchain.text_splitter import CharacterTextSplitter
 # from langchain.text_splitter import RecursiveCharacterTextSplitter
 # from langchain.document_loaders import UnstructuredFileLoader
 # from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
 # from langchain.document_loaders import UnstructuredURLLoader
 # from langchain.document_loaders.csv_loader import CSVLoader
 # #from langchain import LLMChain
-#
-#
 # from langchain.embeddings import HuggingFaceEmbeddings
 #from cleantext import clean
-from general import FinderDbs,loadCopysAndData,loadmodelEmb,makeFaissdb,extractConfig,Response,remove_unwanted
 class CustomEmbedding(Embeddings, BaseModel):
     """embedding model with preprocessing"""
@@ -55,8 +49,9 @@ class CustomEmbedding(Embeddings, BaseModel):
     def embed_query(self, text: str) -> List[float]:
         return self._get_embedding(text)
-nameModel="Modelo_embedding_Mexico_Puebla_hiiamasid"
+nameModel="Modelo_embedding_CIDITEL"
 model=extractConfig(nameModel=nameModel,dataOut="path_model")+"/model"
+print(model)
 entrenamiento="V1.3"
 pathsqlite=extractConfig(nameModel=nameModel,dataOut="pathsqlite")
 keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics")
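The path_model value resolves to a SentenceTransformer checkpoint laid out by finetrainCollabversion.py's save_path; a minimal hedged sketch of loading it directly, presumably what loadmodelEmb wraps (the diff does not show its body):

from sentence_transformers import SentenceTransformer
from general import extractConfig

model_path = extractConfig(nameModel="Modelo_embedding_CIDITEL",
                           dataOut="path_model") + "/model"
emb_model = SentenceTransformer(model_path)
vector = emb_model.encode("hola ciditel")  # illustrative query
print(len(vector))  # 768 for mpnet-base checkpoints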