feat: Ciditel option, Google Colab training code, BotCidit
parent 502cf96292
commit 32066fa9fe
@@ -7,7 +7,7 @@ from sentence_transformers import SentenceTransformer
from pathlib import Path
import json
#"paraphrase-multilingual-mpnet-base-v2",'hackathon-pln-es/paraphrase-spanish-distilroberta'
nameModel="Modelo_embedding_Mexico_Puebla_hiiamasid"
nameModel="Modelo_embedding_CIDITEL"
def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",relPath="./conf/experiment_config.json",dataOut="train_dataset_pos"):
    configPath=Path(relPath)
    with open(configPath, 'r', encoding='utf-8') as file:
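The hunk cuts off inside extractConfig; a minimal sketch of how it plausibly finishes, assuming experiment_config.json maps each model name to a dict of fields (the dataOut values used elsewhere in this commit, "path_model" and "pathsqlite", suggest that layout):

        data = json.load(file)  # assumed shape: {nameModel: {dataOut: value}}
    return data[nameModel][dataOut]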
@@ -0,0 +1,53 @@
!pip install sentence_transformers
!pip install unidecode
!pip install langchain
!pip install faiss-cpu

from torch.utils.data import DataLoader
import math
import logging
from unidecode import unidecode
from pathlib import Path
import json
from sentence_transformers import SentenceTransformer, losses, InputExample

model="paraphrase-multilingual-mpnet-base-v2"
model = SentenceTransformer(model)
batch_size = 32
num_epochs = 50
train_path = Path("/content/train.json")
with open(train_path, 'r', encoding='utf-8') as file:
    queries_Categoricos = json.load(file)

train_loss = losses.MultipleNegativesRankingLoss(model=model)
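# Note (added for clarity): MultipleNegativesRankingLoss trains on (anchor, positive)
# pairs and treats the other pairs in each batch as in-batch negatives; the label on
# the InputExamples built below is ignored by this loss, so score = 1.0 is a placeholder.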
train_examples = []
for i in queries_Categoricos.keys():
    for j in queries_Categoricos[i]:
        i=unidecode(i).strip().lower()
        j=unidecode(j).strip().lower()
        score = 1.0
        #print(i)
        train_examples.append(InputExample(texts=[i, j], label=score))
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
#evaluator = CECorrelationEvaluator.from_input_examples(dev_samples, name='sts-dev')

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the sentence-transformer (bi-encoder) model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          #evaluator=evaluator,
          epochs=num_epochs,
          #evaluation_steps=1000,
          warmup_steps=warmup_steps)

save_path = "./%spasos/paraphrase-multilingual-mpnet-base-v2/model/"%(str(num_epochs))
model.save(save_path)

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# %-formatting is not evaluated on a ! line; IPython interpolates {expr} instead,
# and -r is needed so zip recurses into the directory:
!zip -r "./{num_epochs}p.zip" "/content/{num_epochs}pasos"
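A quick sanity check one could run after training (a sketch, assuming the defaults above, so num_epochs = 50 and the model was saved under ./50pasos/...; the example sentences are hypothetical):

from sentence_transformers import SentenceTransformer, util
tuned = SentenceTransformer("./50pasos/paraphrase-multilingual-mpnet-base-v2/model/")
embs = tuned.encode(["horario de atencion", "a que hora abren"])  # hypothetical paraphrase pair
print(util.cos_sim(embs[0], embs[1]))  # expected to be high after fine-tuning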
main.py
@@ -17,28 +17,22 @@ from unidecode import unidecode
from nltk.corpus import stopwords
from langchain.schema.embeddings import Embeddings
from langchain.document_loaders import DataFrameLoader

from general import FinderDbs,loadCopysAndData,loadmodelEmb,makeFaissdb,extractConfig,Response,remove_unwanted
#from langchain import PromptTemplate
# from langchain.document_loaders import TextLoader
# from langchain.text_splitter import CharacterTextSplitter

# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.document_loaders import UnstructuredFileLoader
# from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
# from langchain.document_loaders import UnstructuredURLLoader
# from langchain.document_loaders.csv_loader import CSVLoader
# #from langchain import LLMChain

#
#
# from langchain.embeddings import HuggingFaceEmbeddings

#from cleantext import clean

from general import FinderDbs,loadCopysAndData,loadmodelEmb,makeFaissdb,extractConfig,Response,remove_unwanted

class CustomEmbedding(Embeddings, BaseModel):
    """embedding model with preprocessing"""
@@ -55,8 +49,9 @@ class CustomEmbedding(Embeddings, BaseModel):
    def embed_query(self, text: str) -> List[float]:
        return self._get_embedding(text)

nameModel="Modelo_embedding_Mexico_Puebla_hiiamasid"
nameModel="Modelo_embedding_CIDITEL"
model=extractConfig(nameModel=nameModel,dataOut="path_model")+"/model"
print(model)
entrenamiento="V1.3"
pathsqlite=extractConfig(nameModel=nameModel,dataOut="pathsqlite")
keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics")
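The CustomEmbedding body is mostly outside this hunk; as a minimal sketch, a preprocessing Embeddings wrapper of this shape could look like the following, where _get_embedding and the modelE handle are assumptions not shown in the diff:

from typing import List
from pydantic import BaseModel
from unidecode import unidecode
from langchain.schema.embeddings import Embeddings

class CustomEmbedding(Embeddings, BaseModel):
    """embedding model with preprocessing"""
    def _get_embedding(self, text: str) -> List[float]:
        # assumption: normalize the same way as the training script, then encode
        text = unidecode(text).strip().lower()
        return modelE.encode(text).tolist()  # modelE: SentenceTransformer from loadmodelEmb (assumed)
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self._get_embedding(t) for t in texts]
    def embed_query(self, text: str) -> List[float]:
        return self._get_embedding(text)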