feat: Option Ciditel, code for Google Colab, BotCidit

parent 502cf96292
commit 32066fa9fe
@@ -7,7 +7,7 @@ from sentence_transformers import SentenceTransformer
 from pathlib import Path
 import json
 #"paraphrase-multilingual-mpnet-base-v2",'hackathon-pln-es/paraphrase-spanish-distilroberta'
-nameModel="Modelo_embedding_Mexico_Puebla_hiiamasid"
+nameModel="Modelo_embedding_CIDITEL"
 def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",relPath="./conf/experiment_config.json",dataOut="train_dataset_pos"):
     configPath=Path(relPath)
     with open(configPath, 'r', encoding='utf-8') as file:
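Note: the hunk above shows only the first lines of extractConfig. A minimal sketch of how it plausibly continues, assuming the config file maps each model name to a dict of per-run keys such as "path_model" and "pathsqlite" (key names taken from calls elsewhere in this commit; the return line is an assumption, not code from the diff):

from pathlib import Path
import json

def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",
                  relPath="./conf/experiment_config.json",
                  dataOut="train_dataset_pos"):
    configPath = Path(relPath)
    with open(configPath, 'r', encoding='utf-8') as file:
        config = json.load(file)
    # Assumed layout: one JSON object per model name, holding the requested key.
    return config[nameModel][dataOut]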
@@ -0,0 +1,53 @@
+!pip install sentence_transformers
+!pip install unidecode
+!pip install langchain
+!pip install faiss-cpu
+
+from torch.utils.data import DataLoader
+import math
+import logging
+from unidecode import unidecode
+from pathlib import Path
+import json
+from sentence_transformers import SentenceTransformer, losses, InputExample
+
+model = "paraphrase-multilingual-mpnet-base-v2"
+model = SentenceTransformer(model)
+batch_size = 32
+num_epochs = 50
+
+train_path = Path("/content/train.json")
+with open(train_path, 'r', encoding='utf-8') as file:
+    queries_Categoricos = json.load(file)
+
+train_loss = losses.MultipleNegativesRankingLoss(model=model)
+train_examples = []
+for i in queries_Categoricos.keys():
+    for j in queries_Categoricos[i]:
+        # Normalize both sides of each pair: strip accents, whitespace, case.
+        i = unidecode(i).strip().lower()
+        j = unidecode(j).strip().lower()
+        score = 1.0
+        train_examples.append(InputExample(texts=[i, j], label=score))
+train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
+#evaluator = CECorrelationEvaluator.from_input_examples(dev_samples, name='sts-dev')
+
+# Configure the training
+warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
+logging.info("Warmup-steps: {}".format(warmup_steps))
+
+# Train the sentence-embedding (bi-encoder) model
+model.fit(train_objectives=[(train_dataloader, train_loss)],
+          #evaluator=evaluator,
+          epochs=num_epochs,
+          #evaluation_steps=1000,
+          warmup_steps=warmup_steps)
+
+save_path = "./%spasos/paraphrase-multilingual-mpnet-base-v2/model/" % (str(num_epochs))
+model.save(save_path)
+
+from google.colab import drive
+drive.mount('/content/drive', force_remount=True)
+
+# IPython expands {num_epochs} before the shell runs; the original "%s" % (...) form was passed to the shell unformatted.
+!zip -r ./{num_epochs}p.zip /content/{num_epochs}pasos
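Note: the loop above assumes /content/train.json holds a dict mapping each canonical query to a list of paraphrases. A minimal illustration of that shape (the category and phrasings here are hypothetical):

import json

# Hypothetical example of the expected train.json structure.
queries_Categoricos = {
    "licencia de conducir": [
        "donde tramito la licencia de conducir",
        "renovar mi licencia"
    ]
}
with open("train.json", "w", encoding="utf-8") as f:
    json.dump(queries_Categoricos, f, ensure_ascii=False)

Each (key, paraphrase) pair becomes one InputExample. MultipleNegativesRankingLoss treats the other pairs in a batch as in-batch negatives and ignores the label, so the score of 1.0 is effectively unused.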
main.py (13 changed lines)
@@ -17,28 +17,22 @@ from unidecode import unidecode
 from nltk.corpus import stopwords
 from langchain.schema.embeddings import Embeddings
 from langchain.document_loaders import DataFrameLoader
+from general import FinderDbs,loadCopysAndData,loadmodelEmb,makeFaissdb,extractConfig,Response,remove_unwanted
 #from langchain import PromptTemplate
 # from langchain.document_loaders import TextLoader
 # from langchain.text_splitter import CharacterTextSplitter
 
 # from langchain.text_splitter import RecursiveCharacterTextSplitter
 # from langchain.document_loaders import UnstructuredFileLoader
 # from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
 # from langchain.document_loaders import UnstructuredURLLoader
 # from langchain.document_loaders.csv_loader import CSVLoader
 # #from langchain import LLMChain
 
-#
-#
 # from langchain.embeddings import HuggingFaceEmbeddings
-
-
 #from cleantext import clean
-
-
 
-from general import FinderDbs,loadCopysAndData,loadmodelEmb,makeFaissdb,extractConfig,Response,remove_unwanted
 
 
 class CustomEmbedding(Embeddings, BaseModel):
     """embedding model with preprocessing"""
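Note: CustomEmbedding is documented only as an "embedding model with preprocessing", and the body of _get_embedding is not part of this diff. A sketch of the likely idea, assuming queries must be normalized the same way the training pairs were (unidecode + strip + lower) before encoding:

from typing import List
from unidecode import unidecode
from sentence_transformers import SentenceTransformer

def normalize(text: str) -> str:
    # Mirror the normalization used when building the training pairs.
    return unidecode(text).strip().lower()

def get_embedding_sketch(model: SentenceTransformer, text: str) -> List[float]:
    # Encode the normalized query; encode() returns a numpy array.
    return model.encode(normalize(text)).tolist()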
@@ -55,8 +49,9 @@ class CustomEmbedding(Embeddings, BaseModel):
     def embed_query(self, text: str) -> List[float]:
         return self._get_embedding(text)
 
-nameModel="Modelo_embedding_Mexico_Puebla_hiiamasid"
+nameModel="Modelo_embedding_CIDITEL"
 model=extractConfig(nameModel=nameModel,dataOut="path_model")+"/model"
+print(model)
 entrenamiento="V1.3"
 pathsqlite=extractConfig(nameModel=nameModel,dataOut="pathsqlite")
 keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics")
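Note: a hedged usage sketch of the resolved model path, assuming "path_model" in the config points at the directory the Colab script saved into:

from sentence_transformers import SentenceTransformer

path_model = extractConfig(nameModel="Modelo_embedding_CIDITEL",
                           dataOut="path_model") + "/model"
encoder = SentenceTransformer(path_model)
vector = encoder.encode("donde tramito la licencia")  # hypothetical query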