diff --git a/DownloadModels.py b/DownloadModels.py index 612b29c..9a655f8 100644 --- a/DownloadModels.py +++ b/DownloadModels.py @@ -7,7 +7,7 @@ from sentence_transformers import SentenceTransformer from pathlib import Path import json #"paraphrase-multilingual-mpnet-base-v2",'hackathon-pln-es/paraphrase-spanish-distilroberta' -nameModel="Modelo_embedding_Mexico_Puebla_hiiamasid" +nameModel="Modelo_embedding_CIDITEL" def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",relPath="./conf/experiment_config.json",dataOut="train_dataset_pos"): configPath=Path(relPath) with open(configPath, 'r', encoding='utf-8') as file: diff --git a/finetrainCollabversion.py b/finetrainCollabversion.py new file mode 100644 index 0000000..ed6d6bf --- /dev/null +++ b/finetrainCollabversion.py @@ -0,0 +1,53 @@ + + +!pip install sentence_transformers +!pip install unidecode +!pip install langchain +!pip install faiss-cpu +from torch.utils.data import DataLoader +import math +import logging +from unidecode import unidecode +from pathlib import Path +import json +from sentence_transformers import SentenceTransformer, losses, InputExample +model="paraphrase-multilingual-mpnet-base-v2" +model = SentenceTransformer(model) +batch_size = 32 +num_epochs = 50 +train_path = Path("/content/train.json") +with open(train_path, 'r', encoding='utf-8') as file: + + queries_Categoricos = json.load(file) + +train_loss = losses.MultipleNegativesRankingLoss(model=model) +train_examples = [] +for i in queries_Categoricos.keys(): + + for j in queries_Categoricos[i]: + i=unidecode(i).strip().lower() + j=unidecode(j).strip().lower() + score = 1.0 + #print(i) + train_examples.append(InputExample(texts=[ i,j], label=score)) +train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size) +#evaluator = CECorrelationEvaluator.from_input_examples(dev_samples, name='sts-dev') + +# Configure the training +warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for 
warm-up +logging.info("Warmup-steps: {}".format(warmup_steps)) + +# Train the bi-encoder model +model.fit(train_objectives=[(train_dataloader, train_loss)], + #evaluator=evaluator, + epochs=num_epochs, + #evaluation_steps=1000, + warmup_steps=warmup_steps) + +save_path = "./%spasos/paraphrase-multilingual-mpnet-base-v2/model/"%(str(num_epochs)) +model.save(save_path) + +from google.colab import drive +drive.mount('/content/drive', force_remount=True) + +!zip -r "./{num_epochs}p.zip" "/content/{num_epochs}pasos" \ No newline at end of file diff --git a/main.py b/main.py index 2878a23..3959aa0 100644 --- a/main.py +++ b/main.py @@ -17,28 +17,22 @@ from unidecode import unidecode from nltk.corpus import stopwords from langchain.schema.embeddings import Embeddings from langchain.document_loaders import DataFrameLoader - +from general import FinderDbs,loadCopysAndData,loadmodelEmb,makeFaissdb,extractConfig,Response,remove_unwanted #from langchain import PromptTemplate # from langchain.document_loaders import TextLoader # from langchain.text_splitter import CharacterTextSplitter - # from langchain.text_splitter import RecursiveCharacterTextSplitter # from langchain.document_loaders import UnstructuredFileLoader # from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader # from langchain.document_loaders import UnstructuredURLLoader # from langchain.document_loaders.csv_loader import CSVLoader # #from langchain import LLMChain - -# -# # from langchain.embeddings import HuggingFaceEmbeddings - - #from cleantext import clean -from general import FinderDbs,loadCopysAndData,loadmodelEmb,makeFaissdb,extractConfig,Response,remove_unwanted + class CustomEmbedding(Embeddings, BaseModel): """embedding model with preprocessing""" @@ -55,8 +49,9 @@ class CustomEmbedding(Embeddings, BaseModel): def embed_query(self, text: str) -> List[float]: return self._get_embedding(text) -nameModel="Modelo_embedding_Mexico_Puebla_hiiamasid"
+nameModel="Modelo_embedding_CIDITEL" model=extractConfig(nameModel=nameModel,dataOut="path_model")+"/model" +print(model) entrenamiento="V1.3" pathsqlite=extractConfig(nameModel=nameModel,dataOut="pathsqlite") keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics")