diff --git a/main.py b/main.py index 2f745c9..6e94f03 100644 --- a/main.py +++ b/main.py @@ -24,12 +24,23 @@ from nltk.corpus import stopwords from typing import Optional #from cleantext import clean import re -from langid.langid import LanguageIdentifier -from langid.langid import model as modellangid +from pathlib import Path +import json +#from langid.langid import LanguageIdentifier +#from langid.langid import model as modellangid import time model="Modelo_embedding_Mexico_Puebla/all-mpnet-base-v2/model" entrenamiento="V1.3" +def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",relPath="./conf/experiment_config.json",dataOut="train_dataset_pos"): + configPath=Path(relPath) + with open(configPath, 'r', encoding='utf-8') as file: + config = json.load(file)[nameModel] + + Output= Path(config[dataOut]) + return Output + + class CustomEmbedding(Embeddings, BaseModel,): @@ -100,12 +111,12 @@ def loadmodelEmb(model_name = "embeddings/all-MiniLM-L6-v2",model_kwargs = {'dev return st -def loadCopysAndData(pathsqlite="/opt/web2py/applications/MotorAngela/databases/storage.sqlite"): +def loadCopysAndData(pathsqlite=pathDb): con = sqlite3.connect(pathsqlite) copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con) copiesT = copies_df copiesT=copiesT[["copy_message","id","name","intentionality"]] - print(copiesT) + #print(copiesT) data = copiesT #print(data) B=DataFrameLoader(data,page_content_column="copy_message") @@ -116,8 +127,7 @@ def loadCopysAndData(pathsqlite="/opt/web2py/applications/MotorAngela/databases/ def makeFaissdb(documents,folder_path,embedding): try: - db=FAISS.load_local(folder_path=folder_path,embeddings=embedding) - + db=FAISS.load_local(folder_path=folder_path,embeddings=embedding) except: db = FAISS.from_documents(documents, embedding) FAISS.save_local(db,folder_path=folder_path) @@ -131,7 +141,7 @@ emb2=CustomEmbedding() db=makeFaissdb(documents,"Copies3",emb2) db2=makeFaissdb(documents2,"Intentionality3",emb2) #db3=makeFaissdb(documents2,"nameshf",hf) -identifier = LanguageIdentifier.from_modelstring(modellangid, norm_probs=True) +#identifier = LanguageIdentifier.from_modelstring(modellangid, norm_probs=True)