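"""Embedding-model evaluation script.

Loads labeled copies from SQLite, builds FAISS indexes over a
SentenceTransformer-backed custom embedding, measures cosine distances
between validation queries and copies, and writes violin plots and CSV
summaries of the results.
"""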
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.pydantic_v1 import BaseModel
from langchain.schema.embeddings import Embeddings
from sentence_transformers import SentenceTransformer

from typing import List
from pathlib import Path
import argparse
import json
import os
import re
import shutil
import sqlite3
import time

import pandas as pd
import seaborn as sns
from scipy.spatial import distance
from unidecode import unidecode

import nltk
from nltk.corpus import stopwords

# Make sure the stopword lists are available before any call to
# remove_unwanted(); in the original this ran near the end of the file,
# after the first use.
nltk.download('stopwords')

parser = argparse.ArgumentParser()
parser.add_argument("-f", "--file", help="Name of the file to process")
parser.add_argument("-d", "--distance", default="distance")
parser.add_argument("-m", "--models", default="All")
args = parser.parse_args()

def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla", relPath="./conf/experiment_config.json", dataOut="train_dataset_pos"):
    """Read one value from the per-model section of the experiment config."""
    configPath = Path(relPath)
    with open(configPath, 'r', encoding='utf-8') as file:
        config = json.load(file)[nameModel]
    # dataOut may be a [section, key] pair or a plain key.
    if isinstance(dataOut, list) and len(dataOut) == 2:
        Output = config[dataOut[0]][dataOut[1]]
    else:
        Output = config[dataOut]
    return Output

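# The keys read through extractConfig() below imply a config file shaped
# roughly like the following sketch; only the key names come from this
# script, the values are hypothetical:
#
# {
#   "Modelo_embedding_Mexico_Puebla_hiiamasid": {
#     "pathsqlite": "data/copies.db",
#     "valid_dataset": "data/valid.json",
#     "valid_dataset_Class": "data/valid_class.json",
#     "base_model": "all-MiniLM-L6-v2",
#     "path_model": "models/finetuned"
#   }
# }
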
# if args.file:
#     print("The file name to process is: ", )

class CustomEmbedding(Embeddings, BaseModel):
    """Embedding model with text preprocessing.

    Relies on the module-level SentenceTransformer `emb`, which must be set
    (via loadmodelEmb) before any embedding call.
    """

    def _get_embedding(self, text) -> List[float]:
        # Strip punctuation and Spanish stopwords before encoding.
        text = remove_unwanted(text, punctuationOK=True, stopOK=True)
        return emb.encode(text)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self._get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return self._get_embedding(text)

def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)

def remove_unwanted(document, stopOK=False, punctuationOK=False, xtrasOK=False, emojiOk=False, unidecodeOK=False):
    if punctuationOK:
        # Replace punctuation with spaces.
        for sig in [".", ",", "!", "¿", "?", "=", "(", ")"]:
            document = document.replace(sig, " ")

    if xtrasOK:
        # Remove user mentions, URLs, and hashtags.
        document = re.sub("@[A-Za-z0-9_]+", " ", document)
        document = re.sub(r'http\S+', ' ', document)
        document = re.sub("#[A-Za-z0-9_]+", "", document)

    if emojiOk:
        document = remove_emoji(document)

    if unidecodeOK:
        document = unidecode(document)

    if stopOK:
        # Drop Spanish stopwords.
        words = document.split(" ")
        stop_words = set(stopwords.words('spanish'))
        words = [w for w in words if w not in stop_words]
        document = " ".join(words)

    # Collapse double spaces. The original replaced single spaces with "",
    # which deleted every space in the text; this follows the intent of the
    # original "remove double spaces" comment instead.
    document = document.replace('  ', ' ')
    return document.strip().lower()

def loadmodelEmb(model_name="embeddings/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'}):
    # model_kwargs is accepted for API symmetry but is not used here.
    return SentenceTransformer(model_name)

pathsqlite = extractConfig(dataOut="pathsqlite")


def loadCopysAndData(pathsqlite=pathsqlite):
    """Load labeled copies from SQLite and wrap them as LangChain documents."""
    con = sqlite3.connect(pathsqlite)
    copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con)
    data = copies_df[["copy_message", "id", "name", "intentionality"]]
    # Two document sets over the same rows: one keyed on the copy text,
    # one keyed on the intentionality label.
    documents = DataFrameLoader(data, page_content_column="copy_message").load()
    documents2 = DataFrameLoader(data, page_content_column="intentionality").load()
    return documents, documents2

def makeFaissdb(documents, folder_path, embedding):
    """Build a FAISS index over the documents and persist it, replacing any previous copy."""
    shutil.rmtree(folder_path, ignore_errors=True)
    db = FAISS.from_documents(documents, embedding)
    db.save_local(folder_path=folder_path)
    return db

def FinderDbs(query, dbs, filtred=False, th=5000):
    """Search every index for the query and merge the hits by document id."""
    AllData = {}
    for dbt in dbs:
        Sal = dbt.similarity_search_with_score(query, 4)
        for output in Sal:
            doc_id = output[0].metadata["id"]
            if doc_id in AllData:
                # An id retrieved by more than one index keeps the best
                # distance, with a small 0.1 bonus for the repeated hit.
                AllData[doc_id]["d"] = min(AllData[doc_id]["d"] - 0.1, output[1] - 0.1)
            else:
                AllData[doc_id] = {"d": output[1], "page_content": output[0].page_content}

    if filtred:
        # Keep only hits under the distance threshold, sorted by distance.
        filtredData = {row: AllData[row] for row in AllData if AllData[row]["d"] < th}
        filtredData = dict(sorted(filtredData.items(), key=lambda item: item[1]["d"]))
        return filtredData, filtredData.keys()
    else:
        AllData = dict(sorted(AllData.items(), key=lambda item: item[1]["d"]))
        return AllData, AllData.keys()

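# FinderDbs returns (results, ids): results maps each document id to its best
# merged distance and text. A sketch with made-up values:
#   ({42: {"d": 0.83, "page_content": "..."}, 7: {"d": 1.05, "page_content": "..."}},
#    dict_keys([42, 7]))
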
nameModel = "Modelo_embedding_Mexico_Puebla_hiiamasid"
valid_path = extractConfig(nameModel=nameModel, dataOut="valid_dataset")
baseModel = extractConfig(nameModel=nameModel, dataOut="base_model")
path_model = extractConfig(nameModel=nameModel, dataOut="path_model")
with open(valid_path, 'r', encoding='utf-8') as file:
    queries_Categoricos = json.load(file)
models = ["./" + path_model]
copies_text = queries_Categoricos.keys()

# exist_ok=True already makes this a no-op when the directory exists.
os.makedirs("./%s/Sta" % (path_model), exist_ok=True)

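# queries_Categoricos is used below as a mapping from each copy text to its
# list of validation queries, e.g. (hypothetical values):
#   {"copy text A": ["query 1", "query 2"], "copy text B": ["query 3"]}
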
def plotVioin(Sal, Listqueries):
    """Violin plot of query/copy distances per model; also dumps the raw data."""
    NewData = pd.DataFrame.from_dict(Sal)
    ax = sns.violinplot(data=NewData, x="model", y="distance", hue="copy_test",
                        fill=False, inner=None, width=0.5)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    fig = ax.get_figure()
    fig.set_size_inches(17.7, 12.27)
    fig.savefig('./%s/%s/Sta/sns_violin_plot%s.png' % (nameModel, baseModel, Listqueries), dpi=300)
    NewData.to_csv("./%s/%s/Sta/NewData%s.csv" % (nameModel, baseModel, Listqueries))

def plotViointime(Sal, Listqueries):
    """Violin plot of per-query embedding times per model; also dumps the raw data."""
    NewData = pd.DataFrame.from_dict(Sal)
    ax = sns.violinplot(data=NewData, x="model", y="time",
                        fill=False, inner=None, width=0.5)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    fig = ax.get_figure()
    fig.set_size_inches(17.7, 12.27)
    fig.savefig('./%s/%s/Sta/sns_violin_plot_time%s.png' % (nameModel, baseModel, Listqueries), dpi=300)
    NewData.to_csv("./%s/%s/Sta/NewData%s.csv" % (nameModel, baseModel, Listqueries))

def queries_CatPlot(Listqueries):
    """Measure query-vs-copy cosine distances for every model and plot them."""
    Sal = []
    queries = queries_Categoricos[Listqueries]
    for model in models:
        for copy_text in copies_text:
            # CustomEmbedding reads the module-level `emb`, so it is swapped
            # here for each model under test.
            global emb
            emb = loadmodelEmb(model_name=model)
            emb2 = CustomEmbedding()
            emb2.embed_query("test 123321")  # warm-up call
            for query in queries:
                t = time.time()
                Sal.append({"model": model,
                            "query": query,
                            "copy_test": copy_text,
                            "distance": distance.cosine(emb2.embed_query(query), emb2.embed_query(copy_text)),
                            "time": time.time() - t})
    if args.distance == "distance":
        plotVioin(Sal, Listqueries)
    if args.distance == "time":
        plotViointime(Sal, Listqueries)

def queries_CatSta():
    """Per-copy distance statistics for each model: insider queries (the
    copy's own), other copies (outsider_n1), and their queries (outsider_n2).
    Writes one CSV per copy."""
    Sal = []
    for model in models:
        for copy_text in copies_text:
            # Swap the module-level SentenceTransformer for this model.
            global emb
            emb = loadmodelEmb(model_name=model + "/model")
            emb2 = CustomEmbedding()
            emb2.embed_query("test 123321")  # warm-up call
            Sal = []  # reset per copy: each CSV holds one copy's rows
            for query in queries_Categoricos[copy_text]:
                t = time.time()
                Sal.append({"model": model,
                            "query": query,
                            "type": "insider",
                            "copy_test": copy_text,
                            "distance": distance.cosine(emb2.embed_query(query), emb2.embed_query(copy_text)),
                            "time": time.time() - t})
            outdata = set(queries_Categoricos.keys())
            outdata.remove(copy_text)
            for query in outdata:
                t = time.time()
                Sal.append({"model": model,
                            "query": query,
                            "type": "outsider_n1",
                            "copy_test": copy_text,
                            "distance": distance.cosine(emb2.embed_query(query), emb2.embed_query(copy_text)),
                            "time": time.time() - t})
                for query2 in queries_Categoricos[query]:
                    t = time.time()
                    Sal.append({"model": model,
                                "query": query2,
                                "type": "outsider_n2",
                                "copy_test": copy_text,
                                "distance": distance.cosine(emb2.embed_query(query2), emb2.embed_query(copy_text)),
                                "time": time.time() - t})
            df = pd.DataFrame(Sal)
            df.to_csv("./%s/Sta/NewData%s.csv" % (path_model, copy_text[0:50]))
    return Sal

def queries_CatSta_in(queries_Categoricos, model="embeddings/all-mpnet-base-v2"):
    """Distances between each objective and its own (insider) queries."""
    global emb
    emb = loadmodelEmb(model_name=model)
    emb2 = CustomEmbedding()
    emb2.embed_query("test 123321")  # warm-up call
    Sal = []
    for objetive in queries_Categoricos.keys():
        for query in queries_Categoricos[objetive]:
            t = time.time()
            Sal.append({"model": model,
                        "query": query,
                        "type": "insider",
                        "objetive": objetive,
                        "distance": distance.cosine(emb2.embed_query(query), emb2.embed_query(objetive)),
                        "time": time.time() - t})
    return Sal

def queries_CatSta_out(queries_Categoricos, model="embeddings/all-mpnet-base-v2"):
    """Distances between each objective and the queries of every other objective."""
    global emb
    emb = loadmodelEmb(model_name=model)
    emb2 = CustomEmbedding()
    emb2.embed_query("test 123321")  # warm-up call
    Sal = []
    for objetive in queries_Categoricos.keys():
        outdata = set(queries_Categoricos.keys())
        outdata.remove(objetive)
        for outdataObj in list(outdata):
            for query in queries_Categoricos[outdataObj]:
                t = time.time()
                Sal.append({"model": model,
                            "query": query,
                            "type": "outsider",
                            "objetive": objetive,
                            "distance": distance.cosine(emb2.embed_query(query), emb2.embed_query(objetive)),
                            "time": time.time() - t})
    return Sal

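# queries_CatSta_in / queries_CatSta_out are defined above but never invoked
# in this script; a plausible (hypothetical) use would be to concatenate
# their rows and dump them for analysis:
#   rows = queries_CatSta_in(queries_Categoricos) + queries_CatSta_out(queries_Categoricos)
#   pd.DataFrame(rows).to_csv("in_out_distances.csv")
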
# Run the per-copy distance statistics for the configured model.
queries_CatSta()

def evalDb(text, dbs):
    """Query the indexes and return the distances and ids of the merged hits."""
    # The original passed filtred=5; any truthy value enables filtering.
    AllData = FinderDbs(text, dbs, filtred=True)
    print(AllData)  # debug output
    if AllData:
        AllData = list(AllData)
        dis = []
        ids = []
        for i in AllData[0].items():
            dis.append(str(i[1]['d']))
            ids.append(i[0])
        return dis, ids

def EvalClass(dbs):
    """Top-1 / top-8 retrieval accuracy over the labeled validation set."""
    valid_path = Path(extractConfig(dataOut="valid_dataset_Class"))
    with open(valid_path, 'r', encoding='utf-8') as file:
        queries_Categoricos = json.load(file)
    Sal = []
    for i in queries_Categoricos.keys():
        for j in queries_Categoricos[i]:
            i = unidecode(i).strip().lower()
            j = unidecode(j).strip().lower()
            dis, ids = evalDb(j, dbs)
            Top8 = 0
            Top1 = 0
            Distancia = 99  # sentinel when the true id is not ranked first
            if int(i) in ids:
                Top8 = 1
            try:
                if int(i) == ids[0]:
                    Top1 = 1
                    Distancia = dis[0]
            except IndexError:
                pass
            Sal.append([j, i, Top8, Top1, Distancia])
    df = pd.DataFrame(Sal, columns=['query', 'IdDb', 'Top8', 'Top1', 'dist'])
    df.to_csv("./%s/Sta/EvalClass.csv" % (path_model))

# queries_CatPlot(copies_text)

# Build both FAISS indexes with the fine-tuned model and run the
# classification evaluation.
model = models[0]
documents, documents2 = loadCopysAndData()
emb = loadmodelEmb(model_name=model + "/model")
emb2 = CustomEmbedding()
db = makeFaissdb(documents, "Copies3", emb2)
db2 = makeFaissdb(documents2, "Intentionality3", emb2)
EvalClass([db, db2])