LLm2Node/metrics.py

390 lines
13 KiB
Python

from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.pydantic_v1 import BaseModel
from langchain.schema.embeddings import Embeddings
from sentence_transformers import SentenceTransformer
from typing import List
import sqlite3
import pandas as pd
import shutil
import re
import numpy as np
import inspect
import time
from unidecode import unidecode
from nltk.corpus import stopwords
import seaborn as sns
import argparse
from scipy.spatial import distance
from pathlib import Path
import json
import os
from nltk.corpus import stopwords
import nltk
# Command-line options. They are parsed at module level, so sys.argv is read
# as soon as this file is executed or imported.
parser = argparse.ArgumentParser()
# -f/--file: the help text is Spanish for "Name of the file to process".
# NOTE(review): args.file is not read anywhere in the visible source.
parser.add_argument("-f", "--file", help="Nombre de archivo a procesar")
# -d/--distance: queries_CatPlot compares this against "distance" and "time"
# to decide which violin plot to draw.
parser.add_argument("-d", "--distance", default="distance")
# -m/--models: NOTE(review): args.models is not read anywhere in the visible source.
parser.add_argument("-m", "--models", default="All")
args = parser.parse_args()
def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",relPath="./conf/experiment_config.json",dataOut="train_dataset_pos"):
    """Read one value from the JSON experiment configuration.

    Args:
        nameModel: Top-level key of the JSON document selecting the model entry.
        relPath: Path of the JSON configuration file.
        dataOut: Either a single key of the model entry, or a list/tuple of
            keys that is followed level by level into nested mappings
            (``["a", "b"]`` returns ``entry["a"]["b"]``).

    Returns:
        The value stored under the requested key or key path.

    Raises:
        OSError: If the configuration file cannot be opened.
        KeyError: If ``nameModel`` or any requested key is missing.
    """
    with open(Path(relPath), 'r', encoding='utf-8') as file:
        config = json.load(file)[nameModel]

    if isinstance(dataOut, (list, tuple)):
        # Walk the key path one level at a time; any depth is accepted.
        value = config
        for key in dataOut:
            value = value[key]
        return value

    return config[dataOut]
# if args.file:
# print ("El nombre de archivo a procesar es: ", )
class CustomEmbedding(Embeddings, BaseModel,):
    """Embedding adapter that cleans every text before encoding it.

    Encoding is delegated to the module-level ``emb`` object, so ``emb`` must
    be bound (see ``loadmodelEmb``) before any method here is called.
    """

    def _get_embedding(self, text) -> List[float]:
        """Clean *text* (punctuation and stop words removed) and encode it with ``emb``."""
        cleaned = remove_unwanted(text, punctuationOK=True, stopOK=True)
        return emb.encode(cleaned)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding per entry of *texts*, in the same order."""
        return [self._get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding of a single query string."""
        return self._get_embedding(text)
# Character class covering the code-point ranges treated as emoji.
# NOTE: the last range (U+24C2..U+1F251) is very wide and also contains many
# non-emoji characters that lie between those code points.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)


def remove_emoji(string):
    """Replace every run of matched emoji characters in *string* with one space."""
    return _EMOJI_PATTERN.sub(r' ', string)
def remove_unwanted(document,stopOK=False,punctuationOK=False,xtrasOK=False, emojiOk=False, unidecodeOK=False):
    """Normalize a text before it is embedded.

    Each flag enables one optional cleaning step; the steps run in the order
    listed below. Regardless of the flags, runs of whitespace are collapsed to
    a single space, surrounding whitespace is dropped and the result is
    lower-cased.

    Args:
        document: Text to clean.
        stopOK: Drop Spanish stop words (NLTK ``stopwords`` corpus). The
            comparison is done before lower-casing, so it is case-sensitive.
        punctuationOK: Replace ``. , ! ¿ ? = ( )`` with spaces.
        xtrasOK: Replace @mentions and URLs with a space and delete #hashtags.
        emojiOk: Replace emoji with spaces via ``remove_emoji``.
        unidecodeOK: Transliterate the text with ``unidecode``.

    Returns:
        The cleaned, lower-cased text with words separated by single spaces.
    """
    if punctuationOK:
        # remove punctuation
        for sig in [".", ",", "!", "¿", "?", "=", "(", ")"]:
            document = document.replace(sig, " ")
    if xtrasOK:
        # remove user mentions
        document = re.sub("@[A-Za-z0-9_]+", " ", document)
        # remove URLs
        document = re.sub(r'http\S+', ' ', document)
        # remove hashtags
        document = re.sub("#[A-Za-z0-9_]+", "", document)
    if emojiOk:
        document = remove_emoji(document)
    if unidecodeOK:
        document = unidecode(document)
    if stopOK:
        stop_words = set(stopwords.words('spanish'))
        document = " ".join(w for w in document.split(" ") if w not in stop_words)
    # Collapse whitespace runs to ONE space. Deleting the spaces instead would
    # glue neighbouring words together ("hola, mundo" -> "holamundo") once
    # punctuation has been turned into blanks.
    return " ".join(document.split()).lower()
def loadmodelEmb(model_name = "embeddings/all-MiniLM-L6-v2",model_kwargs = {'device': 'cpu'}):
    """Instantiate the SentenceTransformer stored at / named ``model_name``.

    Args:
        model_name: Model identifier or local path handed to
            ``SentenceTransformer``.
        model_kwargs: Accepted for call compatibility; this function does not
            read it, so it has no effect on the loaded model.

    Returns:
        The loaded ``SentenceTransformer`` instance.
    """
    return SentenceTransformer(model_name)
# Location of the SQLite database, taken from the experiment configuration.
pathsqlite=extractConfig(dataOut="pathsqlite")


def loadCopysAndData(pathsqlite=pathsqlite):
    """Load the ``copies`` table and expose it as two document collections.

    Only rows whose ``intentionality`` is not NULL are read. Both collections
    are built from the same rows restricted to the columns ``copy_message``,
    ``id``, ``name`` and ``intentionality``; they differ in which column is
    used as the page content.

    Args:
        pathsqlite: Path of the SQLite database file.

    Returns:
        ``(documents, documents2)`` where ``documents`` uses ``copy_message``
        as page content and ``documents2`` uses ``intentionality``.
    """
    con = sqlite3.connect(pathsqlite)
    try:
        copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con)
    finally:
        # Release the database handle even when the query fails.
        con.close()

    data = copies_df[["copy_message","id","name","intentionality"]]
    documents = DataFrameLoader(data, page_content_column="copy_message").load()
    documents2 = DataFrameLoader(data, page_content_column="intentionality").load()
    return documents, documents2
def makeFaissdb(documents,folder_path,embedding):
    """Build a FAISS index from *documents* and persist it to *folder_path*.

    Any index previously stored in ``folder_path`` is deleted first, so the
    folder only ever holds the freshly built index.

    Args:
        documents: Documents to index.
        folder_path: Directory the index is saved to.
        embedding: Embedding object used to vectorize the documents.

    Returns:
        The in-memory FAISS store that was just built and saved.
    """
    # Best-effort cleanup: a missing folder (first run) or other filesystem
    # error is ignored, but interrupts and programming errors are not swallowed.
    shutil.rmtree(folder_path, ignore_errors=True)
    db = FAISS.from_documents(documents, embedding)
    db.save_local(folder_path=folder_path)
    return db
def FinderDbs(query,dbs,filtred=False,th=5000):
    """Search every store in *dbs* and merge the hits by document id.

    Each store is asked for its 4 best matches. The first time an id is seen
    its score and page content are recorded. Every later hit for the same id
    lowers the stored score to ``min(stored - 0.1, new - 0.1)``, so ids that
    are found more than once are ranked better.

    Args:
        query: Text to search for.
        dbs: Iterable of stores exposing ``similarity_search_with_score``.
        filtred: When truthy, keep only entries whose score is below ``th``.
        th: Score threshold used when ``filtred`` is truthy.

    Returns:
        ``(ranked, ranked.keys())`` where ``ranked`` maps id to
        ``{"d": score, "page_content": text}`` ordered by ascending score.
    """
    hits = {}
    for store in dbs:
        for hit in store.similarity_search_with_score(query, 4):
            doc, score = hit[0], hit[1]
            doc_id = doc.metadata["id"]
            if doc_id not in hits:
                hits[doc_id] = {"d": score, "page_content": doc.page_content}
                continue
            # Repeated id: reward it with a 0.1 bonus on the better score.
            hits[doc_id]["d"] = min(hits[doc_id]["d"] - 0.1, score - 0.1)

    if filtred:
        hits = {key: entry for key, entry in hits.items() if entry["d"] < th}

    ranked = dict(sorted(hits.items(), key=lambda pair: pair[1]["d"]))
    return ranked, ranked.keys()
# Configuration entry that drives this run.
nameModel="Modelo_embedding_Mexico_Puebla_hiiamasid"

# Settings read from that entry of the experiment configuration.
valid_path = extractConfig(nameModel=nameModel, dataOut="valid_dataset")
baseModel = extractConfig(nameModel=nameModel, dataOut="base_model")
path_model = extractConfig(nameModel=nameModel, dataOut="path_model")

# Only the configured model folder is evaluated.
models = ["./"+path_model]
#print(1111,models)

# Validation data: a JSON object whose keys are later used as the copy texts
# and whose values are iterated as the queries belonging to each key.
with open(valid_path, 'r', encoding='utf-8') as file:
    queries_Categoricos = json.load(file)
copies_text = queries_Categoricos.keys()

# Make sure the statistics folder exists; a failure to create it is ignored here.
try:
    os.makedirs("./%s/Sta"%(path_model), exist_ok = True)
except OSError:
    pass
def plotVioin(Sal,Listqueries):
    """Draw a violin plot of distance per model and save it with its data.

    The records in *Sal* are turned into a DataFrame, plotted with ``model``
    on the x axis, ``distance`` on the y axis and one hue per ``copy_test``.
    The figure is written as PNG and the DataFrame as CSV under
    ``./<nameModel>/<baseModel>/Sta/``; that folder is not created here.

    Args:
        Sal: Records (dicts) with at least ``model``, ``distance`` and
            ``copy_test`` entries.
        Listqueries: Value interpolated into both output file names.
    """
    frame = pd.DataFrame.from_dict(Sal)
    ax = sns.violinplot(
        data=frame,
        x="model",
        y="distance",
        hue="copy_test",
        fill=False,
        inner=None,
        width=0.5,
    )
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')

    figure = ax.get_figure()
    figure.set_size_inches(17.7, 12.27)

    name_parts = (nameModel, baseModel, Listqueries)
    figure.savefig('./%s/%s/Sta/sns_violin_plot%s.png' % name_parts, dpi=300)
    frame.to_csv("./%s/%s/Sta/NewData%s.csv" % name_parts)
def plotViointime(Sal,Listqueries):
    """Draw a violin plot of elapsed time per model and save it with its data.

    The records in *Sal* are turned into a DataFrame and plotted with
    ``model`` on the x axis and ``time`` on the y axis. The figure is written
    as PNG and the DataFrame as CSV under ``./<nameModel>/<baseModel>/Sta/``;
    that folder is not created here. The CSV file name is the same one
    ``plotVioin`` writes to.

    Args:
        Sal: Records (dicts) with at least ``model`` and ``time`` entries.
        Listqueries: Value interpolated into both output file names.
    """
    frame = pd.DataFrame.from_dict(Sal)
    ax = sns.violinplot(
        data=frame,
        x="model",
        y="time",
        fill=False,
        inner=None,
        width=0.5,
    )
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')

    figure = ax.get_figure()
    figure.set_size_inches(17.7, 12.27)

    name_parts = (nameModel, baseModel, Listqueries)
    figure.savefig('./%s/%s/Sta/sns_violin_plot_time%s.png' % name_parts, dpi=300)
    frame.to_csv("./%s/%s/Sta/NewData%s.csv" % name_parts)
def queries_CatPlot(Listqueries):
    """Measure query-to-copy distances for one query group and plot them.

    For every model in ``models`` and every copy text in ``copies_text`` the
    cosine distance between each query of the selected group and the copy text
    is computed, together with the time the two embeddings and the distance
    took. Depending on the ``--distance`` command-line option the records are
    passed to ``plotVioin`` ("distance") or ``plotViointime`` ("time").

    Side effect: rebinds the module-level ``emb`` used by ``CustomEmbedding``.

    Args:
        Listqueries: Key of ``queries_Categoricos`` selecting the queries;
            also used in the output file names.
    """
    global emb
    Sal = []
    queries = queries_Categoricos[Listqueries]
    for model in models:
        # Load each model once; it does not depend on the copy being compared.
        emb = loadmodelEmb(model_name=model)
        emb2 = CustomEmbedding()
        # Warm-up call so the first timed query does not pay start-up costs.
        emb2.embed_query("test 123321")
        for copy_text in copies_text:
            for query in queries:
                t = time.time()
                dist = distance.cosine(emb2.embed_query(query), emb2.embed_query(copy_text))
                Sal.append({
                    "model": model,
                    "query": query,
                    "copy_test": copy_text,
                    "distance": dist,
                    "time": time.time() - t,
                })
    if args.distance == "distance":
        plotVioin(Sal, Listqueries)
    elif args.distance == "time":
        plotViointime(Sal, Listqueries)
def _cat_sta_record(embedder, model, query, kind, copy_text):
    """Build one result row: cosine distance of *query* to *copy_text* plus elapsed time."""
    start = time.time()
    dist = distance.cosine(embedder.embed_query(query), embedder.embed_query(copy_text))
    return {
        "model": model,
        "query": query,
        "type": kind,
        "copy_test": copy_text,
        "distance": dist,
        "time": time.time() - start,
    }


def queries_CatSta():
    """Write per-copy distance statistics to CSV for every configured model.

    For each copy text (key of ``queries_Categoricos``) three kinds of rows
    are produced, all measuring the cosine distance to that copy text:

    * ``insider``     - the queries listed under the copy text itself;
    * ``outsider_n1`` - every other copy text;
    * ``outsider_n2`` - the queries listed under every other copy text.

    The rows of one copy text are written to
    ``./<path_model>/Sta/NewData<first 50 chars of the copy text>.csv``.

    Side effect: rebinds the module-level ``emb`` used by ``CustomEmbedding``.

    Returns:
        The rows of the last copy text processed (an empty list when there is
        nothing to process).
    """
    global emb
    Sal = []
    for model in models:
        # Load each model once; it does not depend on the copy being compared.
        emb = loadmodelEmb(model_name=model + "/model")
        emb2 = CustomEmbedding()
        # Warm-up call so the first timed query does not pay start-up costs.
        emb2.embed_query("test 123321")
        for copy_text in copies_text:
            # Rows are collected per copy text; each copy gets its own CSV.
            Sal = []
            for query in queries_Categoricos[copy_text]:
                Sal.append(_cat_sta_record(emb2, model, query, "insider", copy_text))
            # Iterate in dict order so the CSV row order is reproducible.
            for other in queries_Categoricos:
                if other == copy_text:
                    continue
                Sal.append(_cat_sta_record(emb2, model, other, "outsider_n1", copy_text))
                for query2 in queries_Categoricos[other]:
                    Sal.append(_cat_sta_record(emb2, model, query2, "outsider_n2", copy_text))
            df = pd.DataFrame(Sal)
            df.to_csv("./%s/Sta/NewData%s.csv" % (path_model, copy_text[0:50]))
    return Sal
def queries_CatSta_in(queries_Categoricos,model="embeddings/all-mpnet-base-v2"):
    """Measure the distance of every query to the objective it is listed under.

    Side effect: rebinds the module-level ``emb`` used by ``CustomEmbedding``.

    Args:
        queries_Categoricos: Mapping from objective text to its queries.
        model: Model identifier or path passed to ``loadmodelEmb``.

    Returns:
        A list of dicts with the keys ``model``, ``query``, ``type`` (always
        ``"insider"``), ``objetive``, ``distance`` and ``time``.
    """
    global emb
    emb = loadmodelEmb(model_name=model)
    embedder = CustomEmbedding()
    # Warm-up call before any timed measurement.
    embedder.embed_query("test 123321")

    rows = []
    for objetive, own_queries in queries_Categoricos.items():
        for query in own_queries:
            started = time.time()
            dist = distance.cosine(embedder.embed_query(query), embedder.embed_query(objetive))
            rows.append({
                "model": model,
                "query": query,
                "type": "insider",
                "objetive": objetive,
                "distance": dist,
                "time": time.time() - started,
            })
    return rows
def queries_CatSta_out(queries_Categoricos,model="embeddings/all-mpnet-base-v2"):
    """Measure the distance of every query to the objectives it does NOT belong to.

    For each objective, the queries listed under all other objectives are
    compared against it. Objectives are visited in mapping order, so the row
    order is the same on every run.

    Side effect: rebinds the module-level ``emb`` used by ``CustomEmbedding``.

    Args:
        queries_Categoricos: Mapping from objective text to its queries.
        model: Model identifier or path passed to ``loadmodelEmb``.

    Returns:
        A list of dicts with the keys ``model``, ``query``, ``type`` (always
        ``"outsider"``), ``objetive``, ``distance`` and ``time``.
    """
    global emb
    emb = loadmodelEmb(model_name=model)
    emb2 = CustomEmbedding()
    # Warm-up call before any timed measurement.
    emb2.embed_query("test 123321")

    Sal = []
    for objetive in queries_Categoricos:
        for other, other_queries in queries_Categoricos.items():
            if other == objetive:
                continue
            for query in other_queries:
                t = time.time()
                dist = distance.cosine(emb2.embed_query(query), emb2.embed_query(objetive))
                Sal.append({
                    "model": model,
                    "query": query,
                    "type": "outsider",
                    "objetive": objetive,
                    "distance": dist,
                    "time": time.time() - t,
                })
    return Sal
# Run the per-copy distance statistics at module level; the return value is discarded.
# NOTE(review): this executes before nltk.download('stopwords') near the end of
# the file, while CustomEmbedding filters Spanish stop words on every call -
# confirm the corpus is already installed when this line runs.
queries_CatSta()
def evalDb(text,dbs):
    """Search *dbs* for *text* and return the ranked distances and ids.

    The raw ``FinderDbs`` result is printed to stdout before it is unpacked.

    Args:
        text: Query text.
        dbs: Stores to search, forwarded to ``FinderDbs``.

    Returns:
        ``(dis, ids)``: two parallel lists ordered from best to worst match.
        ``dis`` holds the scores converted to ``str``; ``ids`` holds the
        matching document ids. Both are empty when nothing was found.
    """
    # NOTE(review): ``filtred`` is used as a boolean flag by FinderDbs, so 5
    # only switches filtering on (threshold stays at its default) - confirm
    # that this is the intent.
    result = FinderDbs(text, dbs, filtred=5)
    print(result)

    ranked = result[0]
    dis = [str(entry['d']) for entry in ranked.values()]
    ids = list(ranked)
    return dis, ids
# Evaluate retrieval against the classification validation set and write one
# CSV row per query to ./<path_model>/Sta/EvalClass.csv.
#   dbs: stores forwarded unchanged to evalDb for every query.
# NOTE(review): the indentation of this file was lost, so the nesting of the
# second try block and of "Distancia=dis[0]" cannot be read from the text;
# the possible readings give different values in the "dist" column - confirm
# against the original file before relying on it.
def EvalClass(dbs):
# The validation file path comes from the "valid_dataset_Class" entry of the
# default model entry used by extractConfig.
valid_path = Path(extractConfig(dataOut="valid_dataset_Class"))
with open(valid_path, 'r', encoding='utf-8') as file:
queries_Categoricos = json.load(file)
Sal = []
# Keys are parsed with int() below and compared with the ids returned by
# evalDb, so they are presumably the expected record ids - verify against the data.
for i in queries_Categoricos.keys():
for j in queries_Categoricos[i]:
# Normalize key and query: transliterate, trim, lower-case.
i=unidecode(i).strip().lower()
j=unidecode(j).strip().lower()
# score is assigned but never read.
score = 1.0
dis,id=evalDb(j,dbs)
# This try/except contains only "pass" (the prints are commented out), so it has no effect.
try:
pass
#print(j,i,id, dis[0])
except:
pass
#print(j,i,id)
# Defaults for a miss; 99 is the placeholder distance.
Top8=0
Top1=0
Distancia=99
# Top8: the expected id appears anywhere in the returned ids.
if int(i) in id:
Top8=1
# Top1: the expected id is the best-ranked result. Any exception raised in
# this try block is silently ignored.
try:
if int(i)==id[0]:
Top1=1
# dis entries are strings (evalDb converts them), unlike the integer default 99.
Distancia=dis[0]
except:
pass
Sal.append([j,i,Top8,Top1,Distancia])
df=pd.DataFrame(Sal,columns=['query', 'IdDb', 'Top8',"Top1","dist"])
df.to_csv("./%s/Sta/EvalClass.csv"%(path_model))
#queries_CatPlot(copies_text)
# Fetch the NLTK stop-word corpus that remove_unwanted reads when stopOK is set.
nltk.download('stopwords')
#llm,emb=loadModels()
# Evaluate the first (and, as configured above, only) model folder.
model=models[0]
#print(model)
# Two document collections built from the same rows: one keyed on the copy
# message, the other on the intentionality text.
documents,documents2=loadCopysAndData()
# Bind the module-level encoder that CustomEmbedding delegates to.
emb=loadmodelEmb(model_name = model+"/model")
emb2=CustomEmbedding()
# Build and save one FAISS index per collection (folders "Copies3" and
# "Intentionality3"); existing folders with those names are removed first.
db=makeFaissdb(documents,"Copies3",emb2)
db2=makeFaissdb(documents2,"Intentionality3",emb2)
# Run the classification evaluation over both indexes.
EvalClass([db,db2])