# LLm2Node/dbExtractTrainModelRamdonFo...

from pathlib import Path
from typing import List
import json
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from langchain.pydantic_v1 import BaseModel
from langchain.schema.embeddings import Embeddings
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from unidecode import unidecode

from models import dbvotes, dbcopies

def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",
                  relPath="./conf/experiment_config.json",
                  dataOut="train_dataset_pos"):
    """Read one value from the experiment config for the given model name."""
    configPath = Path(relPath)
    with open(configPath, 'r', encoding='utf-8') as file:
        config = json.load(file)[nameModel]
    if isinstance(dataOut, list) and len(dataOut) == 2:
        # a two-element list addresses a nested key
        Output = config[dataOut[0]][dataOut[1]]
    else:
        Output = config[dataOut]
    return Output
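
# A minimal sketch of the expected shape of ./conf/experiment_config.json,
# covering only the keys this script reads (the example values are assumptions):
#
#   {
#       "Modelo_embedding_Mexico_Puebla": {
#           "base_model": "all-MiniLM-L6-v2",
#           "valid_dataset": "data/valid_queries.json",
#           "train_dataset_pos": "data/train_pos.json"
#       }
#   }
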
def remove_emoji(string):
    """Replace any run of emoji characters with a single space."""
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"  # dingbats
                               u"\U000024C2-\U0001F251"  # enclosed characters
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)
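
# Illustrative example (a run of emoji collapses to one space):
#   remove_emoji("buen día 😀🚀")  ->  "buen día  "
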
def remove_unwanted(document, stopOK=False, punctuationOK=False,
                    xtrasOK=False, emojiOk=False, unidecodeOK=False):
    """Optionally strip punctuation, mentions/URLs/hashtags, emoji, accents,
    and Spanish stopwords, then lowercase and trim the text."""
    if punctuationOK:
        # remove punctuation
        for sig in [".", ",", "!", "¿", "?", "=", "(", ")"]:
            document = document.replace(sig, " ")
    if xtrasOK:
        # remove user mentions
        document = re.sub("@[A-Za-z0-9_]+", " ", document)
        # remove URLs
        document = re.sub(r'http\S+', ' ', document)
        # remove hashtags
        document = re.sub("#[A-Za-z0-9_]+", "", document)
    if emojiOk:
        # remove emojis
        document = remove_emoji(document)
    if unidecodeOK:
        # strip accents and other diacritics
        document = unidecode(document)
    if stopOK:
        # drop Spanish stopwords
        words = document.split(" ")
        stop_words = set(stopwords.words('spanish'))
        words = [w for w in words if w not in stop_words]
        document = " ".join(words)
    # collapse runs of spaces left by the removals above
    document = re.sub(r' +', ' ', document)
    return document.strip().lower()
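
# Illustrative call (requires nltk.download('stopwords') once; the exact
# output depends on NLTK's Spanish stopword list):
#   remove_unwanted("Hola!! el mensaje de @user http://x.co",
#                   stopOK=True, punctuationOK=True, xtrasOK=True)
#   ->  roughly "hola mensaje"  ("el"/"de" dropped, mention and URL removed)
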
# Build the dataset: every positively voted message becomes one training
# example, labelled with the id of the copy it was voted for.
output = []
for row in dbvotes(dbvotes.votes.id).select():
    if int(row.vote) == 1:
        Sal = {}
        # fetch the text of the voted message
        query = (dbvotes.messages.id == row.message)
        messagequery = dbvotes(query).select(dbvotes.messages.ALL)
        Sal["texto"] = messagequery[0].message
        Sal["etiqueta"] = row.copy_id
        # fetch the matching copy to keep its intentionality
        query = (dbcopies.copies.id == row.copy_id)
        copiesquery = dbcopies(query).select(dbcopies.copies.ALL)
        Sal["intentionality"] = copiesquery[0].intentionality
        output.append(Sal)

df = pd.DataFrame(output)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
def loadmodelEmb(model_name="embeddings/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'}):
    """Load a SentenceTransformer model on the requested device."""
    st = SentenceTransformer(model_name, device=model_kwargs.get('device', 'cpu'))
    return st
class CustomEmbedding(Embeddings, BaseModel):
    """Embedding model that preprocesses text before encoding.

    Uses the module-level SentenceTransformer instance ``emb`` defined below.
    """

    def _get_embedding(self, text) -> List[float]:
        text = remove_unwanted(text, punctuationOK=True, stopOK=True)
        Sal = emb.encode(text)
        return Sal

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        Sal = []
        for text in texts:
            Sal.append(self._get_embedding(text))
        return Sal

    def embed_query(self, text: str) -> List[float]:
        return self._get_embedding(text)
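
# Usage sketch (hypothetical): since CustomEmbedding implements LangChain's
# Embeddings interface, it could also back a vector store once `emb` is
# loaded below, e.g.:
#   from langchain.vectorstores import FAISS
#   vectordb = FAISS.from_texts(["algún texto"], CustomEmbedding())
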
nameModel = "Modelo_embedding_Mexico_Puebla"
valid_path = extractConfig(dataOut="valid_dataset")
baseModel = extractConfig(dataOut="base_model")
# load the validation queries
with open(valid_path, 'r', encoding='utf-8') as file:
    queries_Categoricos = json.load(file)

# load the fine-tuned embedding model and wrap it with preprocessing
model = "./%s/%s/model" % (nameModel, baseModel)
emb = loadmodelEmb(model_name=model)
emb2 = CustomEmbedding()

# one embedding vector per message, as rows of a DataFrame
train_embeddings = pd.DataFrame(emb2.embed_documents(train_data['texto'].tolist()))
test_embeddings = pd.DataFrame(emb2.embed_documents(test_data['texto'].tolist()))
print(test_embeddings)
# Train a random forest on the embedding vectors.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_embeddings, train_data['etiqueta'])

# Make predictions on the test set
predictions = rf_model.predict(test_embeddings)

# Compute the accuracy
accuracy = accuracy_score(test_data['etiqueta'], predictions)
print(f'Model accuracy: {accuracy:.2f}')

# Inspect which embedding dimensions matter most to the classifier
feature_importances_df = pd.DataFrame(
    {"feature": list(test_embeddings.columns), "importance": rf_model.feature_importances_}
).sort_values("importance", ascending=False)
print(feature_importances_df)
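
# matplotlib/seaborn are imported above but unused in the visible code; a
# minimal sketch of plotting the top importances (the top-20 cutoff and the
# styling are assumptions):
top_features = feature_importances_df.head(20).astype({"feature": str})
plt.figure(figsize=(10, 6))
sns.barplot(data=top_features, x="importance", y="feature", orient="h")
plt.title("Top 20 embedding dimensions by importance")
plt.tight_layout()
plt.show()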