from models import dbvotes, dbcopies
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from typing import List
from langchain.pydantic_v1 import BaseModel
from langchain.schema.embeddings import Embeddings
from unidecode import unidecode
from nltk.corpus import stopwords
import re
from pathlib import Path
import json
import matplotlib.pyplot as plt
import seaborn as sns


def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",
                  relPath="./conf/experiment_config.json",
                  dataOut="train_dataset_pos"):
    """Read one entry for `nameModel` from the experiment config JSON.

    `dataOut` is either a key, or a two-element list for a nested lookup.
    """
    configPath = Path(relPath)
    with open(configPath, 'r', encoding='utf-8') as file:
        config = json.load(file)[nameModel]
    if isinstance(dataOut, list) and len(dataOut) == 2:
        Output = config[dataOut[0]][dataOut[1]]
    else:
        Output = config[dataOut]
    return Output
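
# Illustrative only: based on the lookups used in this script, the config
# file is assumed to look roughly like this (values are hypothetical):
# {
#     "Modelo_embedding_Mexico_Puebla": {
#         "base_model": "all-MiniLM-L6-v2",
#         "valid_dataset": "data/valid_dataset.json",
#         "train_dataset_pos": "data/train_dataset_pos.json"
#     }
# }
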
def remove_emoji(string):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"  # enclosed characters
        "]+",
        flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)
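
# Illustrative: remove_emoji("hola 😀") -> "hola  " (the emoji becomes a space)
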
def remove_unwanted(document, stopOK=False, punctuationOK=False,
                    xtrasOK=False, emojiOk=False, unidecodeOK=False):
    """Normalize a document; each flag enables one cleaning step."""
    if punctuationOK:
        # remove punctuation
        for sig in [".", ",", "!", "¿", "?", "=", "(", ")"]:
            document = document.replace(sig, " ")

    if xtrasOK:
        # remove user mentions
        document = re.sub("@[A-Za-z0-9_]+", " ", document)
        # remove URLs
        document = re.sub(r'http\S+', ' ', document)
        # remove hashtags
        document = re.sub("#[A-Za-z0-9_]+", "", document)

    if emojiOk:
        # remove emojis
        document = remove_emoji(document)

    if unidecodeOK:
        # strip accents / transliterate to ASCII
        document = unidecode(document)

    if stopOK:
        # remove Spanish stopwords; compare case-insensitively since
        # lowercasing only happens at the very end
        words = document.split(" ")
        stop_words = set(stopwords.words('spanish'))
        words = [w for w in words if w.lower() not in stop_words]
        document = " ".join(words)

    # collapse runs of whitespace into single spaces
    document = re.sub(r'\s+', ' ', document)
    return document.strip().lower()
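
# Illustrative: remove_unwanted("La casa de México!", stopOK=True,
#                               punctuationOK=True) -> "casa méxico"
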
# Build the dataset: keep every message with a positive vote (vote == 1),
# labelled with the copy it was voted for.
output = []
for row in dbvotes(dbvotes.votes.id).select():
    if int(row.vote) == 1:
        Sal = {}
        query = (dbvotes.messages.id == row.message)
        messagequery = dbvotes(query).select(dbvotes.messages.ALL)
        Sal["texto"] = messagequery[0].message
        Sal["etiqueta"] = row.copy_id
        query = (dbcopies.copies.id == row.copy_id)
        copiesquery = dbcopies(query).select(dbcopies.copies.ALL)
        Sal["intentionality"] = copiesquery[0].intentionality
        output.append(Sal)

df = pd.DataFrame(output)

# 80/20 train/test split with a fixed seed for reproducibility
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)


def loadmodelEmb(model_name="embeddings/all-MiniLM-L6-v2",
                 model_kwargs={'device': 'cpu'}):
    """Load a SentenceTransformer; model_kwargs supplies the device
    (the original left this parameter unused)."""
    st = SentenceTransformer(model_name, device=model_kwargs.get('device', 'cpu'))
    return st


class CustomEmbedding(Embeddings, BaseModel):
    """Embedding model with preprocessing.

    Relies on the module-level SentenceTransformer instance `emb`.
    """

    def _get_embedding(self, text) -> List[float]:
        text = remove_unwanted(text, punctuationOK=True, stopOK=True)
        Sal = emb.encode(text)
        return Sal

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        Sal = []
        for text in texts:
            Sal.append(self._get_embedding(text))
        return Sal

    def embed_query(self, text: str) -> List[float]:
        return self._get_embedding(text)
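
# Usage sketch (illustrative; assumes `emb` has already been loaded below
# and the query text is hypothetical):
#   embedder = CustomEmbedding()
#   vector = embedder.embed_query("¿Dónde pago el predial?")
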
nameModel = "Modelo_embedding_Mexico_Puebla"
valid_path = extractConfig(dataOut="valid_dataset")
baseModel = extractConfig(dataOut="base_model")
with open(valid_path, 'r', encoding='utf-8') as file:
    queries_Categoricos = json.load(file)
model = "./%s/%s/model" % (nameModel, baseModel)

# Load the fine-tuned embedding model and embed both splits
emb = loadmodelEmb(model_name=model)
emb2 = CustomEmbedding()
train_embeddings = pd.DataFrame(emb2.embed_documents(train_data['texto'].tolist()))
test_embeddings = pd.DataFrame(emb2.embed_documents(test_data['texto'].tolist()))
print(test_embeddings)
# Train a random forest on the embedding features
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_embeddings, train_data['etiqueta'])

# Make predictions on the test set
predictions = rf_model.predict(test_embeddings)

# Compute accuracy
accuracy = accuracy_score(test_data['etiqueta'], predictions)
print(f'Model accuracy: {accuracy:.2f}')

# Inspect which embedding dimensions the forest relied on most
feature_importances_df = pd.DataFrame(
    {"feature": list(test_embeddings.columns), "importance": rf_model.feature_importances_}
).sort_values("importance", ascending=False)

# Display
print(feature_importances_df)
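
# matplotlib and seaborn are imported above but never used; a minimal sketch
# of how the top feature importances could be plotted (illustrative only):
top = feature_importances_df.head(20)
sns.barplot(data=top, x="importance", y="feature", color="steelblue")
plt.title("Top 20 embedding dimensions by importance")
plt.tight_layout()
plt.show()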