"""Train a random-forest classifier on sentence embeddings of voted messages.

When executed (or imported) this module:

1. reads every vote from ``dbvotes`` and keeps the ones whose ``vote`` is 1,
   pairing the voted message text with the id of the copy it was voted for;
2. splits those examples 80/20 into train and test sets;
3. embeds the texts with a SentenceTransformer model whose location is taken
   from ``./conf/experiment_config.json``;
4. fits a ``RandomForestClassifier`` on the embeddings and prints the test
   accuracy and the per-dimension feature importances.
"""
import json
import re
from functools import lru_cache
from pathlib import Path
from typing import List

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from langchain.pydantic_v1 import BaseModel
from langchain.schema.embeddings import Embeddings
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from unidecode import unidecode

from models import dbvotes, dbcopies

# Compiled once at import time instead of on every remove_emoji() call.
# NOTE(review): the last range (U+24C2..U+1F251) is very wide and also covers
# non-emoji scripts such as CJK; kept unchanged to preserve behaviour.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)

# Each of these punctuation characters is replaced by a single space.
_PUNCTUATION_TABLE = str.maketrans({char: " " for char in ".,!¿?=()"})


def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",
                  relPath="./conf/experiment_config.json",
                  dataOut="train_dataset_pos"):
    """Return one entry of a model's section in the experiment config file.

    Args:
        nameModel: Top-level key of the JSON file selecting the model section.
        relPath: Path to the JSON configuration file.
        dataOut: Either a single key of the model section, or a two-element
            list/tuple ``[outer, inner]`` addressing a nested value.

    Returns:
        The configuration value found under the requested key(s).

    Raises:
        KeyError: If ``nameModel`` or a requested key is missing.
        OSError: If the configuration file cannot be opened.
    """
    with open(Path(relPath), "r", encoding="utf-8") as file:
        config = json.load(file)[nameModel]
    if isinstance(dataOut, (list, tuple)) and len(dataOut) == 2:
        outer_key, inner_key = dataOut
        return config[outer_key][inner_key]
    return config[dataOut]


def remove_emoji(string):
    """Return ``string`` with every run of emoji-range characters replaced by a space."""
    return _EMOJI_PATTERN.sub(r" ", string)


@lru_cache(maxsize=None)
def _spanish_stopwords():
    """Load the NLTK Spanish stop-word list once and reuse it across calls."""
    return frozenset(stopwords.words("spanish"))


def remove_unwanted(document, stopOK=False, punctuationOK=False, xtrasOK=False,
                    emojiOk=False, unidecodeOK=False):
    """Clean a text before it is embedded.

    The enabled steps run in this order: punctuation, mentions/URLs/hashtags,
    emoji, transliteration to ASCII, stop-word removal. Whitespace is then
    collapsed and the result is stripped and lower-cased.

    Args:
        document: Text to clean.
        stopOK: Remove Spanish stop words.
        punctuationOK: Replace ``. , ! ¿ ? = ( )`` with spaces.
        xtrasOK: Remove ``@mentions``, URLs starting with ``http`` and
            ``#hashtags``.
        emojiOk: Remove emoji via :func:`remove_emoji`.
        unidecodeOK: Transliterate to ASCII with ``unidecode``.

    Returns:
        The cleaned, lower-cased text with single spaces between words.
    """
    if punctuationOK:
        document = document.translate(_PUNCTUATION_TABLE)
    if xtrasOK:
        # Mentions are removed before URLs, matching the original order.
        document = re.sub("@[A-Za-z0-9_]+", " ", document)
        document = re.sub(r"http\S+", " ", document)
        document = re.sub("#[A-Za-z0-9_]+", "", document)
    if emojiOk:
        document = remove_emoji(document)
    if unidecodeOK:
        document = unidecode(document)
    if stopOK:
        stop_words = _spanish_stopwords()
        # Compare case-insensitively: lower-casing only happens at the very
        # end, so capitalised stop words ("El", "La") used to slip through.
        document = " ".join(
            word for word in document.split(" ")
            if word.lower() not in stop_words
        )
    # Collapse whitespace runs to one space. The previous code deleted the
    # spaces outright, which glued neighbouring words into a single token.
    document = " ".join(document.split())
    return document.strip().lower()


def _load_positive_votes():
    """Build one training record per vote whose ``vote`` value is 1.

    Each record holds the voted message text (``texto``), the id of the copy
    that was voted for (``etiqueta``) and that copy's ``intentionality``.
    A vote that references a missing message or copy row fails on the ``[0]``
    lookup rather than being skipped.
    """
    records = []
    for row in dbvotes(dbvotes.votes.id).select():
        if int(row.vote) != 1:
            continue
        message_rows = dbvotes(dbvotes.messages.id == row.message).select(
            dbvotes.messages.ALL
        )
        copy_rows = dbcopies(dbcopies.copies.id == row.copy_id).select(
            dbcopies.copies.ALL
        )
        records.append({
            "texto": message_rows[0].message,
            "etiqueta": row.copy_id,
            "intentionality": copy_rows[0].intentionality,
        })
    return records


output = _load_positive_votes()
df = pd.DataFrame(output)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)


def loadmodelEmb(model_name="embeddings/all-MiniLM-L6-v2", model_kwargs=None):
    """Load a SentenceTransformer model.

    Args:
        model_name: Model name or local path handed to ``SentenceTransformer``.
        model_kwargs: Optional keyword arguments forwarded to the
            ``SentenceTransformer`` constructor (for example
            ``{"device": "cpu"}``). When omitted the library defaults apply,
            which is what happened before, because this argument used to be
            ignored.

    Returns:
        The loaded ``SentenceTransformer`` instance.
    """
    return SentenceTransformer(model_name, **(model_kwargs or {}))


class CustomEmbedding(Embeddings, BaseModel):
    """Embedding model with preprocessing.

    Texts are cleaned with :func:`remove_unwanted` (punctuation and stop-word
    removal enabled) and then encoded with the module-level ``emb`` model,
    which must be assigned before any method is called.
    """

    def _get_embedding(self, text) -> List[float]:
        """Clean ``text`` and return its embedding as a list of floats."""
        cleaned = remove_unwanted(text, punctuationOK=True, stopOK=True)
        # encode() returns a numpy array by default; convert it so the
        # declared List[float] contract holds.
        return emb.encode(cleaned).tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding per text, in the same order as ``texts``."""
        return [self._get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding of a single query text."""
        return self._get_embedding(text)


nameModel = "Modelo_embedding_Mexico_Puebla"
valid_path = extractConfig(nameModel=nameModel, dataOut="valid_dataset")
baseModel = extractConfig(nameModel=nameModel, dataOut="base_model")

with open(valid_path, "r", encoding="utf-8") as file:
    queries_Categoricos = json.load(file)

model = f"./{nameModel}/{baseModel}/model"
emb = loadmodelEmb(model_name=model)
emb2 = CustomEmbedding()

train_embeddings = pd.DataFrame(emb2.embed_documents(train_data["texto"].tolist()))
test_embeddings = pd.DataFrame(emb2.embed_documents(test_data["texto"].tolist()))
print(test_embeddings)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_embeddings, train_data["etiqueta"])

# Predict on the held-out test set.
predictions = rf_model.predict(test_embeddings)

# Compute the accuracy.
accuracy = accuracy_score(test_data["etiqueta"], predictions)
print(f'Precisión del modelo: {accuracy:.2f}')

# Rank the embedding dimensions by their importance to the forest.
feature_importances_df = pd.DataFrame(
    {"feature": list(test_embeddings.columns), "importance": rf_model.feature_importances_}
).sort_values("importance", ascending=False)

# Display the ranking.
print(feature_importances_df)