from models import dbvotes, dbcopies
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from typing import List
from langchain.pydantic_v1 import BaseModel
from langchain.schema.embeddings import Embeddings
from unidecode import unidecode
from nltk.corpus import stopwords
import re
from pathlib import Path
import json
import matplotlib.pyplot as plt
import seaborn as sns


def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",
                  relPath="./conf/experiment_config.json",
                  dataOut="train_dataset_pos"):
    """Read one entry for `nameModel` from the experiment config JSON.

    `dataOut` is either a key, or a two-element list for a nested lookup.
    """
    configPath = Path(relPath)
    with open(configPath, 'r', encoding='utf-8') as file:
        config = json.load(file)[nameModel]
    if isinstance(dataOut, list) and len(dataOut) == 2:
        Output = config[dataOut[0]][dataOut[1]]
    else:
        Output = config[dataOut]
    return Output
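
# Illustrative only: based on the lookups used in this script, the config
# file is assumed to look roughly like this (values are hypothetical):
# {
#     "Modelo_embedding_Mexico_Puebla": {
#         "base_model": "all-MiniLM-L6-v2",
#         "valid_dataset": "data/valid_dataset.json",
#         "train_dataset_pos": "data/train_dataset_pos.json"
#     }
# }
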
def remove_emoji(string):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"  # enclosed characters
        "]+",
        flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)
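
# Illustrative: remove_emoji("hola 😀") -> "hola  " (the emoji becomes a space)
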
def remove_unwanted(document, stopOK=False, punctuationOK=False,
                    xtrasOK=False, emojiOk=False, unidecodeOK=False):
    """Normalize a document; each flag enables one cleaning step."""
    if punctuationOK:
        # remove punctuation
        for sig in [".", ",", "!", "¿", "?", "=", "(", ")"]:
            document = document.replace(sig, " ")

    if xtrasOK:
        # remove user mentions
        document = re.sub("@[A-Za-z0-9_]+", " ", document)
        # remove URLs
        document = re.sub(r'http\S+', ' ', document)
        # remove hashtags
        document = re.sub("#[A-Za-z0-9_]+", "", document)

    if emojiOk:
        # remove emojis
        document = remove_emoji(document)

    if unidecodeOK:
        # strip accents / transliterate to ASCII
        document = unidecode(document)

    if stopOK:
        # remove Spanish stopwords; compare case-insensitively since
        # lowercasing only happens at the very end
        words = document.split(" ")
        stop_words = set(stopwords.words('spanish'))
        words = [w for w in words if w.lower() not in stop_words]
        document = " ".join(words)

    # collapse runs of whitespace into single spaces
    document = re.sub(r'\s+', ' ', document)
    return document.strip().lower()
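
# Illustrative: remove_unwanted("La casa de México!", stopOK=True,
#                               punctuationOK=True) -> "casa méxico"
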
# Build the dataset: keep every message with a positive vote (vote == 1),
# labelled with the copy it was voted for.
output = []
for row in dbvotes(dbvotes.votes.id).select():
    if int(row.vote) == 1:
        Sal = {}
        query = (dbvotes.messages.id == row.message)
        messagequery = dbvotes(query).select(dbvotes.messages.ALL)
        Sal["texto"] = messagequery[0].message
        Sal["etiqueta"] = row.copy_id
        query = (dbcopies.copies.id == row.copy_id)
        copiesquery = dbcopies(query).select(dbcopies.copies.ALL)
        Sal["intentionality"] = copiesquery[0].intentionality
        output.append(Sal)

df = pd.DataFrame(output)

# 80/20 train/test split with a fixed seed for reproducibility
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)


def loadmodelEmb(model_name="embeddings/all-MiniLM-L6-v2",
                 model_kwargs={'device': 'cpu'}):
    """Load a SentenceTransformer; model_kwargs supplies the device
    (the original left this parameter unused)."""
    st = SentenceTransformer(model_name, device=model_kwargs.get('device', 'cpu'))
    return st


class CustomEmbedding(Embeddings, BaseModel):
    """Embedding model with preprocessing.

    Relies on the module-level SentenceTransformer instance `emb`.
    """

    def _get_embedding(self, text) -> List[float]:
        text = remove_unwanted(text, punctuationOK=True, stopOK=True)
        Sal = emb.encode(text)
        return Sal

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        Sal = []
        for text in texts:
            Sal.append(self._get_embedding(text))
        return Sal

    def embed_query(self, text: str) -> List[float]:
        return self._get_embedding(text)
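
# Usage sketch (illustrative; assumes `emb` has already been loaded below
# and the query text is hypothetical):
#   embedder = CustomEmbedding()
#   vector = embedder.embed_query("¿Dónde pago el predial?")
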
nameModel = "Modelo_embedding_Mexico_Puebla"
valid_path = extractConfig(dataOut="valid_dataset")
baseModel = extractConfig(dataOut="base_model")
with open(valid_path, 'r', encoding='utf-8') as file:
    queries_Categoricos = json.load(file)
model = "./%s/%s/model" % (nameModel, baseModel)

# Load the fine-tuned embedding model and embed both splits
emb = loadmodelEmb(model_name=model)
emb2 = CustomEmbedding()
train_embeddings = pd.DataFrame(emb2.embed_documents(train_data['texto'].tolist()))
test_embeddings = pd.DataFrame(emb2.embed_documents(test_data['texto'].tolist()))
print(test_embeddings)
# Train a random forest on the embedding features
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_embeddings, train_data['etiqueta'])

# Make predictions on the test set
predictions = rf_model.predict(test_embeddings)

# Compute accuracy
accuracy = accuracy_score(test_data['etiqueta'], predictions)
print(f'Model accuracy: {accuracy:.2f}')

# Inspect which embedding dimensions the forest relied on most
feature_importances_df = pd.DataFrame(
    {"feature": list(test_embeddings.columns), "importance": rf_model.feature_importances_}
).sort_values("importance", ascending=False)

# Display
print(feature_importances_df)
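
# matplotlib and seaborn are imported above but never used; a minimal sketch
# of how the top feature importances could be plotted (illustrative only):
top = feature_importances_df.head(20)
sns.barplot(data=top, x="importance", y="feature", color="steelblue")
plt.title("Top 20 embedding dimensions by importance")
plt.tight_layout()
plt.show()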