# LLm2Node/general.py

import json
import re
import sqlite3
import time
from pathlib import Path
from typing import List, Optional

import pandas as pd
from fastapi import FastAPI
from langchain.document_loaders import DataFrameLoader
from langchain.schema.embeddings import Embeddings
from langchain.vectorstores import FAISS
from nltk.corpus import stopwords
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
from unidecode import unidecode

def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla", relPath="./conf/experiment_config.json", dataOut="train_dataset_pos"):
    """Return the dataOut entry from the named model's section of the experiment config."""
    configPath = Path(relPath)
    with open(configPath, "r", encoding="utf-8") as file:
        config = json.load(file)[nameModel]
    return config[dataOut]

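# Illustration only (the example values are assumptions, not from the repo):
# extractConfig expects the JSON file to nest per-model sections, e.g.
#   {"Modelo_embedding_Mexico_Puebla": {"train_dataset_pos": "data/train_pos.csv"}}
# so a call with the defaults above would return "data/train_pos.csv".
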
def remove_emoji(string):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r" ", string)

def remove_unwanted(document, stopOK=False, punctuationOK=False, xtrasOK=False, emojiOk=False, unidecodeOK=False):
    if punctuationOK:
        # remove punctuation
        for sig in [".", ",", "!", "¿", "?", "=", "(", ")"]:
            document = document.replace(sig, " ")
    if xtrasOK:
        # remove user mentions
        document = re.sub("@[A-Za-z0-9_]+", " ", document)
        # remove URLs
        document = re.sub(r"http\S+", " ", document)
        # remove hashtags
        document = re.sub("#[A-Za-z0-9_]+", "", document)
    if emojiOk:
        # remove emojis
        document = remove_emoji(document)
    if unidecodeOK:
        # strip accents/diacritics
        document = unidecode(document)
    if stopOK:
        # remove Spanish stopwords
        words = document.split(" ")
        stop_words = set(stopwords.words("spanish"))
        words = [w for w in words if w not in stop_words]
        document = " ".join(words)
    # collapse runs of whitespace left behind by the removals above
    document = re.sub(r"\s+", " ", document)
    return document.strip().lower()

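# Usage sketch (illustrative input, not from the repo):
#   remove_unwanted("Hola @usuario, mira el #promo 🙂",
#                   stopOK=True, punctuationOK=True, xtrasOK=True,
#                   emojiOk=True, unidecodeOK=True)
# -> "hola mira"  (mention, hashtag, emoji, punctuation, and the
#    stopword "el" are stripped, then whitespace is collapsed)
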
def loadmodelEmb(model_name="embeddings/all-MiniLM-L6-v2", model_kwargs={"device": "cpu"}):
    # The device comes from model_kwargs instead of being hard-coded.
    return SentenceTransformer(model_name, device=model_kwargs.get("device", "cpu"))

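# FAISS.from_documents below needs a LangChain Embeddings object, while
# loadmodelEmb returns a raw SentenceTransformer. A minimal adapter sketch
# (the class name is hypothetical; it assumes the Embeddings interface
# imported above):
class STEmbeddings(Embeddings):
    def __init__(self, st_model):
        self.st_model = st_model

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        # encode returns a numpy array; FAISS expects plain lists of floats
        return self.st_model.encode(texts).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.st_model.encode(text).tolist()
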
def loadCopysAndData(pathsqlite):
    con = sqlite3.connect(pathsqlite)
    copies_df = pd.read_sql_query(
        "SELECT * FROM copies WHERE intentionality IS NOT NULL", con
    )
    con.close()
    data = copies_df[["copy_message", "id", "name", "intentionality"]]
    # Two document sets over the same rows: one keyed on the copy text,
    # one on its intentionality label.
    documents = DataFrameLoader(data, page_content_column="copy_message").load()
    documents2 = DataFrameLoader(data, page_content_column="intentionality").load()
    return documents, documents2

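# The query above implies a `copies` table with at least the columns
# copy_message, id, name, and intentionality; rows with a NULL
# intentionality are skipped.
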
def makeFaissdb(documents, folder_path, embedding):
    # Reuse a persisted index when one exists; otherwise build it from the
    # documents and save it for the next run.
    try:
        db = FAISS.load_local(folder_path=folder_path, embeddings=embedding)
    except Exception:
        db = FAISS.from_documents(documents, embedding)
        db.save_local(folder_path=folder_path)
    return db

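# Usage sketch ("Copies"/"Intentionality" are assumed folder names, not from
# the repo; on newer langchain versions FAISS.load_local may also require
# allow_dangerous_deserialization=True):
#   emb = STEmbeddings(loadmodelEmb())
#   db1 = makeFaissdb(documents, "Copies", emb)
#   db2 = makeFaissdb(documents2, "Intentionality", emb)
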
class Response(BaseModel):
    query: str
    # -9.0 acts as a "no threshold" sentinel: FinderDbs only filters
    # results when filtred > 0.
    filtred: Optional[float] = -9.0

def FinderDbs(query, dbs, filtred=0.4):
    # Merge the nearest-neighbour hits from every index, keeping the best
    # (lowest) distance per document id.
    AllData = {}
    for dbt in dbs:
        for doc, score in dbt.similarity_search_with_score(query, k=4):
            doc_id = doc.metadata["id"]
            if doc_id in AllData:
                AllData[doc_id]["d"] = min(AllData[doc_id]["d"], score)
            else:
                AllData[doc_id] = {"d": score, "page_content": doc.page_content}
    if filtred > 0:
        # Keep only hits below the distance threshold.
        AllData = {k: v for k, v in AllData.items() if v["d"] < filtred}
    AllData = dict(sorted(AllData.items(), key=lambda item: item[1]["d"]))
    return AllData, AllData.keys()
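
# Minimal wiring sketch. The route name, index folder names, and sqlite path
# are assumptions (not from the repo); STEmbeddings is the adapter sketched
# above.
app = FastAPI()
emb = STEmbeddings(loadmodelEmb())
documents, documents2 = loadCopysAndData("copies.db")
db1 = makeFaissdb(documents, "Copies", emb)
db2 = makeFaissdb(documents2, "Intentionality", emb)


@app.post("/find")
def find(r: Response):
    results, ids = FinderDbs(r.query, (db1, db2), filtred=r.filtred)
    # FAISS scores and pandas ids are numpy types; cast them so the JSON
    # response serializes cleanly.
    payload = {
        str(k): {"d": float(v["d"]), "page_content": v["page_content"]}
        for k, v in results.items()
    }
    return {"ids": [str(i) for i in ids], "results": payload}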