LLm2Node/main.py

#import gradio as gr
from faiss import write_index, read_index
from typing import List
#from langchain import PromptTemplate
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain.document_loaders import UnstructuredURLLoader
from langchain.document_loaders.csv_loader import CSVLoader
#from langchain import  LLMChain
from pydantic import BaseModel
from langchain.schema.embeddings import Embeddings
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
import pandas as pd
import sqlite3
from sentence_transformers import SentenceTransformer
from fastapi import FastAPI
from unidecode import unidecode
from nltk.corpus import stopwords
from typing import Optional
#from cleantext import clean
import re
from langid.langid import LanguageIdentifier
from langid.langid import  model as modellangid
import time
model="Modelo_embedding_Mexico_Puebla/all-mpnet-base-v2/model"
entrenamiento="V1.3"


class CustomEmbedding(Embeddings, BaseModel,):
    """embedding model with preprocessing"""
    def _get_embedding(self,text) -> List[float]:
        #print(text,"text")
        text=remove_unwanted(text,punctuationOK=True,stopOK=True)
        Sal=emb.encode(text)
        return Sal
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        Sal=[]
        for text in texts:
            Sal.append(self._get_embedding(text))

        return Sal
    def embed_query(self, text: str) -> List[float]:
        return self._get_embedding(text)

def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)

def remove_unwanted(document,stopOK=False,punctuationOK=False,xtrasOK=False, emojiOk=False, unidecodeOK=False):
    if punctuationOK:
    # remove punctuation
        for sig in [".",",","!","¿","?","=","(",")"]:
            document=document.replace(sig," ")

    if xtrasOK:
        # remove user mentions
        document = re.sub("@[A-Za-z0-9_]+"," ", document)
        # remove URLS
        document = re.sub(r'http\S+', ' ', document)
        # remove hashtags
        document = re.sub("#[A-Za-z0-9_]+","", document)
    if emojiOk:
        # remove emoji's
        document = remove_emoji(document)

    #document = re.sub("[^0-9A-Za-z ]", "" , document)
    # remove double spaces
    #print(document)
    if unidecodeOK:
        document=unidecode(document)


    if stopOK:
        words=document.split(" ")
        stop_words = set(stopwords.words('spanish'))
        words = [w for w in words if not w in stop_words]
        document=" ".join(words)


    document = document.replace('  ',"")
    #print(document)
    return document.strip().lower()

def loadmodelEmb(model_name = "embeddings/all-MiniLM-L6-v2",model_kwargs = {'device': 'cpu'}):
    st = SentenceTransformer(model_name,device='cpu')
    return st


def loadCopysAndData(pathsqlite="/opt/web2py/applications/MotorAngela/databases/storage.sqlite"):
    con = sqlite3.connect(pathsqlite)
    copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con)
    copiesT = copies_df
    copiesT=copiesT[["copy_message","id","name","intentionality"]]
    print(copiesT)
    data = copiesT
    #print(data)
    B=DataFrameLoader(data,page_content_column="copy_message")
    B2=DataFrameLoader(data,page_content_column="intentionality")
    documents=B.load()
    documents2=B2.load()
    return documents,documents2

def makeFaissdb(documents,folder_path,embedding):
    try:
        db=FAISS.load_local(folder_path=folder_path,embeddings=embedding)

    except:
        db = FAISS.from_documents(documents, embedding)
        FAISS.save_local(db,folder_path=folder_path)
    return db

#llm,emb=loadModels()

documents,documents2=loadCopysAndData()
emb=loadmodelEmb(model_name = model)
emb2=CustomEmbedding()
db=makeFaissdb(documents,"Copies3",emb2)
db2=makeFaissdb(documents2,"Intentionality3",emb2)
#db3=makeFaissdb(documents2,"nameshf",hf)
identifier = LanguageIdentifier.from_modelstring(modellangid, norm_probs=True)


def FinderDbs(query,dbs,filtred=0.4):
    AllData={}
    for dbt in dbs:
        Sal = dbt.similarity_search_with_score(query,4)
        for output in Sal:
            if output[0].metadata["id"] in AllData.keys():
                AllData[output[0].metadata["id"]]["d"]=min([AllData[output[0].metadata["id"]]["d"],output[1]])
            else:
                AllData[output[0].metadata["id"]]={"d":output[1],"page_content":output[0].page_content}
    #for item in AllData.items():
    #    print(item)

    if filtred>0:
        filtredData={}
        for row in AllData.keys():
            if AllData[row]["d"]<filtred:
                filtredData[row]=AllData[row]
        filtredData=dict(sorted(filtredData.items(), key=lambda item: item[1]["d"]))
        return  filtredData,filtredData.keys()


    else:
        AllData=dict(sorted(AllData.items(), key=lambda item: item[1]["d"]))
        return  AllData,AllData.keys()

app = FastAPI()

@app.get("/")
def read_main():
    return {"message": "This is your main app"}

class Response(BaseModel):
    query: str
    filtred : Optional[float] = -9.0


@app.post("/angela-api/")
def calculate_api(response: Response):
    query = response.query
    try:
        filtred = response.filtred
    except:
        filtred = -9.0

    AllData=FinderDbs(query,[db2,db],filtred)
    print(AllData)
    versionL="_".join([model,entrenamiento])
    #tt=time.time()
    #if identifier.classify(query)[1]< 0.3:
        #print(identifier.classify(query))
        #print(time.time()-tt)
        #return {"ids": [],"DC":[],"modelo":versionL}
    #print(time.time()-tt)
    if AllData:
        AllData = list(AllData)
        dis=[]
        id=[]
        for k,i in enumerate(AllData[0].items()):
            dis.append(str(i[1]['d']))
            id.append(i[0])
    return {"ids": id,"DC":dis,"modelo":versionL}