Refactoring of code in libs.

Mario Gil 2023-12-21 22:04:37 -05:00
parent ceaa20af06
commit 502cf96292
2 changed files with 232 additions and 150 deletions

general.py (new file, 121 additions)

@@ -0,0 +1,121 @@
from sentence_transformers import SentenceTransformer
from fastapi import FastAPI
from unidecode import unidecode
from nltk.corpus import stopwords
from langchain.schema.embeddings import Embeddings
from langchain.document_loaders import DataFrameLoader
import re
from pathlib import Path
from typing import List
import json
import time
from pydantic import BaseModel
from langchain.vectorstores import FAISS
from typing import Optional
import sqlite3
import pandas as pd


def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla", relPath="./conf/experiment_config.json", dataOut="train_dataset_pos"):
    """Read the entry `dataOut` for model `nameModel` from the experiment config JSON."""
    configPath = Path(relPath)
    with open(configPath, 'r', encoding='utf-8') as file:
        config = json.load(file)[nameModel]
    Output = config[dataOut]
    return Output


def remove_emoji(string):
    """Replace emoji characters with a space."""
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)


def remove_unwanted(document, stopOK=False, punctuationOK=False, xtrasOK=False, emojiOk=False, unidecodeOK=False):
    """Normalize a text: optionally drop punctuation, mentions/URLs/hashtags, emojis, accents and Spanish stopwords."""
    if punctuationOK:
        # remove punctuation
        for sig in [".", ",", "!", "¿", "?", "=", "(", ")"]:
            document = document.replace(sig, " ")
    if xtrasOK:
        # remove user mentions
        document = re.sub("@[A-Za-z0-9_]+", " ", document)
        # remove URLs
        document = re.sub(r'http\S+', ' ', document)
        # remove hashtags
        document = re.sub("#[A-Za-z0-9_]+", "", document)
    if emojiOk:
        # remove emojis
        document = remove_emoji(document)
        #document = re.sub("[^0-9A-Za-z ]", "" , document)
        #print(document)
    if unidecodeOK:
        # strip accents / transliterate to ASCII
        document = unidecode(document)
    if stopOK:
        # drop Spanish stopwords
        words = document.split(" ")
        stop_words = set(stopwords.words('spanish'))
        words = [w for w in words if not w in stop_words]
        document = " ".join(words)
    # remove double spaces
    document = document.replace('  ', "")
    #print(document)
    return document.strip().lower()


def loadmodelEmb(model_name="embeddings/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'}):
    """Load the SentenceTransformer embedding model on CPU."""
    st = SentenceTransformer(model_name, device='cpu')
    return st


def loadCopysAndData(pathsqlite):
    """Read the `copies` table and expose it as two document sets: one keyed on the copy text, one on its intentionality."""
    con = sqlite3.connect(pathsqlite)
    copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con)
    copiesT = copies_df
    copiesT = copiesT[["copy_message", "id", "name", "intentionality"]]
    #print(copiesT)
    data = copiesT
    #print(data)
    B = DataFrameLoader(data, page_content_column="copy_message")
    B2 = DataFrameLoader(data, page_content_column="intentionality")
    documents = B.load()
    documents2 = B2.load()
    return documents, documents2


def makeFaissdb(documents, folder_path, embedding):
    """Load a FAISS index from `folder_path`, or build it from `documents` and save it if loading fails."""
    try:
        db = FAISS.load_local(folder_path=folder_path, embeddings=embedding)
    except Exception:
        db = FAISS.from_documents(documents, embedding)
        FAISS.save_local(db, folder_path=folder_path)
    return db


class Response(BaseModel):
    query: str
    filtred: Optional[float] = -9.0


def FinderDbs(query, dbs, filtred=0.4):
    """Search every db for `query`, merge hits by document id keeping the best (lowest) distance, and optionally filter by threshold."""
    AllData = {}
    for dbt in dbs:
        Sal = dbt.similarity_search_with_score(query, 4)
        for output in Sal:
            if output[0].metadata["id"] in AllData.keys():
                AllData[output[0].metadata["id"]]["d"] = min([AllData[output[0].metadata["id"]]["d"], output[1]])
            else:
                AllData[output[0].metadata["id"]] = {"d": output[1], "page_content": output[0].page_content}
    if filtred > 0:
        # keep only hits below the distance threshold, sorted by distance
        filtredData = {}
        for row in AllData.keys():
            if AllData[row]["d"] < filtred:
                filtredData[row] = AllData[row]
        filtredData = dict(sorted(filtredData.items(), key=lambda item: item[1]["d"]))
        return filtredData, filtredData.keys()
    else:
        AllData = dict(sorted(AllData.items(), key=lambda item: item[1]["d"]))
        return AllData, AllData.keys()
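Taken together, the helpers above cover the whole retrieval path: config lookup, text normalization, embedding, FAISS index construction, and ranked search. The sketch below is one way the module could be exercised on its own; it is illustrative only, and the config entry name, the index folder names, and the small STEmbedding adapter are assumptions mirrored from the calling code rather than part of this commit.

# Illustrative usage of general.py; config entry, paths and folder names are assumptions.
from typing import List
from langchain.schema.embeddings import Embeddings
from general import (extractConfig, loadCopysAndData, loadmodelEmb,
                     makeFaissdb, FinderDbs, remove_unwanted)

class STEmbedding(Embeddings):
    # Thin adapter so the raw SentenceTransformer can be used where langchain
    # expects an Embeddings object (main.py uses its own CustomEmbedding for this).
    def __init__(self, st):
        self.st = st
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self.st.encode(remove_unwanted(t, punctuationOK=True, unidecodeOK=True)).tolist() for t in texts]
    def embed_query(self, text: str) -> List[float]:
        return self.st.encode(remove_unwanted(text, punctuationOK=True, unidecodeOK=True)).tolist()

nameModel = "Modelo_embedding_Mexico_Puebla"                    # config entry to read (assumed)
model_path = extractConfig(nameModel=nameModel, dataOut="path_model") + "/model"
db_path = extractConfig(nameModel=nameModel, dataOut="pathsqlite")

documents, documents2 = loadCopysAndData(db_path)               # copy-text vs. intentionality documents
emb = STEmbedding(loadmodelEmb(model_name=model_path))          # embedding with the same preprocessing

db = makeFaissdb(documents, "Copies3", emb)                     # build or load the two FAISS indexes
db2 = makeFaissdb(documents2, "Intentionality3", emb)

results, ids = FinderDbs("¿Dónde pago el predial?", [db2, db], filtred=0.4)
print(list(ids), results)

main.py below does essentially the same wiring, but through its CustomEmbedding class and behind the FastAPI endpoints.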

main.py (111 additions, 150 deletions)

@@ -1,49 +1,46 @@
 #import gradio as gr
 from faiss import write_index, read_index
-from typing import List
-#from langchain import PromptTemplate
-from langchain.document_loaders import TextLoader
-from langchain.text_splitter import CharacterTextSplitter
 from langchain.vectorstores import FAISS
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.document_loaders import UnstructuredFileLoader
-from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
-from langchain.document_loaders import UnstructuredURLLoader
-from langchain.document_loaders.csv_loader import CSVLoader
-#from langchain import LLMChain
+from typing import List
 from pydantic import BaseModel
-from langchain.schema.embeddings import Embeddings
-from langchain.document_loaders import DataFrameLoader
-from langchain.embeddings import HuggingFaceEmbeddings
+from typing import Optional
+import re
+from pathlib import Path
+import time
+from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
+import json
 import pandas as pd
 import sqlite3
 from sentence_transformers import SentenceTransformer
 from fastapi import FastAPI
 from unidecode import unidecode
 from nltk.corpus import stopwords
-from typing import Optional
+from langchain.schema.embeddings import Embeddings
+from langchain.document_loaders import DataFrameLoader
+#from langchain import PromptTemplate
+# from langchain.document_loaders import TextLoader
+# from langchain.text_splitter import CharacterTextSplitter
+# from langchain.text_splitter import RecursiveCharacterTextSplitter
+# from langchain.document_loaders import UnstructuredFileLoader
+# from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
+# from langchain.document_loaders import UnstructuredURLLoader
+# from langchain.document_loaders.csv_loader import CSVLoader
+# #from langchain import LLMChain
+#
+#
+# from langchain.embeddings import HuggingFaceEmbeddings
 #from cleantext import clean
-import re
-from pathlib import Path
-import json
-#from langid.langid import LanguageIdentifier
-#from langid.langid import model as modellangid
-import time
-model="Modelo_embedding_Mexico_Puebla/all-mpnet-base-v2/model"
-entrenamiento="V1.3"
-def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",relPath="./conf/experiment_config.json",dataOut="train_dataset_pos"):
-    configPath=Path(relPath)
-    with open(configPath, 'r', encoding='utf-8') as file:
-        config = json.load(file)[nameModel]
-    Output= Path(config[dataOut])
-    return Output
+from general import FinderDbs,loadCopysAndData,loadmodelEmb,makeFaissdb,extractConfig,Response,remove_unwanted
-class CustomEmbedding(Embeddings, BaseModel,):
+class CustomEmbedding(Embeddings, BaseModel):
     """embedding model with preprocessing"""
     def _get_embedding(self,text) -> List[float]:
         #print(text,"text")
@@ -54,135 +51,25 @@ class CustomEmbedding(Embeddings, BaseModel,):
         Sal=[]
         for text in texts:
             Sal.append(self._get_embedding(text))
         return Sal
     def embed_query(self, text: str) -> List[float]:
         return self._get_embedding(text)
-def remove_emoji(string):
-    emoji_pattern = re.compile("["
-                           u"\U0001F600-\U0001F64F"  # emoticons
-                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
-                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
-                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
-                           u"\U00002702-\U000027B0"
-                           u"\U000024C2-\U0001F251"
-                           "]+", flags=re.UNICODE)
-    return emoji_pattern.sub(r' ', string)
-def remove_unwanted(document,stopOK=False,punctuationOK=False,xtrasOK=False, emojiOk=False, unidecodeOK=False):
-    if punctuationOK:
-        # remove punctuation
-        for sig in [".",",","!","¿","?","=","(",")"]:
-            document=document.replace(sig," ")
-    if xtrasOK:
-        # remove user mentions
-        document = re.sub("@[A-Za-z0-9_]+"," ", document)
-        # remove URLS
-        document = re.sub(r'http\S+', ' ', document)
-        # remove hashtags
-        document = re.sub("#[A-Za-z0-9_]+","", document)
-    if emojiOk:
-        # remove emoji's
-        document = remove_emoji(document)
-        #document = re.sub("[^0-9A-Za-z ]", "" , document)
-        # remove double spaces
-        #print(document)
-    if unidecodeOK:
-        document=unidecode(document)
-    if stopOK:
-        words=document.split(" ")
-        stop_words = set(stopwords.words('spanish'))
-        words = [w for w in words if not w in stop_words]
-        document=" ".join(words)
-    document = document.replace('  ',"")
-    #print(document)
-    return document.strip().lower()
-def loadmodelEmb(model_name = "embeddings/all-MiniLM-L6-v2",model_kwargs = {'device': 'cpu'}):
-    st = SentenceTransformer(model_name,device='cpu')
-    return st
-def loadCopysAndData(pathsqlite=pathDb):
-    con = sqlite3.connect(pathsqlite)
-    copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con)
-    copiesT = copies_df
-    copiesT=copiesT[["copy_message","id","name","intentionality"]]
-    #print(copiesT)
-    data = copiesT
-    #print(data)
-    B=DataFrameLoader(data,page_content_column="copy_message")
-    B2=DataFrameLoader(data,page_content_column="intentionality")
-    documents=B.load()
-    documents2=B2.load()
-    return documents,documents2
-def makeFaissdb(documents,folder_path,embedding):
-    try:
-        db=FAISS.load_local(folder_path=folder_path,embeddings=embedding)
-    except:
-        db = FAISS.from_documents(documents, embedding)
-        FAISS.save_local(db,folder_path=folder_path)
-    return db
-#llm,emb=loadModels()
-documents,documents2=loadCopysAndData()
+nameModel="Modelo_embedding_Mexico_Puebla_hiiamasid"
+model=extractConfig(nameModel=nameModel,dataOut="path_model")+"/model"
+entrenamiento="V1.3"
+pathsqlite=extractConfig(nameModel=nameModel,dataOut="pathsqlite")
+keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics")
+documents,documents2=loadCopysAndData(pathsqlite)
 emb=loadmodelEmb(model_name = model)
 emb2=CustomEmbedding()
 db=makeFaissdb(documents,"Copies3",emb2)
 db2=makeFaissdb(documents2,"Intentionality3",emb2)
-#db3=makeFaissdb(documents2,"nameshf",hf)
-#identifier = LanguageIdentifier.from_modelstring(modellangid, norm_probs=True)
-def FinderDbs(query,dbs,filtred=0.4):
-    AllData={}
-    for dbt in dbs:
-        Sal = dbt.similarity_search_with_score(query,4)
-        for output in Sal:
-            if output[0].metadata["id"] in AllData.keys():
-                AllData[output[0].metadata["id"]]["d"]=min([AllData[output[0].metadata["id"]]["d"],output[1]])
-            else:
-                AllData[output[0].metadata["id"]]={"d":output[1],"page_content":output[0].page_content}
-    #for item in AllData.items():
-    #    print(item)
-    if filtred>0:
-        filtredData={}
-        for row in AllData.keys():
-            if AllData[row]["d"]<filtred:
-                filtredData[row]=AllData[row]
-        filtredData=dict(sorted(filtredData.items(), key=lambda item: item[1]["d"]))
-        return filtredData,filtredData.keys()
-    else:
-        AllData=dict(sorted(AllData.items(), key=lambda item: item[1]["d"]))
-        return AllData,AllData.keys()
 app = FastAPI()
 @app.get("/")
 def read_main():
     return {"message": "This is your main app"}
-class Response(BaseModel):
-    query: str
-    filtred : Optional[float] = -9.0
 @app.post("/angela-api/")
 def calculate_api(response: Response):
     query = response.query
@@ -190,9 +77,8 @@ def calculate_api(response: Response):
         filtred = response.filtred
     except:
         filtred = -9.0
     AllData=FinderDbs(query,[db2,db],filtred)
-    print(AllData)
+    #print(AllData)
     versionL="_".join([model,entrenamiento])
     #tt=time.time()
     #if identifier.classify(query)[1]< 0.3:
@@ -209,7 +95,82 @@ def calculate_api(response: Response):
             id.append(i[0])
     return {"ids": id,"DC":dis,"modelo":versionL}
+
+@app.post("/angela-api-claude/")
+def calculate_api_claude(response: Response):
+    anthropic = Anthropic(api_key=keyanthropic)
+    query = response.query
+    try:
+        filtred = response.filtred
+    except:
+        filtred = -9.0
+    AllData=FinderDbs(query,[db2,db],filtred)
+    versionL="_".join([model,entrenamiento])
+    if AllData:
+        AllData = list(AllData)
+        dis=[]
+        id=[]
+        for k,i in enumerate(AllData[0].items()):
+            dis.append(str(i[1]['d']))
+            id.append(i[0])
+    if len(id)<1:
+        return {"text": {"completion": "No tengo información sobre este tema",
+                         "model": "claude-2.1",
+                         "stop_reason": "stop_sequence",
+                         "type": "completion",
+                         "id": "1",
+                         "stop": "\n\nHuman:",
+                         "log_id": "1"
+                         },
+                "text2": {
+                         "completion": "No tengo información sobre este tema",
+                         "model": "claude-2.1",
+                         "stop_reason": "stop_sequence",
+                         "type": "completion",
+                         "id": "1",
+                         "stop": "\n\nHuman:",
+                         "log_id": "1"
+                         }
+                }
+    con = sqlite3.connect(pathsqlite)
+    copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con)
+    copie = copies_df[copies_df["id"]==id[0]]["copy_message"].values[0]
+    promptF=f"""{HUMAN_PROMPT} Tengo un contexto por favor generame un resumen, el resumen deben ser con lenguaje amable para un publico mexicano y como si fuera una conversacion con la persona.
+"""
+    promptF3=promptF+f"""
+<contexto>%s</contexto>
+{AI_PROMPT}<resumen>"""%(copie)
+    completion = anthropic.completions.create(
+        model="claude-2",
+        max_tokens_to_sample=600,
+        prompt=promptF3,
+    )
+    pregunta=query
+    promptFv2=f"""Tu eres un asistente de IA en chatbot llamado Angela, como asistente tu labor es ayudar a los usuarios de la pagina web de la alcaldia de puebla respondiendo sus preguntas.
+Aqui te dare las reglas que debes seguir durante la conversacion:
+<reglas>
+- Siempre te mantendras en el personaje Angela.
+- Si no estas seguro de la respuesta basada en el contexto responde el suigiente texto: "Lo siento, podrias formular la pregunta de nuevo es que no entendi tu pregunta por que soy un sistema que esta en mejora en este momento".
+- No menciones el contexto si la pregunta no puede ser contestada con el.
+- Siempres responderas de manera amable pero formal.
+</reglas>
+<contexto>
+%s
+</contexto>
+{HUMAN_PROMPT} Tengo la siguiente pregunta entre la etiqueta <pregunta></pregunta> y basandote en el contexto que esta en la etiqueta <contexto></contexto> responde la pregunta entre la etiqueta <respuesta></respuesta>:
+<pregunta>
+%s
+</pregunta>
+"""%(copie,pregunta)
+    promptF3v2=promptFv2+f"""
+{AI_PROMPT}<respuesta>"""
+    completionv2 = anthropic.completions.create(
+        model="claude-2.1",
+        max_tokens_to_sample=600,
+        prompt=promptF3v2,
+    )
+    return {"text":completion,"text2":completionv2}