From 502cf96292a2ef6317d35e1fd392aeeb370fc4cb Mon Sep 17 00:00:00 2001
From: marioggil
Date: Thu, 21 Dec 2023 22:04:37 -0500
Subject: [PATCH] Refactor shared helpers out of main.py into a general.py
 library module.

---
 general.py | 121 +++++++++++++++++++++++++
 main.py    | 261 +++++++++++++++++++++++------------------------
 2 files changed, 232 insertions(+), 150 deletions(-)
 create mode 100644 general.py

diff --git a/general.py b/general.py
new file mode 100644
index 0000000..a666d6f
--- /dev/null
+++ b/general.py
@@ -0,0 +1,121 @@
+import json
+import re
+import sqlite3
+from pathlib import Path
+from typing import Optional
+
+import pandas as pd
+from langchain.document_loaders import DataFrameLoader
+from langchain.vectorstores import FAISS
+from nltk.corpus import stopwords
+from pydantic import BaseModel
+from sentence_transformers import SentenceTransformer
+from unidecode import unidecode
+
+
+def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla", relPath="./conf/experiment_config.json", dataOut="train_dataset_pos"):
+    """Read one model section of the JSON experiment config and return the requested key."""
+    configPath = Path(relPath)
+    with open(configPath, "r", encoding="utf-8") as file:
+        config = json.load(file)[nameModel]
+    return config[dataOut]
+
+
+def remove_emoji(string):
+    """Replace every emoji in the string with a space."""
+    emoji_pattern = re.compile(
+        "["
+        u"\U0001F600-\U0001F64F"  # emoticons
+        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+        u"\U0001F680-\U0001F6FF"  # transport & map symbols
+        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+        u"\U00002702-\U000027B0"
+        u"\U000024C2-\U0001F251"
+        "]+",
+        flags=re.UNICODE,
+    )
+    return emoji_pattern.sub(r" ", string)
+
+
+def remove_unwanted(document, stopOK=False, punctuationOK=False, xtrasOK=False, emojiOk=False, unidecodeOK=False):
+    """Normalize a document: optionally drop punctuation, mentions/URLs/hashtags, emojis, accents and Spanish stopwords."""
+    if punctuationOK:
+        # remove punctuation
+        for sig in [".", ",", "!", "¿", "?", "=", "(", ")"]:
+            document = document.replace(sig, " ")
+    if xtrasOK:
+        # remove user mentions
+        document = re.sub("@[A-Za-z0-9_]+", " ", document)
+        # remove URLs
+        document = re.sub(r"http\S+", " ", document)
+        # remove hashtags
+        document = re.sub("#[A-Za-z0-9_]+", "", document)
+    if emojiOk:
+        # remove emojis
+        document = remove_emoji(document)
+    if unidecodeOK:
+        # strip accents and other non-ASCII marks
+        document = unidecode(document)
+    if stopOK:
+        # remove Spanish stopwords
+        stop_words = set(stopwords.words("spanish"))
+        words = [w for w in document.split(" ") if w not in stop_words]
+        document = " ".join(words)
+    # collapse double spaces
+    document = document.replace("  ", " ")
+    return document.strip().lower()
+
+
+def loadmodelEmb(model_name="embeddings/all-MiniLM-L6-v2", device="cpu"):
+    """Load the sentence-transformers embedding model."""
+    return SentenceTransformer(model_name, device=device)
+
+
+def loadCopysAndData(pathsqlite):
+    """Load copies with a non-null intentionality from SQLite and build two document sets."""
+    con = sqlite3.connect(pathsqlite)
+    copies_df = pd.read_sql_query("SELECT * FROM copies WHERE intentionality IS NOT NULL", con)
+    data = copies_df[["copy_message", "id", "name", "intentionality"]]
+    documents = DataFrameLoader(data, page_content_column="copy_message").load()
+    documents2 = DataFrameLoader(data, page_content_column="intentionality").load()
+    return documents, documents2
+
+
+def makeFaissdb(documents, folder_path, embedding):
+    """Load a persisted FAISS index if one exists; otherwise build it from the documents and save it."""
+    try:
+        db = FAISS.load_local(folder_path=folder_path, embeddings=embedding)
+    except Exception:
+        db = FAISS.from_documents(documents, embedding)
+        db.save_local(folder_path=folder_path)
+    return db
+
+
+class Response(BaseModel):
+    query: str
+    filtred: Optional[float] = -9.0
+
+
+def FinderDbs(query, dbs, filtred=0.4):
+    """Search every db for the query and merge hits by copy id, keeping the lowest distance per id."""
+    AllData = {}
+    for dbt in dbs:
+        Sal = dbt.similarity_search_with_score(query, 4)
+        for output in Sal:
+            doc_id = output[0].metadata["id"]
+            if doc_id in AllData:
+                AllData[doc_id]["d"] = min([AllData[doc_id]["d"], output[1]])
+            else:
+                AllData[doc_id] = {"d": output[1], "page_content": output[0].page_content}
+    if filtred > 0:
+        # keep only hits whose distance beats the cutoff
+        filtredData = {}
+        for row in AllData.keys():
+            if AllData[row]["d"] < filtred:
+                filtredData[row] = AllData[row]
+        return filtredData, filtredData.keys()
+    return AllData, AllData.keys()

diff --git a/main.py b/main.py
--- a/main.py
+++ b/main.py
@@ -1,53 +1,50 @@
+from general import extractConfig, loadCopysAndData, loadmodelEmb, makeFaissdb, FinderDbs, Response
     def embed_documents(self, texts: List[str]) -> List[float]:
         #print(text,"text")
@@ -54,135 +51,25 @@ class CustomEmbedding(Embeddings, BaseModel,):
         Sal=[]
         for text in texts:
             Sal.append(self._get_embedding(text))
-
         return Sal
     def embed_query(self, text: str) -> List[float]:
         return self._get_embedding(text)
-def remove_emoji(string):
-    emoji_pattern = re.compile("["
-        u"\U0001F600-\U0001F64F"  # emoticons
-        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
-        u"\U0001F680-\U0001F6FF"  # transport & map symbols
-        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
-        u"\U00002702-\U000027B0"
-        u"\U000024C2-\U0001F251"
-        "]+", flags=re.UNICODE)
-    return emoji_pattern.sub(r' ', string)
-
-def remove_unwanted(document,stopOK=False,punctuationOK=False,xtrasOK=False, emojiOk=False, unidecodeOK=False):
-    if punctuationOK:
-        # remove punctuation
-        for sig in [".",",","!","¿","?","=","(",")"]:
-            document=document.replace(sig," ")
-
-    if xtrasOK:
-        # remove user mentions
-        document = re.sub("@[A-Za-z0-9_]+"," ", document)
-        # remove URLS
-        document = re.sub(r'http\S+', ' ', document)
-        # remove hashtags
-        document = re.sub("#[A-Za-z0-9_]+","", document)
-    if emojiOk:
-        # remove emoji's
-        document = remove_emoji(document)
-
-    #document = re.sub("[^0-9A-Za-z ]", "" , document)
-    # remove double spaces
-    #print(document)
-    if unidecodeOK:
-        document=unidecode(document)
-
-    if stopOK:
-        words=document.split(" ")
-        stop_words = set(stopwords.words('spanish'))
-        words = [w for w in words if not w in stop_words]
-        document=" ".join(words)
-
-    document = document.replace('  ',"")
-    #print(document)
-    return document.strip().lower()
-
-def loadmodelEmb(model_name = "embeddings/all-MiniLM-L6-v2",model_kwargs = {'device': 'cpu'}):
-    st = SentenceTransformer(model_name,device='cpu')
-    return st
-
-
-def loadCopysAndData(pathsqlite=pathDb):
-    con = sqlite3.connect(pathsqlite)
-    copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con)
-    copiesT = copies_df
-    copiesT=copiesT[["copy_message","id","name","intentionality"]]
-    #print(copiesT)
-    data = copiesT
-    #print(data)
-    B=DataFrameLoader(data,page_content_column="copy_message")
-    B2=DataFrameLoader(data,page_content_column="intentionality")
-    documents=B.load()
-    documents2=B2.load()
-    return documents,documents2
-
-def makeFaissdb(documents,folder_path,embedding):
-    try:
-        db=FAISS.load_local(folder_path=folder_path,embeddings=embedding)
-    except:
-        db = FAISS.from_documents(documents, embedding)
-        FAISS.save_local(db,folder_path=folder_path)
-    return db
-
-#llm,emb=loadModels()
-documents,documents2=loadCopysAndData()
+nameModel="Modelo_embedding_Mexico_Puebla_hiiamasid"
+model=extractConfig(nameModel=nameModel,dataOut="path_model")+"/model"
+entrenamiento="V1.3"
+pathsqlite=extractConfig(nameModel=nameModel,dataOut="pathsqlite")
+keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics")
+documents,documents2=loadCopysAndData(pathsqlite)
 emb=loadmodelEmb(model_name = model)
 emb2=CustomEmbedding()
 db=makeFaissdb(documents,"Copies3",emb2)
 db2=makeFaissdb(documents2,"Intentionality3",emb2)
-#db3=makeFaissdb(documents2,"nameshf",hf)
-#identifier = LanguageIdentifier.from_modelstring(modellangid, norm_probs=True)
-
-
-
-def FinderDbs(query,dbs,filtred=0.4):
-    AllData={}
-    for dbt in dbs:
-        Sal = dbt.similarity_search_with_score(query,4)
-        for output in Sal:
-            if output[0].metadata["id"] in AllData.keys():
-                AllData[output[0].metadata["id"]]["d"]=min([AllData[output[0].metadata["id"]]["d"],output[1]])
-            else:
-                AllData[output[0].metadata["id"]]={"d":output[1],"page_content":output[0].page_content}
-    #for item in AllData.items():
-    #    print(item)
-
-    if filtred>0:
-        filtredData={}
-        for row in AllData.keys():
-            if AllData[row]["d"]<filtred:
-                filtredData[row]=AllData[row]
-        return filtredData,filtredData.keys()
-    return AllData,AllData.keys()
@@ ... @@
+<context>
+%s
+</context>
+{AI_PROMPT}"""%(copie)
+    completion = anthropic.completions.create(
+        model="claude-2",
+        max_tokens_to_sample=600,
+        prompt=promptF3,
+    )
+
+    pregunta=query
+    promptFv2=f"""Tu eres un asistente de IA en un chatbot llamado Angela; como asistente, tu labor es ayudar a los usuarios de la pagina web de la alcaldia de Puebla respondiendo sus preguntas.
+Aqui te dare las reglas que debes seguir durante la conversacion:
+
+- Siempre te mantendras en el personaje de Angela.
+- Si no estas seguro de la respuesta basada en el contexto, responde el siguiente texto: "Lo siento, ¿podrias formular la pregunta de nuevo? No entendi tu pregunta porque soy un sistema que esta en mejora en este momento".
+- No menciones el contexto si la pregunta no puede ser contestada con el.
+- Siempre responderas de manera amable pero formal.
+
+<context>
+%s
+</context>
+
+{HUMAN_PROMPT} Tengo la siguiente pregunta entre la etiqueta <question> y, basandote en el contexto que esta en la etiqueta <context>, responde la pregunta entre la etiqueta <response>:
+<question>
+%s
+</question>
+"""%(copie,pregunta)
+
+    promptF3v2=promptFv2+f"""
+{AI_PROMPT}"""
+    completionv2 = anthropic.completions.create(
+        model="claude-2.1",
+        max_tokens_to_sample=600,
+        prompt=promptF3v2,
+    )
+    return {"text":completion.completion,"text2":completionv2.completion}
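
Usage note: a minimal client sketch for exercising the refactored service over
HTTP. It assumes main.py is served with `uvicorn main:app` on port 8000 and
that the Anthropic handler shown above is mounted at POST /angela; the route
and port are illustrative assumptions, not taken from this commit. The
"query"/"filtred" fields mirror the Response model in general.py, and the
"text"/"text2" keys mirror the handler's return value.

    # client_example.py -- illustrative sketch, not part of the commit
    import requests

    # Response model fields (general.py): `query` is the user question;
    # `filtred` is the FAISS distance cutoff used by FinderDbs (values <= 0,
    # such as the default -9.0, disable filtering).
    payload = {"query": "¿Donde puedo pagar el predial?", "filtred": 0.4}

    # ASSUMED route and port; the commit does not show the app/endpoint setup.
    r = requests.post("http://localhost:8000/angela", json=payload, timeout=120)
    r.raise_for_status()

    data = r.json()
    print(data["text"])   # claude-2 answer
    print(data["text2"])  # claude-2.1 answer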