Refactoring of code in libs.

Mario Gil 2023-12-21 22:04:37 -05:00
parent ceaa20af06
commit 502cf96292
2 changed files with 232 additions and 150 deletions

general.py (new file, 121 additions)

@@ -0,0 +1,121 @@
from sentence_transformers import SentenceTransformer
from fastapi import FastAPI
from unidecode import unidecode
from nltk.corpus import stopwords
from langchain.schema.embeddings import Embeddings
from langchain.document_loaders import DataFrameLoader
import re
from pathlib import Path
from typing import List
import json
import time
from pydantic import BaseModel
from langchain.vectorstores import FAISS
from typing import Optional
import sqlite3
import pandas as pd


def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla", relPath="./conf/experiment_config.json", dataOut="train_dataset_pos"):
    """Read the entry `dataOut` for model `nameModel` from the experiment config JSON."""
    configPath = Path(relPath)
    with open(configPath, 'r', encoding='utf-8') as file:
        config = json.load(file)[nameModel]
    Output = config[dataOut]
    return Output


def remove_emoji(string):
    """Replace emoji characters with a space."""
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)


def remove_unwanted(document, stopOK=False, punctuationOK=False, xtrasOK=False, emojiOk=False, unidecodeOK=False):
    """Normalize a text: optionally drop punctuation, mentions/URLs/hashtags, emojis, accents and Spanish stopwords."""
    if punctuationOK:
        # remove punctuation
        for sig in [".", ",", "!", "¿", "?", "=", "(", ")"]:
            document = document.replace(sig, " ")
    if xtrasOK:
        # remove user mentions
        document = re.sub("@[A-Za-z0-9_]+", " ", document)
        # remove URLs
        document = re.sub(r'http\S+', ' ', document)
        # remove hashtags
        document = re.sub("#[A-Za-z0-9_]+", "", document)
    if emojiOk:
        # remove emojis
        document = remove_emoji(document)
        #document = re.sub("[^0-9A-Za-z ]", "" , document)
        #print(document)
    if unidecodeOK:
        # strip accents / transliterate to ASCII
        document = unidecode(document)
    if stopOK:
        # drop Spanish stopwords
        words = document.split(" ")
        stop_words = set(stopwords.words('spanish'))
        words = [w for w in words if not w in stop_words]
        document = " ".join(words)
    # remove double spaces
    document = document.replace('  ', "")
    #print(document)
    return document.strip().lower()


def loadmodelEmb(model_name="embeddings/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'}):
    """Load the SentenceTransformer embedding model on CPU."""
    st = SentenceTransformer(model_name, device='cpu')
    return st


def loadCopysAndData(pathsqlite):
    """Read the `copies` table and expose it as two document sets: one keyed on the copy text, one on its intentionality."""
    con = sqlite3.connect(pathsqlite)
    copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con)
    copiesT = copies_df
    copiesT = copiesT[["copy_message", "id", "name", "intentionality"]]
    #print(copiesT)
    data = copiesT
    #print(data)
    B = DataFrameLoader(data, page_content_column="copy_message")
    B2 = DataFrameLoader(data, page_content_column="intentionality")
    documents = B.load()
    documents2 = B2.load()
    return documents, documents2


def makeFaissdb(documents, folder_path, embedding):
    """Load a FAISS index from `folder_path`, or build it from `documents` and save it if loading fails."""
    try:
        db = FAISS.load_local(folder_path=folder_path, embeddings=embedding)
    except Exception:
        db = FAISS.from_documents(documents, embedding)
        FAISS.save_local(db, folder_path=folder_path)
    return db


class Response(BaseModel):
    query: str
    filtred: Optional[float] = -9.0


def FinderDbs(query, dbs, filtred=0.4):
    """Search every db for `query`, merge hits by document id keeping the best (lowest) distance, and optionally filter by threshold."""
    AllData = {}
    for dbt in dbs:
        Sal = dbt.similarity_search_with_score(query, 4)
        for output in Sal:
            if output[0].metadata["id"] in AllData.keys():
                AllData[output[0].metadata["id"]]["d"] = min([AllData[output[0].metadata["id"]]["d"], output[1]])
            else:
                AllData[output[0].metadata["id"]] = {"d": output[1], "page_content": output[0].page_content}
    if filtred > 0:
        # keep only hits below the distance threshold, sorted by distance
        filtredData = {}
        for row in AllData.keys():
            if AllData[row]["d"] < filtred:
                filtredData[row] = AllData[row]
        filtredData = dict(sorted(filtredData.items(), key=lambda item: item[1]["d"]))
        return filtredData, filtredData.keys()
    else:
        AllData = dict(sorted(AllData.items(), key=lambda item: item[1]["d"]))
        return AllData, AllData.keys()
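Taken together, the helpers above cover the whole retrieval path: config lookup, text normalization, embedding, FAISS index construction, and ranked search. The sketch below is one way the module could be exercised on its own; it is illustrative only, and the config entry name, the index folder names, and the small STEmbedding adapter are assumptions mirrored from the calling code rather than part of this commit.

# Illustrative usage of general.py; config entry, paths and folder names are assumptions.
from typing import List
from langchain.schema.embeddings import Embeddings
from general import (extractConfig, loadCopysAndData, loadmodelEmb,
                     makeFaissdb, FinderDbs, remove_unwanted)

class STEmbedding(Embeddings):
    # Thin adapter so the raw SentenceTransformer can be used where langchain
    # expects an Embeddings object (main.py uses its own CustomEmbedding for this).
    def __init__(self, st):
        self.st = st
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self.st.encode(remove_unwanted(t, punctuationOK=True, unidecodeOK=True)).tolist() for t in texts]
    def embed_query(self, text: str) -> List[float]:
        return self.st.encode(remove_unwanted(text, punctuationOK=True, unidecodeOK=True)).tolist()

nameModel = "Modelo_embedding_Mexico_Puebla"                    # config entry to read (assumed)
model_path = extractConfig(nameModel=nameModel, dataOut="path_model") + "/model"
db_path = extractConfig(nameModel=nameModel, dataOut="pathsqlite")

documents, documents2 = loadCopysAndData(db_path)               # copy-text vs. intentionality documents
emb = STEmbedding(loadmodelEmb(model_name=model_path))          # embedding with the same preprocessing

db = makeFaissdb(documents, "Copies3", emb)                     # build or load the two FAISS indexes
db2 = makeFaissdb(documents2, "Intentionality3", emb)

results, ids = FinderDbs("¿Dónde pago el predial?", [db2, db], filtred=0.4)
print(list(ids), results)

main.py below does essentially the same wiring, but through its CustomEmbedding class and behind the FastAPI endpoints.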

main.py (111 additions, 150 deletions)

@@ -1,49 +1,46 @@
 #import gradio as gr
 from faiss import write_index, read_index
-from typing import List
-#from langchain import PromptTemplate
-from langchain.document_loaders import TextLoader
-from langchain.text_splitter import CharacterTextSplitter
 from langchain.vectorstores import FAISS
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.document_loaders import UnstructuredFileLoader
-from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
-from langchain.document_loaders import UnstructuredURLLoader
-from langchain.document_loaders.csv_loader import CSVLoader
-#from langchain import LLMChain
+from typing import List
 from pydantic import BaseModel
-from langchain.schema.embeddings import Embeddings
-from langchain.document_loaders import DataFrameLoader
-from langchain.embeddings import HuggingFaceEmbeddings
+from typing import Optional
+import re
+from pathlib import Path
+import time
+from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
+import json
 import pandas as pd
 import sqlite3
 from sentence_transformers import SentenceTransformer
 from fastapi import FastAPI
 from unidecode import unidecode
 from nltk.corpus import stopwords
-from typing import Optional
+from langchain.schema.embeddings import Embeddings
+from langchain.document_loaders import DataFrameLoader
+#from langchain import PromptTemplate
+# from langchain.document_loaders import TextLoader
+# from langchain.text_splitter import CharacterTextSplitter
+# from langchain.text_splitter import RecursiveCharacterTextSplitter
+# from langchain.document_loaders import UnstructuredFileLoader
+# from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
+# from langchain.document_loaders import UnstructuredURLLoader
+# from langchain.document_loaders.csv_loader import CSVLoader
+# #from langchain import LLMChain
+#
+#
+# from langchain.embeddings import HuggingFaceEmbeddings
 #from cleantext import clean
-import re
-from pathlib import Path
-import json
-#from langid.langid import LanguageIdentifier
-#from langid.langid import model as modellangid
-import time
-model="Modelo_embedding_Mexico_Puebla/all-mpnet-base-v2/model"
-entrenamiento="V1.3"
-def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",relPath="./conf/experiment_config.json",dataOut="train_dataset_pos"):
-    configPath=Path(relPath)
-    with open(configPath, 'r', encoding='utf-8') as file:
-        config = json.load(file)[nameModel]
-    Output= Path(config[dataOut])
-    return Output
+from general import FinderDbs,loadCopysAndData,loadmodelEmb,makeFaissdb,extractConfig,Response,remove_unwanted
-class CustomEmbedding(Embeddings, BaseModel,):
+class CustomEmbedding(Embeddings, BaseModel):
     """embedding model with preprocessing"""
     def _get_embedding(self,text) -> List[float]:
         #print(text,"text")
@@ -54,135 +51,25 @@ class CustomEmbedding(Embeddings, BaseModel,):
         Sal=[]
         for text in texts:
             Sal.append(self._get_embedding(text))
         return Sal
     def embed_query(self, text: str) -> List[float]:
         return self._get_embedding(text)
-def remove_emoji(string):
-    emoji_pattern = re.compile("["
-                           u"\U0001F600-\U0001F64F"  # emoticons
-                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
-                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
-                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
-                           u"\U00002702-\U000027B0"
-                           u"\U000024C2-\U0001F251"
-                           "]+", flags=re.UNICODE)
-    return emoji_pattern.sub(r' ', string)
-def remove_unwanted(document,stopOK=False,punctuationOK=False,xtrasOK=False, emojiOk=False, unidecodeOK=False):
-    if punctuationOK:
-        # remove punctuation
-        for sig in [".",",","!","¿","?","=","(",")"]:
-            document=document.replace(sig," ")
-    if xtrasOK:
-        # remove user mentions
-        document = re.sub("@[A-Za-z0-9_]+"," ", document)
-        # remove URLS
-        document = re.sub(r'http\S+', ' ', document)
-        # remove hashtags
-        document = re.sub("#[A-Za-z0-9_]+","", document)
-    if emojiOk:
-        # remove emoji's
-        document = remove_emoji(document)
-        #document = re.sub("[^0-9A-Za-z ]", "" , document)
-        # remove double spaces
-        #print(document)
-    if unidecodeOK:
-        document=unidecode(document)
-    if stopOK:
-        words=document.split(" ")
-        stop_words = set(stopwords.words('spanish'))
-        words = [w for w in words if not w in stop_words]
-        document=" ".join(words)
-    document = document.replace('  ',"")
-    #print(document)
-    return document.strip().lower()
-def loadmodelEmb(model_name = "embeddings/all-MiniLM-L6-v2",model_kwargs = {'device': 'cpu'}):
-    st = SentenceTransformer(model_name,device='cpu')
-    return st
-def loadCopysAndData(pathsqlite=pathDb):
-    con = sqlite3.connect(pathsqlite)
-    copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con)
-    copiesT = copies_df
-    copiesT=copiesT[["copy_message","id","name","intentionality"]]
-    #print(copiesT)
-    data = copiesT
-    #print(data)
-    B=DataFrameLoader(data,page_content_column="copy_message")
-    B2=DataFrameLoader(data,page_content_column="intentionality")
-    documents=B.load()
-    documents2=B2.load()
-    return documents,documents2
-def makeFaissdb(documents,folder_path,embedding):
-    try:
-        db=FAISS.load_local(folder_path=folder_path,embeddings=embedding)
-    except:
-        db = FAISS.from_documents(documents, embedding)
-        FAISS.save_local(db,folder_path=folder_path)
-    return db
-#llm,emb=loadModels()
-documents,documents2=loadCopysAndData()
+nameModel="Modelo_embedding_Mexico_Puebla_hiiamasid"
+model=extractConfig(nameModel=nameModel,dataOut="path_model")+"/model"
+entrenamiento="V1.3"
+pathsqlite=extractConfig(nameModel=nameModel,dataOut="pathsqlite")
+keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics")
+documents,documents2=loadCopysAndData(pathsqlite)
 emb=loadmodelEmb(model_name = model)
 emb2=CustomEmbedding()
 db=makeFaissdb(documents,"Copies3",emb2)
 db2=makeFaissdb(documents2,"Intentionality3",emb2)
-#db3=makeFaissdb(documents2,"nameshf",hf)
-#identifier = LanguageIdentifier.from_modelstring(modellangid, norm_probs=True)
-def FinderDbs(query,dbs,filtred=0.4):
-    AllData={}
-    for dbt in dbs:
-        Sal = dbt.similarity_search_with_score(query,4)
-        for output in Sal:
-            if output[0].metadata["id"] in AllData.keys():
-                AllData[output[0].metadata["id"]]["d"]=min([AllData[output[0].metadata["id"]]["d"],output[1]])
-            else:
-                AllData[output[0].metadata["id"]]={"d":output[1],"page_content":output[0].page_content}
-    #for item in AllData.items():
-    #    print(item)
-    if filtred>0:
-        filtredData={}
-        for row in AllData.keys():
-            if AllData[row]["d"]<filtred:
-                filtredData[row]=AllData[row]
-        filtredData=dict(sorted(filtredData.items(), key=lambda item: item[1]["d"]))
-        return filtredData,filtredData.keys()
-    else:
-        AllData=dict(sorted(AllData.items(), key=lambda item: item[1]["d"]))
-        return AllData,AllData.keys()
 app = FastAPI()
 @app.get("/")
 def read_main():
     return {"message": "This is your main app"}
-class Response(BaseModel):
-    query: str
-    filtred : Optional[float] = -9.0
 @app.post("/angela-api/")
 def calculate_api(response: Response):
     query = response.query
@@ -190,9 +77,8 @@ def calculate_api(response: Response):
         filtred = response.filtred
     except:
         filtred = -9.0
     AllData=FinderDbs(query,[db2,db],filtred)
-    print(AllData)
+    #print(AllData)
     versionL="_".join([model,entrenamiento])
     #tt=time.time()
     #if identifier.classify(query)[1]< 0.3:
@@ -209,7 +95,82 @@ def calculate_api(response: Response):
             id.append(i[0])
     return {"ids": id,"DC":dis,"modelo":versionL}
+
+@app.post("/angela-api-claude/")
+def calculate_api_claude(response: Response):
+    anthropic = Anthropic(api_key=keyanthropic)
+    query = response.query
+    try:
+        filtred = response.filtred
+    except:
+        filtred = -9.0
+    AllData=FinderDbs(query,[db2,db],filtred)
+    versionL="_".join([model,entrenamiento])
+    if AllData:
+        AllData = list(AllData)
+        dis=[]
+        id=[]
+        for k,i in enumerate(AllData[0].items()):
+            dis.append(str(i[1]['d']))
+            id.append(i[0])
+    if len(id)<1:
+        return {"text": {"completion": "No tengo información sobre este tema",
+                         "model": "claude-2.1",
+                         "stop_reason": "stop_sequence",
+                         "type": "completion",
+                         "id": "1",
+                         "stop": "\n\nHuman:",
+                         "log_id": "1"
+                         },
+                "text2": {
+                         "completion": "No tengo información sobre este tema",
+                         "model": "claude-2.1",
+                         "stop_reason": "stop_sequence",
+                         "type": "completion",
+                         "id": "1",
+                         "stop": "\n\nHuman:",
+                         "log_id": "1"
+                         }
+                }
+    con = sqlite3.connect(pathsqlite)
+    copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con)
+    copie = copies_df[copies_df["id"]==id[0]]["copy_message"].values[0]
+    promptF=f"""{HUMAN_PROMPT} Tengo un contexto por favor generame un resumen, el resumen deben ser con lenguaje amable para un publico mexicano y como si fuera una conversacion con la persona.
+"""
+    promptF3=promptF+f"""
+<contexto>%s</contexto>
+{AI_PROMPT}<resumen>"""%(copie)
+    completion = anthropic.completions.create(
+        model="claude-2",
+        max_tokens_to_sample=600,
+        prompt=promptF3,
+    )
+    pregunta=query
+    promptFv2=f"""Tu eres un asistente de IA en chatbot llamado Angela, como asistente tu labor es ayudar a los usuarios de la pagina web de la alcaldia de puebla respondiendo sus preguntas.
+Aqui te dare las reglas que debes seguir durante la conversacion:
+<reglas>
+- Siempre te mantendras en el personaje Angela.
+- Si no estas seguro de la respuesta basada en el contexto responde el suigiente texto: "Lo siento, podrias formular la pregunta de nuevo es que no entendi tu pregunta por que soy un sistema que esta en mejora en este momento".
+- No menciones el contexto si la pregunta no puede ser contestada con el.
+- Siempres responderas de manera amable pero formal.
+</reglas>
+<contexto>
+%s
+</contexto>
+{HUMAN_PROMPT} Tengo la siguiente pregunta entre la etiqueta <pregunta></pregunta> y basandote en el contexto que esta en la etiqueta <contexto></contexto> responde la pregunta entre la etiqueta <respuesta></respuesta>:
+<pregunta>
+%s
+</pregunta>
+"""%(copie,pregunta)
+    promptF3v2=promptFv2+f"""
+{AI_PROMPT}<respuesta>"""
+    completionv2 = anthropic.completions.create(
+        model="claude-2.1",
+        max_tokens_to_sample=600,
+        prompt=promptF3v2,
+    )
+    return {"text":completion,"text2":completionv2}