Refactoring of code in libs.
parent ceaa20af06
commit 502cf96292
@@ -0,0 +1,121 @@
from sentence_transformers import SentenceTransformer
from fastapi import FastAPI
from unidecode import unidecode
from nltk.corpus import stopwords
from langchain.schema.embeddings import Embeddings
from langchain.document_loaders import DataFrameLoader
import re
from pathlib import Path
from typing import List
import json
import time
from pydantic import BaseModel
from langchain.vectorstores import FAISS
from typing import Optional
import sqlite3
import pandas as pd
def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla", relPath="./conf/experiment_config.json", dataOut="train_dataset_pos"):
    configPath = Path(relPath)
    with open(configPath, 'r', encoding='utf-8') as file:
        config = json.load(file)[nameModel]
    Output = config[dataOut]
    return Output
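# Example usage (illustrative sketch, not part of the module): assumes
# ./conf/experiment_config.json has the shape
# {"Modelo_embedding_Mexico_Puebla": {"train_dataset_pos": "data/train_pos.csv", ...}};
# the "data/train_pos.csv" value is hypothetical.
#
#   train_path = extractConfig(dataOut="train_dataset_pos")
#   print(train_path)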


def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)

def remove_unwanted(document, stopOK=False, punctuationOK=False, xtrasOK=False, emojiOk=False, unidecodeOK=False):
    if punctuationOK:
        # remove punctuation
        for sig in [".", ",", "!", "¿", "?", "=", "(", ")"]:
            document = document.replace(sig, " ")

    if xtrasOK:
        # remove user mentions
        document = re.sub("@[A-Za-z0-9_]+", " ", document)
        # remove URLs
        document = re.sub(r'http\S+', ' ', document)
        # remove hashtags
        document = re.sub("#[A-Za-z0-9_]+", "", document)
    if emojiOk:
        # remove emojis
        document = remove_emoji(document)

    #document = re.sub("[^0-9A-Za-z ]", "" , document)
    # remove double spaces
    #print(document)
    if unidecodeOK:
        document = unidecode(document)

    if stopOK:
        words = document.split(" ")
        stop_words = set(stopwords.words('spanish'))
        words = [w for w in words if not w in stop_words]
        document = " ".join(words)

    # collapse double spaces left by the removals above
    document = document.replace("  ", " ")
    #print(document)
    return document.strip().lower()
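# Example usage (illustrative sketch): cleaning a short Spanish message before
# embedding. The stopword branch requires the NLTK Spanish stopwords to be
# downloaded (nltk.download("stopwords")); the sample text is hypothetical.
#
#   texto = "Hola @usuario, revisa https://example.com 😀"
#   limpio = remove_unwanted(texto, punctuationOK=True, xtrasOK=True, emojiOk=True, unidecodeOK=True)
#   # the mention, URL and emoji are stripped; the result is lowercased and trimmed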

def loadmodelEmb(model_name="embeddings/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'}):
    # model_kwargs is currently unused; the SentenceTransformer is always loaded on CPU
    st = SentenceTransformer(model_name, device='cpu')
    return st


def loadCopysAndData(pathsqlite):
    con = sqlite3.connect(pathsqlite)
    copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con)
    copiesT = copies_df
    copiesT = copiesT[["copy_message", "id", "name", "intentionality"]]
    #print(copiesT)
    data = copiesT
    #print(data)
    B = DataFrameLoader(data, page_content_column="copy_message")
    B2 = DataFrameLoader(data, page_content_column="intentionality")
    documents = B.load()
    documents2 = B2.load()
    return documents, documents2
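# Example usage (illustrative sketch): "copies.db" is a hypothetical filename;
# in main.py the path comes from the experiment config. The database is expected
# to contain a `copies` table with at least copy_message, id, name and
# intentionality columns.
#
#   docs_copy, docs_intent = loadCopysAndData("copies.db")
#   print(len(docs_copy), docs_copy[0].metadata["id"])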

def makeFaissdb(documents, folder_path, embedding):
    try:
        # reuse an existing index from disk if one was saved previously
        db = FAISS.load_local(folder_path=folder_path, embeddings=embedding)
    except:
        db = FAISS.from_documents(documents, embedding)
        FAISS.save_local(db, folder_path=folder_path)
    return db
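# Example usage (illustrative sketch): "Copies3" is the folder name used later
# in main.py; emb2 would be a CustomEmbedding instance (defined in main.py) or
# any other langchain Embeddings implementation.
#
#   db = makeFaissdb(docs_copy, "Copies3", emb2)
#   print(db.similarity_search_with_score("tramite de agua", 4))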

class Response(BaseModel):
    query: str
    filtred: Optional[float] = -9.0


def FinderDbs(query, dbs, filtred=0.4):
    AllData = {}
    for dbt in dbs:
        Sal = dbt.similarity_search_with_score(query, 4)
        for output in Sal:
            if output[0].metadata["id"] in AllData.keys():
                AllData[output[0].metadata["id"]]["d"] = min([AllData[output[0].metadata["id"]]["d"], output[1]])
            else:
                AllData[output[0].metadata["id"]] = {"d": output[1], "page_content": output[0].page_content}
    if filtred > 0:
        filtredData = {}
        for row in AllData.keys():
            if AllData[row]["d"] < filtred:
                filtredData[row] = AllData[row]
        filtredData = dict(sorted(filtredData.items(), key=lambda item: item[1]["d"]))
        return filtredData, filtredData.keys()
    else:
        AllData = dict(sorted(AllData.items(), key=lambda item: item[1]["d"]))
        return AllData, AllData.keys()
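# Example usage (illustrative sketch): search two FAISS stores and keep only
# matches whose distance is below 0.4; db and db2 are stores built with
# makeFaissdb above, and the query string is hypothetical.
#
#   results, ids = FinderDbs("pago de predial", [db2, db], filtred=0.4)
#   for id_, info in results.items():
#       print(id_, info["d"], info["page_content"][:60])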

main.py
@@ -1,49 +1,46 @@
#import gradio as gr
from faiss import write_index, read_index
from typing import List
#from langchain import PromptTemplate
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain.document_loaders import UnstructuredURLLoader
from langchain.document_loaders.csv_loader import CSVLoader
#from langchain import LLMChain
from typing import List
from pydantic import BaseModel
from langchain.schema.embeddings import Embeddings
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from typing import Optional
import re
from pathlib import Path
import time
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
import json
import pandas as pd
import sqlite3
from sentence_transformers import SentenceTransformer
from fastapi import FastAPI
from unidecode import unidecode
from nltk.corpus import stopwords
from typing import Optional
from langchain.schema.embeddings import Embeddings
from langchain.document_loaders import DataFrameLoader

#from langchain import PromptTemplate
# from langchain.document_loaders import TextLoader
# from langchain.text_splitter import CharacterTextSplitter

# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.document_loaders import UnstructuredFileLoader
# from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
# from langchain.document_loaders import UnstructuredURLLoader
# from langchain.document_loaders.csv_loader import CSVLoader
# #from langchain import LLMChain

# from langchain.embeddings import HuggingFaceEmbeddings


#from cleantext import clean
import re
from pathlib import Path
import json
#from langid.langid import LanguageIdentifier
#from langid.langid import model as modellangid
import time
model = "Modelo_embedding_Mexico_Puebla/all-mpnet-base-v2/model"
entrenamiento = "V1.3"

def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla", relPath="./conf/experiment_config.json", dataOut="train_dataset_pos"):
    configPath = Path(relPath)
    with open(configPath, 'r', encoding='utf-8') as file:
        config = json.load(file)[nameModel]

    Output = Path(config[dataOut])
    return Output


from general import FinderDbs, loadCopysAndData, loadmodelEmb, makeFaissdb, extractConfig, Response, remove_unwanted

class CustomEmbedding(Embeddings, BaseModel):
    """embedding model with preprocessing"""

    def _get_embedding(self, text) -> List[float]:
        #print(text,"text")

@@ -54,135 +51,25 @@ class CustomEmbedding(Embeddings, BaseModel,):
        Sal = []
        for text in texts:
            Sal.append(self._get_embedding(text))

        return Sal

    def embed_query(self, text: str) -> List[float]:
        return self._get_embedding(text)
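# Example usage (illustrative sketch): CustomEmbedding wraps the model loaded
# by loadmodelEmb and is passed to FAISS like any langchain Embeddings object,
# as done further below with emb2; the query string is hypothetical and the
# full _get_embedding body is elided by this hunk.
#
#   emb2 = CustomEmbedding()
#   vector = emb2.embed_query("¿Dónde pago el predial?")
#   print(len(vector))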


def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)

def remove_unwanted(document, stopOK=False, punctuationOK=False, xtrasOK=False, emojiOk=False, unidecodeOK=False):
    if punctuationOK:
        # remove punctuation
        for sig in [".", ",", "!", "¿", "?", "=", "(", ")"]:
            document = document.replace(sig, " ")

    if xtrasOK:
        # remove user mentions
        document = re.sub("@[A-Za-z0-9_]+", " ", document)
        # remove URLs
        document = re.sub(r'http\S+', ' ', document)
        # remove hashtags
        document = re.sub("#[A-Za-z0-9_]+", "", document)
    if emojiOk:
        # remove emojis
        document = remove_emoji(document)

    #document = re.sub("[^0-9A-Za-z ]", "" , document)
    # remove double spaces
    #print(document)
    if unidecodeOK:
        document = unidecode(document)

    if stopOK:
        words = document.split(" ")
        stop_words = set(stopwords.words('spanish'))
        words = [w for w in words if not w in stop_words]
        document = " ".join(words)

    # collapse double spaces left by the removals above
    document = document.replace("  ", " ")
    #print(document)
    return document.strip().lower()

def loadmodelEmb(model_name="embeddings/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'}):
    st = SentenceTransformer(model_name, device='cpu')
    return st


def loadCopysAndData(pathsqlite=pathDb):
    con = sqlite3.connect(pathsqlite)
    copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con)
    copiesT = copies_df
    copiesT = copiesT[["copy_message", "id", "name", "intentionality"]]
    #print(copiesT)
    data = copiesT
    #print(data)
    B = DataFrameLoader(data, page_content_column="copy_message")
    B2 = DataFrameLoader(data, page_content_column="intentionality")
    documents = B.load()
    documents2 = B2.load()
    return documents, documents2

def makeFaissdb(documents, folder_path, embedding):
    try:
        db = FAISS.load_local(folder_path=folder_path, embeddings=embedding)
    except:
        db = FAISS.from_documents(documents, embedding)
        FAISS.save_local(db, folder_path=folder_path)
    return db

#llm,emb=loadModels()

documents, documents2 = loadCopysAndData()
nameModel = "Modelo_embedding_Mexico_Puebla_hiiamasid"
model = extractConfig(nameModel=nameModel, dataOut="path_model") + "/model"
entrenamiento = "V1.3"
pathsqlite = extractConfig(nameModel=nameModel, dataOut="pathsqlite")
keyanthropic = extractConfig(nameModel="SystemData", dataOut="keyantrophics")
documents, documents2 = loadCopysAndData(pathsqlite)
emb = loadmodelEmb(model_name=model)
emb2 = CustomEmbedding()
db = makeFaissdb(documents, "Copies3", emb2)
db2 = makeFaissdb(documents2, "Intentionality3", emb2)
#db3=makeFaissdb(documents2,"nameshf",hf)
#identifier = LanguageIdentifier.from_modelstring(modellangid, norm_probs=True)
def FinderDbs(query,dbs,filtred=0.4):
|
||||
AllData={}
|
||||
for dbt in dbs:
|
||||
Sal = dbt.similarity_search_with_score(query,4)
|
||||
for output in Sal:
|
||||
if output[0].metadata["id"] in AllData.keys():
|
||||
AllData[output[0].metadata["id"]]["d"]=min([AllData[output[0].metadata["id"]]["d"],output[1]])
|
||||
else:
|
||||
AllData[output[0].metadata["id"]]={"d":output[1],"page_content":output[0].page_content}
|
||||
#for item in AllData.items():
|
||||
# print(item)
|
||||
|
||||
if filtred>0:
|
||||
filtredData={}
|
||||
for row in AllData.keys():
|
||||
if AllData[row]["d"]<filtred:
|
||||
filtredData[row]=AllData[row]
|
||||
filtredData=dict(sorted(filtredData.items(), key=lambda item: item[1]["d"]))
|
||||
return filtredData,filtredData.keys()
|
||||
|
||||
|
||||
else:
|
||||
AllData=dict(sorted(AllData.items(), key=lambda item: item[1]["d"]))
|
||||
return AllData,AllData.keys()

app = FastAPI()

@app.get("/")
def read_main():
    return {"message": "This is your main app"}

class Response(BaseModel):
    query: str
    filtred: Optional[float] = -9.0


@app.post("/angela-api/")
def calculate_api(response: Response):
    query = response.query

@@ -190,9 +77,8 @@ def calculate_api(response: Response):
        filtred = response.filtred
    except:
        filtred = -9.0

    AllData = FinderDbs(query, [db2, db], filtred)
    print(AllData)
    #print(AllData)
    versionL = "_".join([model, entrenamiento])
    #tt=time.time()
    #if identifier.classify(query)[1]< 0.3:

@@ -209,7 +95,82 @@ def calculate_api(response: Response):
        id.append(i[0])
    return {"ids": id, "DC": dis, "modelo": versionL}


@app.post("/angela-api-claude/")
def calculate_api_claude(response: Response):
    anthropic = Anthropic(api_key=keyanthropic)
    query = response.query
    try:
        filtred = response.filtred
    except:
        filtred = -9.0

    AllData = FinderDbs(query, [db2, db], filtred)
    versionL = "_".join([model, entrenamiento])
    if AllData:
        AllData = list(AllData)
        dis = []
        id = []
        for k, i in enumerate(AllData[0].items()):
            dis.append(str(i[1]['d']))
            id.append(i[0])
    if len(id) < 1:
        return {"text": {"completion": "No tengo información sobre este tema",
                         "model": "claude-2.1",
                         "stop_reason": "stop_sequence",
                         "type": "completion",
                         "id": "1",
                         "stop": "\n\nHuman:",
                         "log_id": "1"
                         },
                "text2": {"completion": "No tengo información sobre este tema",
                          "model": "claude-2.1",
                          "stop_reason": "stop_sequence",
                          "type": "completion",
                          "id": "1",
                          "stop": "\n\nHuman:",
                          "log_id": "1"
                          }
                }
    con = sqlite3.connect(pathsqlite)
    copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con)
    copie = copies_df[copies_df["id"] == id[0]]["copy_message"].values[0]
    promptF = f"""{HUMAN_PROMPT} Tengo un contexto por favor generame un resumen, el resumen deben ser con lenguaje amable para un publico mexicano y como si fuera una conversacion con la persona.
    """
    promptF3 = promptF + f"""
<contexto>%s</contexto>
{AI_PROMPT}<resumen>""" % (copie)
    completion = anthropic.completions.create(
        model="claude-2",
        max_tokens_to_sample=600,
        prompt=promptF3,
    )

    pregunta = query
    promptFv2 = f"""Tu eres un asistente de IA en chatbot llamado Angela, como asistente tu labor es ayudar a los usuarios de la pagina web de la alcaldia de puebla respondiendo sus preguntas.
Aqui te dare las reglas que debes seguir durante la conversacion:
<reglas>
- Siempre te mantendras en el personaje Angela.
- Si no estas seguro de la respuesta basada en el contexto responde el suigiente texto: "Lo siento, podrias formular la pregunta de nuevo es que no entendi tu pregunta por que soy un sistema que esta en mejora en este momento".
- No menciones el contexto si la pregunta no puede ser contestada con el.
- Siempres responderas de manera amable pero formal.
</reglas>
<contexto>
%s
</contexto>
{HUMAN_PROMPT} Tengo la siguiente pregunta entre la etiqueta <pregunta></pregunta> y basandote en el contexto que esta en la etiqueta <contexto></contexto> responde la pregunta entre la etiqueta <respuesta></respuesta>:
<pregunta>
%s
</pregunta>
""" % (copie, pregunta)

    promptF3v2 = promptFv2 + f"""
{AI_PROMPT}<respuesta>"""
    completionv2 = anthropic.completions.create(
        model="claude-2.1",
        max_tokens_to_sample=600,
        prompt=promptF3v2,
    )
    return {"text": completion, "text2": completionv2}