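"""RAG chatbot for Mexican tourism questions.

Scrapes the web sources listed in Archives, chunks and indexes them in FAISS
with GPT4All embeddings, and answers questions in Spanish through a Gradio
chat UI backed by a local orca-mini-3b model.
"""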
import time

import gradio as gr
from gpt4all import GPT4All
#from langchain.llms import GPT4All
from langchain import PromptTemplate
from langchain.document_loaders import UnstructuredFileLoader, UnstructuredURLLoader
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain.embeddings import GPT4AllEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
#from cleantext import clean
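
# Load the quantized chat model and the embedding model once at startup.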
def loadModels():
    """Load the local GPT4All chat model and its embedding model."""
    llm = GPT4All("orca-mini-3b.ggmlv3.q4_0.bin")
    #llm = GPT4All(model="orca-mini-3b.ggmlv3.q4_0.bin", temp=0.1, streaming=True, callbacks=[StreamingStdOutCallbackHandler()])  # verbose=True, repeat_last_n=0
    embeddings = GPT4AllEmbeddings()
    return llm, embeddings


st = time.time()
llm, emb = loadModels()
st2 = time.time()
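
# Sources to index. Each entry is [location, type]; the type selects the
# loader used in makeDb ("PDF", "RecursiveWeb" or "Web").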
Archives = [
    ["https://www.gob.mx/sectur", "Web"],
    ["https://centrohistorico.pueblacapital.gob.mx/nuestro-centro-historico/nuestro-blog/item/33-capilla-de-la-virgen-del-rosario", "Web"],
    #["https://capilladelrosariopuebla.com.mx/", "Web"],
    #["https://www.tripadvisor.es/Tourism-g150768-Mexico-Vacations.html", "Web"],
    ["https://www.mexicodestinos.com/blog/destinos-mexico/", "Web"],
    ["https://visitmexico.com/", "Web"],
    ["https://www.turismomexico.es/", "Web"],
    ["https://es.wikipedia.org/wiki/Capilla_del_Rosario_(Puebla)", "Web"],
    ["https://www.mexicodesconocido.com.mx/capilla-del-rosario-puebla.html", "Web"],
]


def makeDb(Archives, max_depth=2):
    """Load every source in Archives and return the combined list of documents."""
    data = []
    for Archive in Archives:
        if Archive[1] == "PDF":
            loader = UnstructuredFileLoader(Archive[0])
        elif Archive[1] == "RecursiveWeb":
            loader = RecursiveUrlLoader(url=Archive[0], max_depth=max_depth)
        elif Archive[1] == "Web":
            loader = UnstructuredURLLoader(urls=[Archive[0]])
        else:
            continue  # skip unknown source types instead of reusing stale data
        data = data + loader.load()
    return data


A = makeDb(Archives)
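
# Build the retriever once at startup; the FAISS index is persisted under
# ./PaginasExternas so later runs can skip re-embedding.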
def makeFinder():
    """Split the scraped documents into chunks and build (or load) the FAISS index."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=250,
        chunk_overlap=50,
        length_function=len,
    )
    documents = text_splitter.split_documents(A)
    try:
        db = FAISS.load_local(folder_path="PaginasExternas", embeddings=emb)
    except Exception:
        db = FAISS.from_documents(documents, emb)
        db.save_local(folder_path="PaginasExternas")
    return db


db = makeFinder()
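
# Prompt in the "### System / ### User / ### Response" layout that orca-mini
# style models expect; {relevant_context} and {user_query} are filled in per
# request through the PromptTemplate below.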
prompt_template_str = """
### System:
Answer the question based on the contexts below.
If the question cannot be answered using the information
provided, answer with "I don't know". Always answer in Spanish.

### User:

Contexts:
{relevant_context}

Question:
{user_query}

### Response:

Respuesta:"""

# Alternative prompt written entirely in Spanish, kept for reference:
# prompt_template_str = """
# ### System:
# Responde la pregunta basado en el contexto dado.
# Si la pregunta no puede ser contestada usando la informacion dada,
# responder con "No poseo conocimiento sobre ese tema". Responder siempre en español.
#
# ### User:
#
# Contexto:
# {relevant_context}
#
# Pregunta:
# {user_query}
#
# ### Response:
#
# Respuesta:"""

prompt_template = PromptTemplate(
    input_variables=["relevant_context", "user_query"], template=prompt_template_str
)

# Quick smoke test of the template (fictional context):
# prompt = prompt_template.format(
#     user_query="Que lugar de mexico es el mas lindo?",
#     relevant_context="Otaisa es el lugar mas lindo de mexico pero esta muy cerca de nethai que no es tan lindo pero la comida es muy buena",
# )
# print(prompt)
#llm_chain = LLMChain(llm=llm, prompt=prompt_template)


def FinderDb(query, dbs, filtered=False):
    """Return the top matching chunks joined into one string, plus their scores."""
    Sal = dbs.similarity_search_with_score(query, 9)
    page_content = []
    d = []
    # With filtering on, keep only chunks whose distance score is below 1.5;
    # otherwise accept everything via an effectively unbounded limit.
    if filtered:
        lim = 1.5
    else:
        lim = 9000000
    for output in Sal:
        if output[1] < lim:
            page_content.append(output[0].page_content)
            d.append(output[1])
    if len(page_content) < 1:
        return None, None
    return "\n".join(page_content), d
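
# Example of a standalone retrieval call (hypothetical query, for illustration):
#   context, scores = FinderDb("¿Dónde está la Capilla del Rosario?", db, filtered=True)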


def QARequest(message, history):
    """Gradio chat handler: retrieve context, build the prompt and stream the answer."""
    query = message
    context, d = FinderDb(query, db, filtered=True)
    prompt = prompt_template.format(
        user_query=query,
        relevant_context=context,
    )
    text2 = ""
    # Stream tokens into the chat window; the sleep throttles the update rate.
    for i in llm.generate(prompt, max_tokens=200, streaming=True, temp=0.01, top_k=40, top_p=0.4):
        time.sleep(0.7)
        text2 = text2 + i
        yield text2
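
# ChatInterface wires QARequest into a chat UI; queue() is what allows the
# generator handler to stream partial responses to the browser.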
gr.ChatInterface(QARequest).queue().launch()

# Earlier Blocks-based UI, kept for reference:
# with gr.Blocks() as demo:
#     Pregunta = gr.Textbox(label="Pregunta")
#     Respuesta = gr.Textbox(label="Respuesta")
#     metrica = gr.Textbox(label="metrica")
#     Enviar_btn = gr.Button("Responder")
#     Enviar_btn.click(fn=QARequest, inputs=[Pregunta], outputs=[Respuesta], api_name="Angela")
# demo.queue().launch()

#print(A, time.time()-st, time.time()-st2)