Pdf and gitignore
This commit is contained in:
parent
430d347c0b
commit
125a7137d6
|
@ -0,0 +1 @@
|
|||
PaginasExternas/*
|
|
@ -0,0 +1,158 @@
|
|||
import gradio as gr
|
||||
from faiss import write_index, read_index
|
||||
|
||||
from langchain import PromptTemplate
|
||||
from langchain.chains import LLMChain
|
||||
from langchain.document_loaders import TextLoader
|
||||
from langchain.text_splitter import CharacterTextSplitter
|
||||
from langchain.vectorstores import FAISS
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain.document_loaders import UnstructuredFileLoader
|
||||
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
|
||||
from langchain.document_loaders import UnstructuredURLLoader
|
||||
from langchain.document_loaders.csv_loader import CSVLoader
|
||||
from langchain import LLMChain
|
||||
from langchain.llms import GPT4All
|
||||
from langchain.embeddings import GPT4AllEmbeddings
|
||||
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
||||
from langchain.callbacks.base import BaseCallbackManager
|
||||
from langchain.document_loaders import DataFrameLoader
|
||||
from langchain.embeddings import HuggingFaceEmbeddings
|
||||
import pandas as pd
|
||||
import sqlite3
|
||||
from sentence_transformers import SentenceTransformer
|
||||
#from cleantext import clean
|
||||
import time
|
||||
import re
|
||||
|
||||
def loadModels():
    """Instantiate the local GPT4All LLM and its embedding model.

    Returns:
        (llm, embeddings): the chat model (streaming tokens to stdout
        through the callback manager) and a GPT4AllEmbeddings instance.
    """
    stdout_callbacks = BaseCallbackManager([StreamingStdOutCallbackHandler()])
    llm = GPT4All(
        model="orca-mini-3b.ggmlv3.q4_0.bin",
        temp=0.1,
        streaming=True,
        callback_manager=stdout_callbacks,
    )
    embeddings = GPT4AllEmbeddings()
    return llm, embeddings
|
||||
|
||||
# Timing markers around model loading so startup cost can be measured.
st=time.time()
llm,emb=loadModels()
st2=time.time()

# Source list consumed by makeDb(): [location, kind] pairs, where kind
# selects the loader ("Web" = single page fetch, "RecursiveWeb" = crawl,
# "PDF" = local file).
Archives=[["https://www.gob.mx/sectur","Web"],
["https://centrohistorico.pueblacapital.gob.mx/nuestro-centro-historico/nuestro-blog/item/33-capilla-de-la-virgen-del-rosario","Web"],
["https://capilladelrosariopuebla.com.mx/","Web"],
#["https://www.tripadvisor.es/Tourism-g150768-Mexico-Vacations.html","Web"],
["https://www.mexicodestinos.com/blog/destinos-mexico/","Web"],
["https://visitmexico.com/","Web"],
["https://www.turismomexico.es/","Web"],

]
|
||||
|
||||
def makeDb(Archives,max_depth=2):
    """Load every configured source and return the combined document list.

    Args:
        Archives: list of [location, kind] pairs. kind is one of
            "PDF" (local file), "RecursiveWeb" (crawl starting at the
            URL) or "Web" (single page fetch).
        max_depth: crawl depth used for "RecursiveWeb" entries.

    Returns:
        A flat list of loaded documents (empty if nothing loaded).
    """
    data=[]
    for Archive in Archives:
        if Archive[1]=="PDF":
            loader = UnstructuredFileLoader(Archive[0])
        elif Archive[1]=="RecursiveWeb":
            loader = RecursiveUrlLoader(url=Archive[0], max_depth=max_depth)
        elif Archive[1]=="Web":
            loader = UnstructuredURLLoader(urls=[Archive[0]])
        else:
            # BUG FIX: the original fell through with `data2` unbound
            # (NameError on the first unknown kind) or silently re-appended
            # the previous iteration's documents. Skip unknown kinds.
            continue
        # extend() handles both the first and subsequent batches; no need
        # for the original `if data==[]` special case.
        data.extend(loader.load())
    return data
|
||||
# Scrape every configured source once at import time (network I/O).
A=makeDb(Archives)
|
||||
|
||||
|
||||
def makeFinder():
    """Return a FAISS vector store over the scraped corpus ``A``.

    Tries to load a previously persisted index from "PaginasExternas";
    if that fails, builds the index from the split documents using the
    module-level embeddings ``emb`` and persists it for next startup.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        length_function=len,
        chunk_overlap=200,
    )
    documents = text_splitter.split_documents(A)
    try:
        db = FAISS.load_local(folder_path="PaginasExternas", embeddings=emb)
    except Exception:
        # FIX: was a bare `except:`, which also swallows KeyboardInterrupt
        # and SystemExit. Any load failure (missing/corrupt index) means
        # we rebuild and persist.
        db = FAISS.from_documents(documents, emb)
        db.save_local(folder_path="PaginasExternas")
    return db
|
||||
|
||||
# Load (or build and persist) the FAISS index once at import time.
db=makeFinder()
|
||||
|
||||
|
||||
# Raw prompt template for PromptTemplate below. FIX: the original used an
# f-string whose only effect was forcing every placeholder to be written
# as {{...}} just to escape it back to a single-brace placeholder -- the
# f-prefix interpolated nothing and invited accidental interpolation. A
# plain string with single-brace placeholders yields the identical value.
prompt_template = """
### System:
Answer the question based on the contexts below.
If the question cannot be answered using the information
provided answer with "I don't know". Ever answer in spanish.

### User:

Contexts:
{relevant_context}

Question:
{user_query}

### Response:

Respuesta:"""
|
||||
|
||||
|
||||
# Compile the raw template into a LangChain PromptTemplate and wire it to
# the local LLM. The two variables are filled in per-request by the chain.
prompt_template = PromptTemplate(
    template=prompt_template,
    input_variables=["relevant_context", "user_query"],
)

llm_chain = LLMChain(llm=llm, prompt=prompt_template)
|
||||
|
||||
|
||||
def FinderDb(query,dbs,filtred=False):
    """Return (context, distances) for the 2 documents nearest to *query*.

    Args:
        query: text to search for.
        dbs: vector store exposing ``similarity_search_with_score(query, k)``
            returning (document, distance) pairs.
        filtred: when True, keep only hits with distance below 1.5.

    Returns:
        (None, None) when no hit survives the filter; otherwise a
        newline-joined context string and the list of kept distances.
    """
    Sal = dbs.similarity_search_with_score(query,2)
    page_content=[]
    d=[]
    # When filtering is off, use a threshold no real distance can exceed.
    lim = 1.5 if filtred else 9000000
    for output in Sal:
        print(output)
        if output[1]<lim:
            page_content.append(output[0].page_content)
            d.append(output[1])
    if len(page_content)<1:
        return None,None
    # BUG FIX: the original joined with the literal two-character string
    # "/n"; a newline separator was clearly intended.
    return "\n".join(page_content),d
|
||||
|
||||
|
||||
def QARequest(Pregunta,filtred=True):
    """Answer handler for the Gradio UI.

    Args:
        Pregunta: the user's question.
        filtred: forwarded to FinderDb. Defaults to True to preserve the
            behaviour the original hard-coded (it ignored its own
            parameter and always passed filtred=True).

    Returns:
        (context, distances) from the vector store. The LLM call is
        currently commented out, so the raw retrieved context is shown.
    """
    query = Pregunta
    # BUG FIX: honour the filtred argument instead of hard-coding True.
    context,d=FinderDb(query,db,filtred=filtred)
    #A=llm_chain.run(user_query=query,relevant_context=context)
    return context,d#A
|
||||
|
||||
|
||||
#question = "What NFL team won the Super Bowl in the year Justin Bieber was born?"
|
||||
# --- Gradio front-end -------------------------------------------------
# One question box in, retrieved context plus distance metric out.
with gr.Blocks() as demo:
    Pregunta = gr.Textbox(label="Pregunta")
    Respuesta = gr.Textbox(label="Respuesta")
    metrica = gr.Textbox(label="metrica")
    Enviar_btn = gr.Button("Responder")
    Enviar_btn.click(
        fn=QARequest,
        inputs=[Pregunta],
        outputs=[Respuesta, metrica],
        api_name="Angela",
    )

demo.launch()
# print(A, time.time() - st, time.time() - st2)
|
||||
|
Binary file not shown.
Loading…
Reference in New Issue