"""Tourism Q&A RAG script.

Loads a local GPT4All model, scrapes a fixed list of Mexican-tourism web
pages into a FAISS vector store (cached on disk under ``PaginasExternas``),
and wires up a LangChain LLMChain whose prompt instructs the model to
answer in Spanish from retrieved context.

NOTE(review): the original source was whitespace-mangled (whole file on two
physical lines) and truncated inside ``FinderDb``; formatting has been
reconstructed and the truncated tail is flagged below.
"""

import re
import sqlite3
import time

import gradio as gr
import pandas as pd
from faiss import write_index, read_index
from sentence_transformers import SentenceTransformer

# NOTE: `from langchain import LLMChain` duplicated the langchain.chains
# import of the same name in the original; the redundant one was dropped.
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.callbacks.base import BaseCallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.document_loaders import (
    DataFrameLoader,
    TextLoader,
    UnstructuredFileLoader,
    UnstructuredURLLoader,
)
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain.embeddings import GPT4AllEmbeddings, HuggingFaceEmbeddings
from langchain.llms import GPT4All
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS


def loadModels():
    """Load the local GPT4All LLM and its embedding model.

    The LLM streams tokens to stdout as it generates (via the streaming
    callback handler). The model file is looked up by name in GPT4All's
    default model directory.

    Returns:
        tuple: ``(llm, embeddings)`` — a ``GPT4All`` LLM and a
        ``GPT4AllEmbeddings`` instance.
    """
    callback_manager = BaseCallbackManager([StreamingStdOutCallbackHandler()])
    llm = GPT4All(
        model="orca-mini-3b.ggmlv3.q4_0.bin",
        temp=0.1,  # low temperature: keep answers close to retrieved context
        streaming=True,
        callback_manager=callback_manager,
    )
    embeddings = GPT4AllEmbeddings()
    return llm, embeddings


# Model loading happens at import time; st/st2 bracket it for timing.
st = time.time()
llm, emb = loadModels()
st2 = time.time()

# Knowledge sources as (url_or_path, kind) pairs.
# kind is one of: "Web" (single page), "RecursiveWeb" (crawl to max_depth),
# "PDF" (local/unstructured file).
Archives = [
    ["https://www.gob.mx/sectur", "Web"],
    ["https://centrohistorico.pueblacapital.gob.mx/nuestro-centro-historico/nuestro-blog/item/33-capilla-de-la-virgen-del-rosario", "Web"],
    ["https://capilladelrosariopuebla.com.mx/", "Web"],
    # ["https://www.tripadvisor.es/Tourism-g150768-Mexico-Vacations.html", "Web"],  # deliberately disabled source
    ["https://www.mexicodestinos.com/blog/destinos-mexico/", "Web"],
    ["https://visitmexico.com/", "Web"],
    ["https://www.turismomexico.es/", "Web"],
]


def makeDb(Archives, max_depth=2):
    """Load every source in ``Archives`` into one list of documents.

    Args:
        Archives: list of ``[source, kind]`` pairs (see module-level list).
        max_depth: crawl depth used only for ``"RecursiveWeb"`` sources.

    Returns:
        list: concatenated LangChain documents from all sources.
    """
    data = []
    for source, kind in Archives:
        if kind == "PDF":
            loader = UnstructuredFileLoader(source)
        elif kind == "RecursiveWeb":
            loader = RecursiveUrlLoader(url=source, max_depth=max_depth)
        elif kind == "Web":
            loader = UnstructuredURLLoader(urls=[source])
        else:
            # Fix: the original fell through with a stale `data2` here,
            # which would re-append the previous source's documents (or
            # raise NameError on the first iteration). Skip unknown kinds.
            continue
        data.extend(loader.load())
    return data


A = makeDb(Archives)


def makeFinder():
    """Return a FAISS index over the scraped documents.

    Tries to load a previously saved index from ``PaginasExternas``;
    on failure it is rebuilt from the freshly split documents and saved
    for the next run.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        length_function=len,
        chunk_overlap=200,
    )
    documents = text_splitter.split_documents(A)
    try:
        db = FAISS.load_local(folder_path="PaginasExternas", embeddings=emb)
    except Exception:
        # Narrowed from a bare `except:` — still best-effort cache loading,
        # but no longer swallows SystemExit/KeyboardInterrupt.
        db = FAISS.from_documents(documents, emb)
        db.save_local(folder_path="PaginasExternas")
    return db


db = makeFinder()

# Prompt for the orca-mini chat format. The doubled braces survive the
# f-string so PromptTemplate later sees {relevant_context}/{user_query}.
# (Line breaks reconstructed; the original file's whitespace was mangled.)
prompt_template = f"""
### System:
Answer the question based on the contexts below. If the question cannot be answered using the information provided answer with "I don't know". Ever answer in spanish.

### User:
Contexts: {{relevant_context}}
Question: {{user_query}}

### Response:
Respuesta:"""

prompt_template = PromptTemplate(
    input_variables=["relevant_context", "user_query"],
    template=prompt_template,
)

llm_chain = LLMChain(llm=llm, prompt=prompt_template)


def FinderDb(query, dbs, filtred=False):
    """Retrieve up to 2 chunks similar to ``query`` from vectorstore ``dbs``.

    When ``filtred`` is true only results with a similarity score below 1.5
    are kept; otherwise the score cap is effectively unlimited.

    NOTE(review): the original source file is TRUNCATED inside this
    function (at ``if output[1]``). Everything after that condition is a
    reconstruction of the apparent intent (threshold-filter the scored
    results) — TODO: verify against the full original file.
    """
    Sal = dbs.similarity_search_with_score(query, 2)
    page_content = []
    d = []
    lim = 1.5 if filtred else 9000000
    for output in Sal:
        print(output)
        if output[1] < lim:
            # Reconstructed tail (see docstring): keep passing chunks/scores.
            page_content.append(output[0].page_content)
            d.append(output[1])
    return page_content, d