def loadModels():
    """Create the GPT4All language model and the GPT4All embedding model.

    Returns:
        tuple: ``(llm, embeddings)`` where ``llm`` is a ``GPT4All`` instance
        built from ``orca-mini-3b.ggmlv3.q4_0.bin`` and ``embeddings`` is a
        ``GPT4AllEmbeddings`` instance.
    """
    # The model gets a callback manager holding a single stdout streaming
    # handler, and is created with streaming switched on.
    stdout_handler = StreamingStdOutCallbackHandler()
    llm_options = {
        "model": "orca-mini-3b.ggmlv3.q4_0.bin",
        "temp": 0.1,
        "streaming": True,
        "callback_manager": BaseCallbackManager([stdout_handler]),
    }
    # Tuple elements are evaluated left to right: LLM first, then embeddings.
    return GPT4All(**llm_options), GPT4AllEmbeddings()
def makeDb(Archives, max_depth=2):
    """Load every source listed in ``Archives`` into one list of documents.

    Args:
        Archives: iterable of ``[location, kind]`` entries (indexable; only
            positions 0 and 1 are read). ``kind`` selects the loader:

            * ``"PDF"``          -> ``UnstructuredFileLoader(location)``
            * ``"RecursiveWeb"`` -> ``RecursiveUrlLoader(url=location,
              max_depth=max_depth)``
            * ``"Web"``          -> ``UnstructuredURLLoader(urls=[location])``
        max_depth: forwarded to ``RecursiveUrlLoader`` for ``"RecursiveWeb"``
            entries; unused for the other kinds.

    Returns:
        list: a new list holding whatever each loader's ``load()`` returned,
        concatenated in the order of ``Archives``. Empty when ``Archives`` is
        empty.

    Raises:
        ValueError: if an entry's kind is not one of the three values above.
    """
    documents = []
    for archive in Archives:
        location, kind = archive[0], archive[1]
        if kind == "PDF":
            loader = UnstructuredFileLoader(location)
        elif kind == "RecursiveWeb":
            loader = RecursiveUrlLoader(url=location, max_depth=max_depth)
        elif kind == "Web":
            loader = UnstructuredURLLoader(urls=[location])
        else:
            # Previously an unknown kind either crashed with UnboundLocalError
            # (first entry) or silently re-added the previous entry's
            # documents (later entries). Fail loudly instead.
            raise ValueError(
                f"Unsupported archive kind {kind!r} for {location!r}; "
                'expected "PDF", "RecursiveWeb" or "Web"'
            )
        # extend() keeps the result a list owned by this function rather than
        # an alias of the first loader's return value.
        documents.extend(loader.load())
    return documents


def makeFinder():
    """Return the FAISS vector store, loading it from disk when possible.

    Reads the module-level ``A`` (documents to index) and ``emb`` (embedding
    model). The store is first loaded from the ``PaginasExternas`` folder; if
    that fails for any reason, ``A`` is split into chunks, a new store is
    built from them with ``emb``, saved to the same folder, and returned.

    Returns:
        The loaded or newly built ``FAISS`` vector store.
    """
    index_dir = "PaginasExternas"
    try:
        db = FAISS.load_local(folder_path=index_dir, embeddings=emb)
    except Exception:
        # Any load failure is treated as "no usable saved index", as before,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        db = None

    if db is None:
        # Splitting is only needed when the index has to be rebuilt.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            length_function=len,
            chunk_overlap=200,
        )
        documents = text_splitter.split_documents(A)
        db = FAISS.from_documents(documents, emb)
        db.save_local(folder_path=index_dir)

    return db
+ +### User: + +Contexts: +{{relevant_context}} + +Question: +{{user_query}} + +### Response: + +Respuesta:""" + + +prompt_template = PromptTemplate( + input_variables=["relevant_context","user_query"], template=prompt_template +) + +# prompt=prompt_template.format( +# user_query="Que lugar de mexico es el mas lindo?", +# relevant_context="Otaisa es el lugar mas lindo de mexico pero esta muy cerca de nethai que no es tan lindo pero la comida es muy buena" +# ) +# print(prompt) +llm_chain = LLMChain(llm=llm,prompt=prompt_template) + + +def FinderDb(query,dbs,filtred=False): + Sal = dbs.similarity_search_with_score(query,2) + page_content=[] + d=[] + if filtred: + lim=1.5 + else: + lim=9000000 + for output in Sal: + print(output) + if output[1]