# FindinDB.py -- state of the file from line 17 onward after change
# 67eb0e6..9811710 (the same change also adds the binary asset ``logo.jpg``).
#
# NOTE: ``gr``, ``FAISS`` and ``GPT4All`` are used below but are imported in
# the first 16 lines of the file, which are outside the portion shown here.
from contextlib import closing

from langchain.embeddings import GPT4AllEmbeddings
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.base import BaseCallbackManager
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
import pandas as pd
import sqlite3
from sentence_transformers import SentenceTransformer
#from cleantext import clean
import re

# Sentence-embedding model used for the "nameshf" index.
model_name = 'hiiamsid/sentence_similarity_spanish_es'
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Number of neighbours requested from every vector store per query.
_TOP_K = 4
# Amount subtracted from the score of an id that is hit more than once, so
# that repeated matches rank ahead of single matches.
_REPEAT_BONUS = 0.1


def loadModels():
    """Create the GPT4All LLM and its embedding model.

    Returns:
        A ``(llm, embeddings)`` tuple.
    """
    llm = GPT4All(model="orca-mini-3b.ggmlv3.q4_0.bin", temp=0.1, streaming=True)
    embeddings = GPT4AllEmbeddings()
    return llm, embeddings


def loadCopysAndData(pathsqlite="motor.sqlite"):
    """Load the ``copies`` table and turn its "T" rows into documents.

    Only rows whose ``copy_start`` column equals ``"T"`` are kept, reduced to
    the ``copy_message``, ``id`` and ``name`` columns.

    Args:
        pathsqlite: Path of the SQLite database file.

    Returns:
        ``(documents, documents2)``: the same rows loaded twice, first with
        ``copy_message`` as page content, then with ``name`` as page content.
    """
    # ``closing`` guarantees the connection is released; a bare
    # ``sqlite3.connect`` context manager only manages the transaction.
    with closing(sqlite3.connect(pathsqlite)) as con:
        copies_df = pd.read_sql_query("SELECT * from copies", con)

    selected = copies_df[copies_df.copy_start == "T"]
    selected = selected[["copy_message", "id", "name"]]

    documents = DataFrameLoader(selected, page_content_column="copy_message").load()
    documents2 = DataFrameLoader(selected, page_content_column="name").load()
    return documents, documents2


def makeFaissdb(documents, folder_path, embedding):
    """Return a FAISS index, loading it from disk or building it if needed.

    Args:
        documents: Documents to index when no saved index can be loaded.
        folder_path: Folder the index is loaded from / saved to.
        embedding: Embedding model used for loading, building and querying.

    Returns:
        The loaded or freshly built FAISS vector store.
    """
    try:
        return FAISS.load_local(folder_path=folder_path, embeddings=embedding)
    except Exception:
        # Any load failure falls back to a rebuild (best effort, as before),
        # but KeyboardInterrupt / SystemExit are no longer swallowed.
        db = FAISS.from_documents(documents, embedding)
        db.save_local(folder_path=folder_path)
        return db


llm, emb = loadModels()
documents, documents2 = loadCopysAndData()
db = makeFaissdb(documents, "Copies", emb)
db2 = makeFaissdb(documents2, "names", emb)
db3 = makeFaissdb(documents2, "nameshf", hf)


def FinderDbs(query, dbs, filtred=False, th=1.2):
    """Search several vector stores and merge the hits by document id.

    Every store is asked for its closest matches. The first time an id is
    seen, its score and page content are recorded. When the same id is hit
    again, its score becomes the smaller of the stored and the new score,
    each reduced by a fixed bonus; the page content of the first hit is kept.

    Args:
        query: Text to search for.
        dbs: Iterable of vector stores exposing
            ``similarity_search_with_score``.
        filtred: When true, drop every hit whose score is not below ``th``.
        th: Score threshold applied when ``filtred`` is true.

    Returns:
        ``(hits, ids)`` where ``hits`` maps id to
        ``{"d": score, "page_content": text}`` ordered by ascending score and
        ``ids`` is the key view of that mapping.
    """
    hits = {}
    for store in dbs:
        for doc, score in store.similarity_search_with_score(query, _TOP_K):
            doc_id = doc.metadata["id"]
            if doc_id in hits:
                hits[doc_id]["d"] = min(
                    hits[doc_id]["d"] - _REPEAT_BONUS, score - _REPEAT_BONUS
                )
            else:
                hits[doc_id] = {"d": score, "page_content": doc.page_content}

    # Diagnostic dump of every merged hit (before filtering and sorting).
    for item in hits.items():
        print(item)

    if filtred:
        hits = {doc_id: hit for doc_id, hit in hits.items() if hit["d"] < th}

    ranked = dict(sorted(hits.items(), key=lambda item: item[1]["d"]))
    return ranked, ranked.keys()


def QARequest(Pregunta, filtred=False):
    """Gradio callback: search the copies and names indexes for a question.

    Args:
        Pregunta: Question text typed by the user.
        filtred: Whether to apply the score threshold filter.

    Returns:
        The ``(hits, ids)`` tuple produced by ``FinderDbs``.
    """
    return FinderDbs(Pregunta, [db, db2], filtred)


with gr.Blocks() as demo:
    gr.Image("logo.jpg", height=100)
    gr.Markdown("Esta es la busqueda que hace el usuario")
    Pregunta = gr.Textbox(label="Pregunta")

    filtred = gr.Checkbox(label="filtrado")

    gr.Markdown("Respuestas para orca desde los copys")
    Respuesta = gr.Textbox(label="Respuesta")
    # NOTE: this module-level name shadows the builtin ``id``; it is kept
    # unchanged so existing references to the widget keep working.
    id = gr.Textbox(label="id")

    Enviar_btn = gr.Button("Responder")
    Enviar_btn.click(
        fn=QARequest,
        inputs=[Pregunta, filtred],
        outputs=[Respuesta, id],
        api_name="Angela",
    )

# Start the UI only when executed as a script, so that importing this module
# does not block on the web server.
if __name__ == "__main__":
    demo.launch()