From 502cf96292a2ef6317d35e1fd392aeeb370fc4cb Mon Sep 17 00:00:00 2001
From: marioggil
Date: Thu, 21 Dec 2023 22:04:37 -0500
Subject: [PATCH] Refactor shared helpers out of main.py into a general.py
 library module.

---
 general.py | 121 +++++++++++++++++++++++++
 main.py    | 261 +++++++++++++++++++++++------------------------
 2 files changed, 232 insertions(+), 150 deletions(-)
 create mode 100644 general.py

diff --git a/general.py b/general.py
new file mode 100644
index 0000000..a666d6f
--- /dev/null
+++ b/general.py
@@ -0,0 +1,121 @@
+import json
+import re
+import sqlite3
+from pathlib import Path
+from typing import Optional
+
+import pandas as pd
+from langchain.document_loaders import DataFrameLoader
+from langchain.vectorstores import FAISS
+from nltk.corpus import stopwords
+from pydantic import BaseModel
+from sentence_transformers import SentenceTransformer
+from unidecode import unidecode
+
+
+def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla", relPath="./conf/experiment_config.json", dataOut="train_dataset_pos"):
+    """Read one model section of the JSON experiment config and return the requested key."""
+    configPath = Path(relPath)
+    with open(configPath, "r", encoding="utf-8") as file:
+        config = json.load(file)[nameModel]
+    return config[dataOut]
+
+
+def remove_emoji(string):
+    """Replace every emoji in the string with a space."""
+    emoji_pattern = re.compile(
+        "["
+        u"\U0001F600-\U0001F64F"  # emoticons
+        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+        u"\U0001F680-\U0001F6FF"  # transport & map symbols
+        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+        u"\U00002702-\U000027B0"
+        u"\U000024C2-\U0001F251"
+        "]+",
+        flags=re.UNICODE,
+    )
+    return emoji_pattern.sub(r" ", string)
+
+
+def remove_unwanted(document, stopOK=False, punctuationOK=False, xtrasOK=False, emojiOk=False, unidecodeOK=False):
+    """Normalize a document: optionally drop punctuation, mentions/URLs/hashtags, emojis, accents and Spanish stopwords."""
+    if punctuationOK:
+        # remove punctuation
+        for sig in [".", ",", "!", "¿", "?", "=", "(", ")"]:
+            document = document.replace(sig, " ")
+    if xtrasOK:
+        # remove user mentions
+        document = re.sub("@[A-Za-z0-9_]+", " ", document)
+        # remove URLs
+        document = re.sub(r"http\S+", " ", document)
+        # remove hashtags
+        document = re.sub("#[A-Za-z0-9_]+", "", document)
+    if emojiOk:
+        # remove emojis
+        document = remove_emoji(document)
+    if unidecodeOK:
+        # strip accents and other non-ASCII marks
+        document = unidecode(document)
+    if stopOK:
+        # remove Spanish stopwords
+        stop_words = set(stopwords.words("spanish"))
+        words = [w for w in document.split(" ") if w not in stop_words]
+        document = " ".join(words)
+    # collapse double spaces
+    document = document.replace("  ", " ")
+    return document.strip().lower()
+
+
+def loadmodelEmb(model_name="embeddings/all-MiniLM-L6-v2", device="cpu"):
+    """Load the sentence-transformers embedding model."""
+    return SentenceTransformer(model_name, device=device)
+
+
+def loadCopysAndData(pathsqlite):
+    """Load copies with a non-null intentionality from SQLite and build two document sets."""
+    con = sqlite3.connect(pathsqlite)
+    copies_df = pd.read_sql_query("SELECT * FROM copies WHERE intentionality IS NOT NULL", con)
+    data = copies_df[["copy_message", "id", "name", "intentionality"]]
+    documents = DataFrameLoader(data, page_content_column="copy_message").load()
+    documents2 = DataFrameLoader(data, page_content_column="intentionality").load()
+    return documents, documents2
+
+
+def makeFaissdb(documents, folder_path, embedding):
+    """Load a persisted FAISS index if one exists; otherwise build it from the documents and save it."""
+    try:
+        db = FAISS.load_local(folder_path=folder_path, embeddings=embedding)
+    except Exception:
+        db = FAISS.from_documents(documents, embedding)
+        db.save_local(folder_path=folder_path)
+    return db
+
+
+class Response(BaseModel):
+    query: str
+    filtred: Optional[float] = -9.0
+
+
+def FinderDbs(query, dbs, filtred=0.4):
+    """Search every db for the query and merge hits by copy id, keeping the lowest distance per id."""
+    AllData = {}
+    for dbt in dbs:
+        Sal = dbt.similarity_search_with_score(query, 4)
+        for output in Sal:
+            doc_id = output[0].metadata["id"]
+            if doc_id in AllData:
+                AllData[doc_id]["d"] = min([AllData[doc_id]["d"], output[1]])
+            else:
+                AllData[doc_id] = {"d": output[1], "page_content": output[0].page_content}
+    if filtred > 0:
+        # keep only hits whose distance beats the cutoff
+        filtredData = {}
+        for row in AllData.keys():
+            if AllData[row]["d"] < filtred:
+                filtredData[row] = AllData[row]
+        return filtredData, filtredData.keys()
+    return AllData, AllData.keys()

diff --git a/main.py b/main.py
--- a/main.py
+++ b/main.py
@@ -1,53 +1,50 @@
+from general import extractConfig, loadCopysAndData, loadmodelEmb, makeFaissdb, FinderDbs, Response
     def embed_documents(self, texts: List[str]) -> List[float]:
         #print(text,"text")
@@ -54,135 +51,25 @@ class CustomEmbedding(Embeddings, BaseModel,):
         Sal=[]
         for text in texts:
             Sal.append(self._get_embedding(text))
-
         return Sal
     def embed_query(self, text: str) -> List[float]:
         return self._get_embedding(text)
-def remove_emoji(string):
-    emoji_pattern = re.compile("["
-        u"\U0001F600-\U0001F64F"  # emoticons
-        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
-        u"\U0001F680-\U0001F6FF"  # transport & map symbols
-        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
-        u"\U00002702-\U000027B0"
-        u"\U000024C2-\U0001F251"
-        "]+", flags=re.UNICODE)
-    return emoji_pattern.sub(r' ', string)
-
-def remove_unwanted(document,stopOK=False,punctuationOK=False,xtrasOK=False, emojiOk=False, unidecodeOK=False):
-    if punctuationOK:
-        # remove punctuation
-        for sig in [".",",","!","¿","?","=","(",")"]:
-            document=document.replace(sig," ")
-
-    if xtrasOK:
-        # remove user mentions
-        document = re.sub("@[A-Za-z0-9_]+"," ", document)
-        # remove URLS
-        document = re.sub(r'http\S+', ' ', document)
-        # remove hashtags
-        document = re.sub("#[A-Za-z0-9_]+","", document)
-    if emojiOk:
-        # remove emoji's
-        document = remove_emoji(document)
-
-    #document = re.sub("[^0-9A-Za-z ]", "" , document)
-    # remove double spaces
-    #print(document)
-    if unidecodeOK:
-        document=unidecode(document)
-
-    if stopOK:
-        words=document.split(" ")
-        stop_words = set(stopwords.words('spanish'))
-        words = [w for w in words if not w in stop_words]
-        document=" ".join(words)
-
-    document = document.replace('  ',"")
-    #print(document)
-    return document.strip().lower()
-
-def loadmodelEmb(model_name = "embeddings/all-MiniLM-L6-v2",model_kwargs = {'device': 'cpu'}):
-    st = SentenceTransformer(model_name,device='cpu')
-    return st
-
-
-def loadCopysAndData(pathsqlite=pathDb):
-    con = sqlite3.connect(pathsqlite)
-    copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con)
-    copiesT = copies_df
-    copiesT=copiesT[["copy_message","id","name","intentionality"]]
-    #print(copiesT)
-    data = copiesT
-    #print(data)
-    B=DataFrameLoader(data,page_content_column="copy_message")
-    B2=DataFrameLoader(data,page_content_column="intentionality")
-    documents=B.load()
-    documents2=B2.load()
-    return documents,documents2
-
-def makeFaissdb(documents,folder_path,embedding):
-    try:
-        db=FAISS.load_local(folder_path=folder_path,embeddings=embedding)
-    except:
-        db = FAISS.from_documents(documents, embedding)
-        FAISS.save_local(db,folder_path=folder_path)
-    return db
-
-#llm,emb=loadModels()
-documents,documents2=loadCopysAndData()
+nameModel="Modelo_embedding_Mexico_Puebla_hiiamasid"
+model=extractConfig(nameModel=nameModel,dataOut="path_model")+"/model"
+entrenamiento="V1.3"
+pathsqlite=extractConfig(nameModel=nameModel,dataOut="pathsqlite")
+keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics")
+documents,documents2=loadCopysAndData(pathsqlite)
 emb=loadmodelEmb(model_name = model)
 emb2=CustomEmbedding()
 db=makeFaissdb(documents,"Copies3",emb2)
 db2=makeFaissdb(documents2,"Intentionality3",emb2)
-#db3=makeFaissdb(documents2,"nameshf",hf)
-#identifier = LanguageIdentifier.from_modelstring(modellangid, norm_probs=True)
-
-
-
-def FinderDbs(query,dbs,filtred=0.4):
-    AllData={}
-    for dbt in dbs:
-        Sal = dbt.similarity_search_with_score(query,4)
-        for output in Sal:
-            if output[0].metadata["id"] in AllData.keys():
-                AllData[output[0].metadata["id"]]["d"]=min([AllData[output[0].metadata["id"]]["d"],output[1]])
-            else:
-                AllData[output[0].metadata["id"]]={"d":output[1],"page_content":output[0].page_content}
-    #for item in AllData.items():
-    #    print(item)
-
-    if filtred>0:
-        filtredData={}
-        for row in AllData.keys():
-            if AllData[row]["d"]<filtred:
-                filtredData[row]=AllData[row]
-        return filtredData,filtredData.keys()
-    return AllData,AllData.keys()
@@ ... @@
+<context>
+%s
+</context>
+{AI_PROMPT}"""%(copie)
+    completion = anthropic.completions.create(
+        model="claude-2",
+        max_tokens_to_sample=600,
+        prompt=promptF3,
+    )
+
+    pregunta=query
+    promptFv2=f"""Tu eres un asistente de IA en un chatbot llamado Angela; como asistente, tu labor es ayudar a los usuarios de la pagina web de la alcaldia de Puebla respondiendo sus preguntas.
+Aqui te dare las reglas que debes seguir durante la conversacion:
+
+- Siempre te mantendras en el personaje de Angela.
+- Si no estas seguro de la respuesta basada en el contexto, responde el siguiente texto: "Lo siento, ¿podrias formular la pregunta de nuevo? No entendi tu pregunta porque soy un sistema que esta en mejora en este momento".
+- No menciones el contexto si la pregunta no puede ser contestada con el.
+- Siempre responderas de manera amable pero formal.
+
+<context>
+%s
+</context>
+
+{HUMAN_PROMPT} Tengo la siguiente pregunta entre la etiqueta <question> y, basandote en el contexto que esta en la etiqueta <context>, responde la pregunta entre la etiqueta <response>:
+<question>
+%s
+</question>
+"""%(copie,pregunta)
+
+    promptF3v2=promptFv2+f"""
+{AI_PROMPT}"""
+    completionv2 = anthropic.completions.create(
+        model="claude-2.1",
+        max_tokens_to_sample=600,
+        prompt=promptF3v2,
+    )
+    return {"text":completion.completion,"text2":completionv2.completion}
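
Usage note: a minimal client sketch for exercising the refactored service over
HTTP. It assumes main.py is served with `uvicorn main:app` on port 8000 and
that the Anthropic handler shown above is mounted at POST /angela; the route
and port are illustrative assumptions, not taken from this commit. The
"query"/"filtred" fields mirror the Response model in general.py, and the
"text"/"text2" keys mirror the handler's return value.

    # client_example.py -- illustrative sketch, not part of the commit
    import requests

    # Response model fields (general.py): `query` is the user question;
    # `filtred` is the FAISS distance cutoff used by FinderDbs (values <= 0,
    # such as the default -9.0, disable filtering).
    payload = {"query": "¿Donde puedo pagar el predial?", "filtred": 0.4}

    # ASSUMED route and port; the commit does not show the app/endpoint setup.
    r = requests.post("http://localhost:8000/angela", json=payload, timeout=120)
    r.raise_for_status()

    data = r.json()
    print(data["text"])   # claude-2 answer
    print(data["text2"])  # claude-2.1 answer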