import json
import re
import sqlite3
from pathlib import Path
from typing import List, Optional

import pandas as pd
from fastapi import FastAPI
from langchain.document_loaders import DataFrameLoader
from langchain.schema.embeddings import Embeddings
from langchain.vectorstores import FAISS
from nltk.corpus import stopwords
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
from unidecode import unidecode


def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",
                  relPath="./conf/experiment_config.json",
                  dataOut="train_dataset_pos"):
    """Read one value from a model's section of the experiment config JSON."""
    configPath = Path(relPath)
    with open(configPath, 'r', encoding='utf-8') as file:
        config = json.load(file)[nameModel]
    return config[dataOut]


def remove_emoji(string):
    """Replace emoji characters with a single space."""
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)


def remove_unwanted(document, stopOK=False, punctuationOK=False,
                    xtrasOK=False, emojiOk=False, unidecodeOK=False):
    """Normalize a document: optionally strip punctuation, mentions/URLs/hashtags,
    emoji, accents, and Spanish stopwords; always collapse spaces, trim, and lowercase."""
    if punctuationOK:
        # remove punctuation
        for sig in [".", ",", "!", "¿", "?", "=", "(", ")"]:
            document = document.replace(sig, " ")
    if xtrasOK:
        # remove user mentions
        document = re.sub("@[A-Za-z0-9_]+", " ", document)
        # remove URLs
        document = re.sub(r'http\S+', ' ', document)
        # remove hashtags
        document = re.sub("#[A-Za-z0-9_]+", "", document)
    if emojiOk:
        # remove emojis
        document = remove_emoji(document)
    if unidecodeOK:
        # strip accents/diacritics
        document = unidecode(document)
    if stopOK:
        # remove Spanish stopwords
        words = document.split(" ")
        stop_words = set(stopwords.words('spanish'))
        words = [w for w in words if w not in stop_words]
        document = " ".join(words)
    # collapse the double spaces left by the substitutions above
    document = document.replace('  ', ' ')
    return document.strip().lower()


def loadmodelEmb(model_name="embeddings/all-MiniLM-L6-v2",
                 model_kwargs={'device': 'cpu'}):
    """Load the sentence-transformers model used to embed copies and queries."""
    st = SentenceTransformer(model_name, device='cpu')
    return st


def loadCopysAndData(pathsqlite):
    """Load labeled copies from SQLite and wrap them as two sets of LangChain
    documents: one keyed on the copy text, one on its intentionality label."""
    con = sqlite3.connect(pathsqlite)
    copies_df = pd.read_sql_query(
        "SELECT * from copies WHERE intentionality IS NOT NULL", con)
    data = copies_df[["copy_message", "id", "name", "intentionality"]]
    B = DataFrameLoader(data, page_content_column="copy_message")
    B2 = DataFrameLoader(data, page_content_column="intentionality")
    documents = B.load()
    documents2 = B2.load()
    return documents, documents2


def makeFaissdb(documents, folder_path, embedding):
    """Load a FAISS index from disk, or build and persist it if loading fails."""
    try:
        db = FAISS.load_local(folder_path=folder_path, embeddings=embedding)
    except Exception:
        db = FAISS.from_documents(documents, embedding)
        db.save_local(folder_path=folder_path)
    return db


class Response(BaseModel):
    query: str
    filtred: Optional[float] = -9.0


def FinderDbs(query, dbs, filtred=0.4):
    """Search every FAISS index for the query, keep the best (lowest) distance
    per document id, and optionally filter results by a distance threshold."""
    AllData = {}
    for dbt in dbs:
        Sal = dbt.similarity_search_with_score(query, 4)
        for output in Sal:
            doc_id = output[0].metadata["id"]
            if doc_id in AllData:
                AllData[doc_id]["d"] = min(AllData[doc_id]["d"], output[1])
            else:
                AllData[doc_id] = {"d": output[1],
                                   "page_content": output[0].page_content}
    if filtred > 0:
        # keep only hits whose distance is below the threshold
        filtredData = {}
        for row in AllData:
            if AllData[row]["d"] < filtred:
                filtredData[row] = AllData[row]
        return filtredData, filtredData.keys()
    return AllData, AllData.keys()
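
# --- Usage sketch (not part of the original fragment) ---
# LangChain's FAISS helpers expect an object implementing the Embeddings
# interface, while loadmodelEmb() returns a raw SentenceTransformer. The
# Embeddings import above suggests an adapter like the one below; the class
# name CustomEmbedding, the config key "pathsqlite", the folder names, and
# the sample query are all assumptions for illustration, not confirmed by
# the source.

class CustomEmbedding(Embeddings):
    """Expose a SentenceTransformer through LangChain's Embeddings API."""

    def __init__(self, model):
        self.model = model

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        # encode() returns a numpy array; LangChain expects plain lists
        return self.model.encode(texts).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode([text])[0].tolist()


if __name__ == "__main__":
    # Assumed wiring: "pathsqlite" is a hypothetical config key.
    pathsqlite = extractConfig(dataOut="pathsqlite")
    documents, documents2 = loadCopysAndData(pathsqlite)
    emb = CustomEmbedding(loadmodelEmb())
    db1 = makeFaissdb(documents, "Copies", emb)
    db2 = makeFaissdb(documents2, "Intentionality", emb)
    hits, ids = FinderDbs("¿Dónde puedo pagar el predial?", [db1, db2],
                          filtred=0.4)  # distance threshold; tune per model
    print(hits)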