from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.pydantic_v1 import BaseModel
from langchain.schema.embeddings import Embeddings
from sentence_transformers import SentenceTransformer
from typing import List
import sqlite3
import pandas as pd
import shutil
import re
import numpy as np
import inspect
import time
from unidecode import unidecode
from nltk.corpus import stopwords
import seaborn as sns
import argparse
from scipy.spatial import distance
from pathlib import Path
import json
import os
import nltk

parser = argparse.ArgumentParser()
parser.add_argument("-f", "--file", help="Name of the file to process")
parser.add_argument("-d", "--distance", default="distance")
parser.add_argument("-m", "--models", default="All")
args = parser.parse_args()


def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",
                  relPath="./conf/experiment_config.json",
                  dataOut="train_dataset_pos"):
    """Read one entry of the experiment config for the given model."""
    configPath = Path(relPath)
    with open(configPath, 'r', encoding='utf-8') as file:
        config = json.load(file)[nameModel]
    # A two-element list selects a nested key; anything else is a top-level key.
    if isinstance(dataOut, list) and len(dataOut) == 2:
        output = config[dataOut[0]][dataOut[1]]
    else:
        output = config[dataOut]
    return output


class CustomEmbedding(Embeddings, BaseModel):
    """Embedding model that preprocesses text before encoding."""

    def _get_embedding(self, text) -> List[float]:
        # `emb` is the module-level SentenceTransformer loaded via loadmodelEmb().
        text = remove_unwanted(text, punctuationOK=True, stopOK=True)
        return emb.encode(text)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self._get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return self._get_embedding(text)


def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)


def remove_unwanted(document, stopOK=False, punctuationOK=False, xtrasOK=False,
                    emojiOk=False, unidecodeOK=False):
    if punctuationOK:
        # Replace punctuation with spaces.
        for sig in [".", ",", "!", "¿", "?", "=", "(", ")"]:
            document = document.replace(sig, " ")
    if xtrasOK:
        # Remove user mentions, URLs and hashtags.
        document = re.sub("@[A-Za-z0-9_]+", " ", document)
        document = re.sub(r'http\S+', ' ', document)
        document = re.sub("#[A-Za-z0-9_]+", "", document)
    if emojiOk:
        document = remove_emoji(document)
    if unidecodeOK:
        document = unidecode(document)
    if stopOK:
        # Drop Spanish stopwords (requires nltk.download('stopwords')).
        words = document.split(" ")
        stop_words = set(stopwords.words('spanish'))
        words = [w for w in words if w not in stop_words]
        document = " ".join(words)
    # Collapse double spaces left over by the substitutions above.
    document = document.replace("  ", " ")
    return document.strip().lower()


def loadmodelEmb(model_name="embeddings/all-MiniLM-L6-v2",
                 model_kwargs={'device': 'cpu'}):
    return SentenceTransformer(model_name, device=model_kwargs['device'])


pathsqlite = extractConfig(dataOut="pathsqlite")


def loadCopysAndData(pathsqlite=pathsqlite):
    con = sqlite3.connect(pathsqlite)
    copies_df = pd.read_sql_query(
        "SELECT * FROM copies WHERE intentionality IS NOT NULL", con)
    con.close()
    # Keep only the columns used downstream.
    data = copies_df[["copy_message", "id", "name", "intentionality"]]
    # Build two document sets: one keyed on the copy text, one on its intent.
    B = DataFrameLoader(data, page_content_column="copy_message")
    B2 = DataFrameLoader(data, page_content_column="intentionality")
    documents = B.load()
    documents2 = B2.load()
    return documents, documents2


def makeFaissdb(documents, folder_path, embedding):
    # Rebuild the index from scratch on every run.
    shutil.rmtree(folder_path, ignore_errors=True)
    db = FAISS.from_documents(documents, embedding)
    db.save_local(folder_path=folder_path)
    return db


def FinderDbs(query, dbs, filtred=False, th=5000):
    AllData = {}
    for dbt in dbs:
        Sal = dbt.similarity_search_with_score(query, 4)
        for output in Sal:
            doc_id = output[0].metadata["id"]
            if doc_id in AllData:
                # A hit found in more than one index gets a 0.1 distance bonus.
                AllData[doc_id]["d"] = min(AllData[doc_id]["d"] - 0.1,
                                           output[1] - 0.1)
            else:
                AllData[doc_id] = {"d": output[1],
                                   "page_content": output[0].page_content}
    if filtred:
        # Assumed completion: the source is cut off here; keeping only hits
        # below the distance threshold matches the `th` parameter's intent.
        filtredData = {}
        for row in AllData.keys():
            if AllData[row]["d"] < th:
                filtredData[row] = AllData[row]
        return filtredData
    return AllData
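
# A minimal usage sketch of the pipeline above, assuming the config's
# `pathsqlite` points at a valid database. The query string and the
# "Copies"/"Intentionality" folder names are hypothetical examples.
if __name__ == "__main__":
    emb = loadmodelEmb()        # module-level model used by CustomEmbedding
    embedding = CustomEmbedding()
    documents, documents2 = loadCopysAndData()
    db1 = makeFaissdb(documents, "Copies", embedding)
    db2 = makeFaissdb(documents2, "Intentionality", embedding)
    results = FinderDbs("quiero comprar un boleto", [db1, db2],
                        filtred=True, th=5000)
    for doc_id, info in results.items():
        print(doc_id, info["d"], info["page_content"])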