Compare commits

..

25 Commits

Author SHA1 Message Date
Mario Gil 32066fa9fe feat: Option Ciditel, Code to googlecolab,BotCidit 2024-01-11 09:26:48 -05:00
Mario Gil 502cf96292 Refactoring of code in libs. 2023-12-21 22:04:37 -05:00
Mario Gil ceaa20af06 All Code unstable, feat: code with config archive, finetuning, download models and extract data of db. 2023-12-21 13:23:01 -05:00
Mario Gil 9c0b1ce654 New branch unstable 2023-12-21 13:16:49 -05:00
Mario Gonzalez Gil 87cd0f780f mod base code in server 2023-12-04 20:35:11 +01:00
Mario Gonzalez Gil 9a9597ec31 delete reqs 2023-11-27 19:13:02 +01:00
Mario Gonzalez Gil b591a666f6 delete reqs 2023-11-27 19:04:33 +01:00
Mario Gonzalez Gil d8307375ae clean req 2023-11-27 19:02:44 +01:00
Mario Gonzalez Gil 74b63c7b65 V2 new model with text nlp 2023-11-27 18:47:46 +01:00
Mario Gonzalez Gil 55df233473 New version of main.py with data, strucutre and others 2023-11-21 21:47:37 +01:00
Mario Gil 739036e038 changes in model with finetuning 2023-10-28 06:36:39 -05:00
Mario Gil d7429cda09 feat: get float in filtred to threshold 2023-10-25 16:51:05 -05:00
Mario Gil 07c6fe9889 Change end point 2023-10-23 07:42:36 -05:00
Mario Gil 5fbed52018 Add filtred 2023-10-23 01:14:32 -05:00
Mario Gil d7a0141a83 delete fake endpoint 2023-10-14 13:05:22 -05:00
Mario Gil 09504d3744 feat: Only fastapy 2023-10-14 13:03:08 -05:00
Mario Gil aaad0dc7c2 feat: Version in api 2023-10-11 10:32:32 -05:00
Mario Gil 811a7e9213 add run.sh in gitignore 2023-10-11 10:03:34 -05:00
Mario Gonzalez Gil 4f518fffa7 Server funtional 2023-10-11 16:13:52 +02:00
Mario Gil f22df1a7cf Bugfix: UUpgrade requeriments 2023-10-10 15:44:29 -05:00
Mario Gil 841d1383a7 Bugfix: Change name of app 2023-10-10 15:04:18 -05:00
Mario Gil d8da757d0a Feat: Only Id and metric 2023-10-10 14:53:35 -05:00
Mario Gil 583afcd268 Feat new format 2023-10-10 07:17:52 -05:00
Mario Gil 1245cb22fe With intencionality 2023-10-06 00:42:06 -05:00
Mario Gil 3ddeacd83f Custom models of huginface 2023-09-28 14:48:59 -05:00
15 changed files with 1373 additions and 214 deletions

30
.gitignore vendored
View File

@@ -4,3 +4,33 @@ names/*
nameshf/*
photo_2023-09-24_00-25-17.jpg
__pycache__/FindinDB.cpython-38.pyc
embeddings/*
tuned_models/*
Copies3/*
Copies2/*
Tnames/*
TCopies/*
names2/*
Intencionality3/*
__pycache__/FindinDB.cpython-311.pyc
__pycache__/main.cpython-311.pyc
sns_violin*
NewData*
motor05102023.csv
run.sh
Modelo_embedding_Mexico_Puebla/all-mpnet-base-v2/model/*
3pasos/paraphrase-multilingual-mpnet-base-v2/Sta/EvalClass.csv
3pasos/paraphrase-multilingual-mpnet-base-v2/model/*
9pasos/paraphrase-multilingual-mpnet-base-v2/Sta/EvalClass.csv
9pasos/paraphrase-multilingual-mpnet-base-v2/model/*
50pasos/paraphrase-multilingual-mpnet-base-v2/Sta/EvalClass.csv
50pasos/paraphrase-multilingual-mpnet-base-v2/model/*
100pasos/paraphrase-multilingual-mpnet-base-v2/Sta/EvalClass.csv
100pasos/paraphrase-multilingual-mpnet-base-v2/model/*
Argument/*
__pycache__/models.cpython-311.pyc
data/raw/__pycache__/models.cpython-311.pyc
Modelo_embedding_Mexico_Puebla/*
Intentionality3/index.faiss
Intentionality3/index.pkl
conf/experiment_config.json

31
DownloadModels.py Normal file
View File

@@ -0,0 +1,31 @@
from sentence_transformers import SentenceTransformer
# "multi-qa-mpnet-base-dot-v1": specialized in question answering
# best general-purpose performer: all-mpnet-base-v2
# fastest: "paraphrase-MiniLM-L3-v2" and "all-MiniLM-L6-v2"
# very fast and quite accurate: "all-MiniLM-L12-v2"
#models=["all-MiniLM-L12-v2","paraphrase-MiniLM-L3-v2" , "all-MiniLM-L6-v2",
from pathlib import Path
import json
#"paraphrase-multilingual-mpnet-base-v2",'hackathon-pln-es/paraphrase-spanish-distilroberta'
nameModel="Modelo_embedding_CIDITEL"
def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",relPath="./conf/experiment_config.json",dataOut="train_dataset_pos"):
configPath=Path(relPath)
with open(configPath, 'r', encoding='utf-8') as file:
config = json.load(file)[nameModel]
if type(dataOut) is list and len(dataOut)==2:
Output= config[dataOut[0]][dataOut[1]]
else:
Output= config[dataOut]
return Output
baseModel=extractConfig(nameModel=nameModel,dataOut="base_model")
models=[baseModel]
for model in models:
modelST = SentenceTransformer(model)
# Define the path where you want to save the model
save_path = './embeddings/%s/model'%(model)
print(save_path)
# Save the model
modelST.save(save_path)

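Note: conf/experiment_config.json is added to .gitignore above, yet every new script reads it through extractConfig(). The sketch below is a hypothetical reconstruction of the layout those calls assume; the field names are the ones actually read in this diff, while all values are illustrative placeholders, not the real configuration.
import json
import os

# Hypothetical example only: keys taken from the extractConfig() calls in this diff,
# values are placeholders (the real conf/experiment_config.json is not committed).
example_config = {
    "Modelo_embedding_CIDITEL": {
        "base_model": "hiiamsid/sentence_similarity_spanish_es",   # placeholder
        "path_model": "Modelo_embedding_CIDITEL/base",             # placeholder
        "pathsqlite": "data/raw/databases/storage.sqlite",         # placeholder
        "train_dataset_pos": "data/raw/train.json",                # placeholder
        "valid_dataset": "data/raw/validClass.json",               # placeholder
        "valid_dataset_Class": "data/raw/validClass.json",         # placeholder
        "params": {"num_epochs": 50, "warmup_steps": 100},         # placeholders
    },
    "SystemData": {"keyantrophics": "<anthropic-api-key>"},        # key read by main.py
}

os.makedirs("conf", exist_ok=True)
with open("conf/experiment_config.json", "w", encoding="utf-8") as f:
    json.dump(example_config, f, indent=2, ensure_ascii=False)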
View File

@@ -1,158 +0,0 @@
import gradio as gr
from faiss import write_index, read_index
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain.document_loaders import UnstructuredURLLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain import LLMChain
from langchain.llms import GPT4All
from langchain.embeddings import GPT4AllEmbeddings
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.base import BaseCallbackManager
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
import pandas as pd
import sqlite3
from sentence_transformers import SentenceTransformer
from fastapi import FastAPI
#from cleantext import clean
import re
model_name = 'hiiamsid/sentence_similarity_spanish_es'
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceEmbeddings(
model_name=model_name,
model_kwargs=model_kwargs,
encode_kwargs=encode_kwargs
)
CUSTOM_PATH = "/angela"
app = FastAPI()
@app.get("/")
def read_main():
return {"message": "This is your main app"}
def loadModels():
#model = GPT4All("orca-mini-3b.ggmlv3.q4_0.bin")
callback_manager = BaseCallbackManager([StreamingStdOutCallbackHandler()])
llm = GPT4All(model="orca-mini-3b.ggmlv3.q4_0.bin",temp=0.1,streaming=True)#callback_manager=callback_manager, verbose=True,repeat_last_n=0
embeddings = GPT4AllEmbeddings()
return llm, embeddings
def loadCopysAndData(pathsqlite="motor.sqlite"):
con = sqlite3.connect(pathsqlite)
copies_df = pd.read_sql_query("SELECT * from copies", con)
copiesT = copies_df[copies_df.copy_start =="T"]
copiesT=copiesT[["copy_message","id","name"]]
data = copiesT
B=DataFrameLoader(data,page_content_column="copy_message")
B2=DataFrameLoader(data,page_content_column="name")
documents=B.load()
documents2=B2.load()
return documents,documents2
def makeFaissdb(documents,folder_path,embedding):
try:
db=FAISS.load_local(folder_path=folder_path,embeddings=embedding)
except:
db = FAISS.from_documents(documents, embedding)
FAISS.save_local(db,folder_path=folder_path)
return db
llm,emb=loadModels()
documents,documents2=loadCopysAndData()
db=makeFaissdb(documents,"Copies",emb)
db2=makeFaissdb(documents2,"names",emb)
db3=makeFaissdb(documents2,"nameshf",hf)
def FinderDbs(query,dbs,filtred=False,th=1.2):
AllData={}
for dbt in dbs:
Sal = dbt.similarity_search_with_score(query,4)
for output in Sal:
if output[0].metadata["id"] in AllData.keys():
AllData[output[0].metadata["id"]]["d"]=min([AllData[output[0].metadata["id"]]["d"]-0.1,output[1]-0.1])
else:
AllData[output[0].metadata["id"]]={"d":output[1],"page_content":output[0].page_content}
#for item in AllData.items():
# print(item)
if filtred:
filtredData={}
for row in AllData.keys():
if AllData[row]["d"]<1.2:
filtredData[row]=AllData[row]
filtredData=dict(sorted(filtredData.items(), key=lambda item: item[1]["d"]))
return filtredData,filtredData.keys()
else:
AllData=dict(sorted(AllData.items(), key=lambda item: item[1]["d"]))
return AllData,AllData.keys()
def QARequest(Pregunta,filtred=False):
query = Pregunta
AllData=FinderDbs(query,[db,db2],filtred)
if AllData:
import markdown
AllData = list(AllData)
#lista = "<div style='border-style = solid;border-width:1px;border-radius:10px'>"
lista = ""
for k,i in enumerate(AllData[0].items()):
titulo = f"<div style='border-style = solid;border-width:1px;border-radius:10px;margin:14px;padding:14px'><h2>Respuesta {k+1}</h2>"
to_append = markdown.markdown(i[1]['page_content'])
lista = lista + titulo + to_append + '</div>'
#lista.append('<br>')
#lista = lista + '</div>'
AllData[0] = lista
return AllData
with gr.Blocks() as demo:
gr.Image("logo.jpg",height=100)
gr.Markdown("Esta es la busqueda que hace el usuario")
Pregunta = gr.Textbox(label="Pregunta")
#Pregunta = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", Pregunta)
#Pregunta=Pregunta.strip().lower()
filtred=gr.Checkbox(label="filtrado")
gr.Markdown("Respuestas para orca desde los copys")
Respuesta = gr.Textbox(label="Respuesta")
id = gr.Textbox(label="id")
# metrica=gr.Textbox(label="metrica")
# gr.Markdown("Respuestas para orca desde los names")
# Respuesta2 = gr.Textbox(label="Respuesta2")
# id2 = gr.Textbox(label="id2")
# metrica2=gr.Textbox(label="metrica2")
# gr.Markdown("Respuestas para hf desde los names")
# Respuesta3 = gr.Textbox(label="Respuesta3")
# id3 = gr.Textbox(label="id3")
# metrica3=gr.Textbox(label="metrica3")
Enviar_btn = gr.Button("Responder")
Enviar_btn.click(fn=QARequest, inputs=[Pregunta,filtred], outputs=[gr.HTML(Respuesta),id], api_name="api_angela") #
#demo.launch(root_path="angela") #
gradio_app = gr.routes.App.create_app(demo)
app.mount(CUSTOM_PATH, gradio_app)
#app = demo.mount_gradio_app(app, io, path=CUSTOM_PATH)

60
data/raw/validClass.json Normal file
View File

@@ -0,0 +1,60 @@
{
"32": [
"Necesito informar sobre un deterioro en la carretera cerca de donde vivo.",
"Quiero expresar mi preocupación acerca de un bache en la calle que está cerca de mi domicilio.",
"Estoy solicitando la atención de las autoridades para solucionar un problema vial en las inmediaciones de mi vivienda.",
"Necesito poner en conocimiento de las autoridades competentes un bache en la vía cercana a mi residencia.",
"reportar bache"
],
"171": [
"¿Me podrías decir cuáles son las opciones culturales en estos días?",
"Quiero informarme sobre los eventos culturales que no me puedo perder.",
"¿Puedes recomendarme eventos culturales?",
"Estoy buscando información sobre la agenda cultural y artística de Puebla."
],
"273": [
"Necesito información sobre programas de educación musical para la infancia.",
"¿Puede proporcionarme detalles sobre cursos de producción de música electrónica?",
"Me gustaría saber más sobre clases de música para adultos mayores.",
"Estoy interesado en talleres de música étnica y world music."
],
"239": [
"Quiero indicar que un semáforo no muestra la señal de alto constante.",
"Necesito reportar un semáforo que no muestra la señal de alto intermitente.",
"Estoy interesado en notificar sobre un semáforo que presenta un mal funcionamiento general.",
"Quiero comunicar que un semáforo no muestra ninguna señal de luz."
],
"452": [
"Estoy aquí para alertar sobre un coche en estado de abandono.",
"Quiero comunicar que un vehículo ha sido descuidado y está estacionado.",
"Necesito reportar un vehículo sin supervisión.",
"Estoy dispuesto a dar aviso sobre un automóvil abandonado en la vía pública."
],
"23": [
"¿Que actividades de cine hay esta semana?",
"¿Que actividades de club de lectrura hay en puebla?",
"¿Donde puedo participar en talleres de escritura?"
],
"1194":
["¿Cómo llegar a la zona arqueológica de Yohualichan desde el centro de Cuetzalan?",
"¿Cuál es la mejor ruta para visitar la cascada de Apulco desde Cuetzalan?",
"¿Qué transporte recomiendan para llegar a La Gloria desde el centro de Cuetzalan?"],
"1315":[
"quien es el alcalde de la ciudad de puebla",
"quien es el presidente municipal de puebla",
"qué estudios tiene eduardo rivera pérez",
"qué cargos ha ocupado eduardo rivera pérez",
"como se llama el presidente municipal ",
"cual es el nombre del alcalde del municipio",
"como se llama el alcalde"
],
"0":[
"dfjhnr9o",
"fgrrd dfgres",
"Estoy molesto",
"No funciona"
]
}

View File

@@ -0,0 +1,145 @@
from models import dbvotes,dbcopies
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from typing import List
from langchain.pydantic_v1 import BaseModel
from langchain.schema.embeddings import Embeddings
from unidecode import unidecode
from nltk.corpus import stopwords
import re
from pathlib import Path
import json
import matplotlib.pyplot as plt
import seaborn as sns
def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",relPath="./conf/experiment_config.json",dataOut="train_dataset_pos"):
configPath=Path(relPath)
with open(configPath, 'r', encoding='utf-8') as file:
config = json.load(file)[nameModel]
if type(dataOut) is list and len(dataOut)==2:
Output= config[dataOut[0]][dataOut[1]]
else:
Output= config[dataOut]
return Output
def remove_emoji(string):
emoji_pattern = re.compile("["
u"\U0001F600-\U0001F64F" # emoticons
u"\U0001F300-\U0001F5FF" # symbols & pictographs
u"\U0001F680-\U0001F6FF" # transport & map symbols
u"\U0001F1E0-\U0001F1FF" # flags (iOS)
u"\U00002702-\U000027B0"
u"\U000024C2-\U0001F251"
"]+", flags=re.UNICODE)
return emoji_pattern.sub(r' ', string)
def remove_unwanted(document,stopOK=False,punctuationOK=False,xtrasOK=False, emojiOk=False, unidecodeOK=False):
if punctuationOK:
# remove punctuation
for sig in [".",",","!","¿","?","=","(",")"]:
document=document.replace(sig," ")
if xtrasOK:
# remove user mentions
document = re.sub("@[A-Za-z0-9_]+"," ", document)
# remove URLS
document = re.sub(r'http\S+', ' ', document)
# remove hashtags
document = re.sub("#[A-Za-z0-9_]+","", document)
if emojiOk:
# remove emoji's
document = remove_emoji(document)
#document = re.sub("[^0-9A-Za-z ]", "" , document)
# remove double spaces
#print(document)
if unidecodeOK:
document=unidecode(document)
if stopOK:
words=document.split(" ")
stop_words = set(stopwords.words('spanish'))
words = [w for w in words if not w in stop_words]
document=" ".join(words)
document = document.replace(' ',"")
#print(document)
return document.strip().lower()
output=[]
for row in dbvotes(dbvotes.votes.id).select():
if int(row.vote)==1:
Sal={}
#print(row.message, row.copy_id,row.vote)
query = (dbvotes.messages.id==row.message)
messagequery = dbvotes(query).select(dbvotes.messages.ALL)
Sal["texto"]=messagequery[0].message
Sal["etiqueta"]=row.copy_id
query = (dbcopies.copies.id==row.copy_id)
copiesquery =dbcopies(query).select(dbcopies.copies.ALL)
#Sal["copy_message"]=copiesquery[0].copy_message
Sal["intentionality"]=copiesquery[0].intentionality
#print(copiesquery)
output.append(Sal)
df=pd.DataFrame(output)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
def loadmodelEmb(model_name = "embeddings/all-MiniLM-L6-v2",model_kwargs = {'device': 'cpu'}):
st = SentenceTransformer(model_name)
return st
class CustomEmbedding(Embeddings, BaseModel,):
"""embedding model with preprocessing"""
def _get_embedding(self,text) -> List[float]:
#print(text,"text")
text=remove_unwanted(text,punctuationOK=True,stopOK=True)
Sal=emb.encode(text)
return Sal
def embed_documents(self, texts: List[str]) -> List[List[float]]:
Sal=[]
for text in texts:
Sal.append(self._get_embedding(text))
return Sal
def embed_query(self, text: str) -> List[float]:
return self._get_embedding(text)
nameModel="Modelo_embedding_Mexico_Puebla"
valid_path = extractConfig(dataOut="valid_dataset")
baseModel= extractConfig(dataOut="base_model")
with open(valid_path, 'r', encoding='utf-8') as file:
queries_Categoricos = json.load(file)
model="./%s/%s/model"%(nameModel,baseModel)
emb=loadmodelEmb(model_name = model)
emb2=CustomEmbedding()
train_embeddings = pd.DataFrame(emb2.embed_documents(train_data['texto'].tolist()))
test_embeddings = pd.DataFrame(emb2.embed_documents(test_data['texto'].tolist()))
print(pd.DataFrame(test_embeddings))
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_embeddings, train_data['etiqueta'])
# Make predictions on the test set
predictions = rf_model.predict(test_embeddings)
# Compute the accuracy
accuracy = accuracy_score(test_data['etiqueta'], predictions)
print(f'Precisión del modelo: {accuracy:.2f}')
# inspect the most important features
feature_importances_df = pd.DataFrame(
{"feature": list(test_embeddings.columns), "importance": rf_model.feature_importances_}
).sort_values("importance", ascending=False)
# Show the ranking
print(feature_importances_df)

2
downloaddb.sh Executable file
View File

@@ -0,0 +1,2 @@
scp mgil@apollo.latinux.net:/home/jbenitez/www/py4web/apps/AngelaSmartBot/databases/storage.db ./data/raw/databases
scp mgil@apollo.latinux.net:/opt/web2py/applications/MotorAngela/databases/storage.sqlite ./data/raw/databases

53
finetrainCollabversion.py Normal file
View File

@@ -0,0 +1,53 @@
!pip install sentence_transformers
!pip install unidecode
!pip install langchain
!pip install faiss-cpu
from torch.utils.data import DataLoader
import math
import logging
from unidecode import unidecode
from pathlib import Path
import json
from sentence_transformers import SentenceTransformer, losses, InputExample
model="paraphrase-multilingual-mpnet-base-v2"
model = SentenceTransformer(model)
batch_size = 32
num_epochs = 50
train_path = Path("/content/train.json")
with open(train_path, 'r', encoding='utf-8') as file:
queries_Categoricos = json.load(file)
train_loss = losses.MultipleNegativesRankingLoss(model=model)
train_examples = []
for i in queries_Categoricos.keys():
for j in queries_Categoricos[i]:
i=unidecode(i).strip().lower()
j=unidecode(j).strip().lower()
score = 1.0
#print(i)
train_examples.append(InputExample(texts=[ i,j], label=score))
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
#evaluator = CECorrelationEvaluator.from_input_examples(dev_samples, name='sts-dev')
# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the cross-encoder model
model.fit(train_objectives=[(train_dataloader, train_loss)],
#evaluator=evaluator,
epochs=num_epochs,
#evaluation_steps=1000,
warmup_steps=warmup_steps)
save_path = "./%spasos/paraphrase-multilingual-mpnet-base-v2/model/"%(str(num_epochs))
model.save(save_path)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# IPython "!" shell lines interpolate {num_epochs} from Python; -r is needed to zip the directory recursively
!zip -r "./{num_epochs}p.zip" "/content/{num_epochs}pasos"

122
finetrainmodel.py Normal file
View File

@@ -0,0 +1,122 @@
""" from sentence_transformers import SentenceTransformer, models
## Step 1: use an existing language model
word_embedding_model = models.Transformer('distilroberta-base')
## Step 2: use a pool function over the token embeddings
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
## Join steps 1 and 2 using the modules argument
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
from sentence_transformers import InputExample
from datasets import load_dataset
dataset_id = "embedding-data/QQP_triplets"
# dataset_id = "embedding-data/sentence-compression"
dataset = load_dataset(dataset_id)
train_examples = []
train_data = dataset['train']['set']
# For agility we only 1/2 of our available data
n_examples = dataset['train'].num_rows // 2
for i in range(10):
example = train_data[i]
train_examples.append(InputExample(texts=[example['query'], example['pos'][0]]))
from torch.utils.data import DataLoader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
from sentence_transformers import losses
train_loss = losses.MultipleNegativesRankingLoss(model=model)
num_epochs = 10
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) #10% of train data
model.fit(train_objectives=[(train_dataloader, train_loss)],epochs=num_epochs,warmup_steps=2)
"""
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
from unidecode import unidecode
from pathlib import Path
import json
import os
from datetime import datetime
nameModel="Modelo_embedding_Mexico_Puebla_hiiamasid"
def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",relPath="./conf/experiment_config.json",dataOut="train_dataset_pos"):
configPath=Path(relPath)
with open(configPath, 'r', encoding='utf-8') as file:
config = json.load(file)[nameModel]
if type(dataOut) is list and len(dataOut)==2:
Output= config[dataOut[0]][dataOut[1]]
else:
Output= config[dataOut]
return Output
def saveConfig(dictionary):
pathOutfile='./%s/%s/params/'%(nameModel,baseModel)
if not os.path.exists(pathOutfile):
os.makedirs(pathOutfile)
with open(pathOutfile+"params.json", "w",encoding='utf-8') as outfile:
json.dump(dictionary, outfile)
def saveData(dictionary):
Sal={}
pathOutfile='./%s/%s/data/'%(nameModel,baseModel)
if not os.path.exists(pathOutfile):
os.makedirs(pathOutfile)
with open(pathOutfile+"train.json", "w",encoding='utf-8') as outfile:
json.dump(dictionary, outfile)
now = datetime.now()
entrenamiento="V_%s_%s_%s"%(now.year,now.month,now.day)
baseModel=extractConfig(nameModel=nameModel,dataOut="base_model")
trainDatasetPos=extractConfig(nameModel=nameModel,dataOut="train_dataset_pos")
model=extractConfig(nameModel=nameModel,dataOut="path_model")
modelST = SentenceTransformer(model+"/model")
train_loss = losses.MultipleNegativesRankingLoss(model=modelST)
train_path = Path(trainDatasetPos)
with open(train_path, 'r', encoding='utf-8') as file:
queries_Categoricos = json.load(file)
train_examples = []
for i in queries_Categoricos.keys():
for j in queries_Categoricos[i]:
i=unidecode(i).strip().lower()
j=unidecode(j).strip().lower()
train_examples.append(InputExample(texts=[ i,j]))
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=5)#16
print(len(train_dataloader))
modelST.fit(train_objectives=[(train_dataloader, train_loss)],epochs=extractConfig(dataOut=["params","num_epochs"]),warmup_steps=extractConfig(dataOut=["params","warmup_steps"]))
save_path = './%s/%s/model/'%(nameModel,baseModel)
modelST.save(save_path)
params={"entrenamiento":entrenamiento,"baseModel":baseModel}
params.update(extractConfig(dataOut="params"))
saveConfig(params)
saveData(queries_Categoricos)

121
general.py Normal file
View File

@@ -0,0 +1,121 @@
from sentence_transformers import SentenceTransformer
from fastapi import FastAPI
from unidecode import unidecode
from nltk.corpus import stopwords
from langchain.schema.embeddings import Embeddings
from langchain.document_loaders import DataFrameLoader
import re
from pathlib import Path
from typing import List
import json
import time
from pydantic import BaseModel
from langchain.vectorstores import FAISS
from typing import Optional
import sqlite3
import pandas as pd
def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",relPath="./conf/experiment_config.json",dataOut="train_dataset_pos"):
configPath=Path(relPath)
with open(configPath, 'r', encoding='utf-8') as file:
config = json.load(file)[nameModel]
Output= config[dataOut]
return Output
def remove_emoji(string):
emoji_pattern = re.compile("["
u"\U0001F600-\U0001F64F" # emoticons
u"\U0001F300-\U0001F5FF" # symbols & pictographs
u"\U0001F680-\U0001F6FF" # transport & map symbols
u"\U0001F1E0-\U0001F1FF" # flags (iOS)
u"\U00002702-\U000027B0"
u"\U000024C2-\U0001F251"
"]+", flags=re.UNICODE)
return emoji_pattern.sub(r' ', string)
def remove_unwanted(document,stopOK=False,punctuationOK=False,xtrasOK=False, emojiOk=False, unidecodeOK=False):
if punctuationOK:
# remove punctuation
for sig in [".",",","!","¿","?","=","(",")"]:
document=document.replace(sig," ")
if xtrasOK:
# remove user mentions
document = re.sub("@[A-Za-z0-9_]+"," ", document)
# remove URLS
document = re.sub(r'http\S+', ' ', document)
# remove hashtags
document = re.sub("#[A-Za-z0-9_]+","", document)
if emojiOk:
# remove emoji's
document = remove_emoji(document)
#document = re.sub("[^0-9A-Za-z ]", "" , document)
# remove double spaces
#print(document)
if unidecodeOK:
document=unidecode(document)
if stopOK:
words=document.split(" ")
stop_words = set(stopwords.words('spanish'))
words = [w for w in words if not w in stop_words]
document=" ".join(words)
document = document.replace(' ',"")
#print(document)
return document.strip().lower()
def loadmodelEmb(model_name = "embeddings/all-MiniLM-L6-v2",model_kwargs = {'device': 'cpu'}):
st = SentenceTransformer(model_name,device='cpu')
return st
def loadCopysAndData(pathsqlite):
con = sqlite3.connect(pathsqlite)
copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con)
copiesT = copies_df
copiesT=copiesT[["copy_message","id","name","intentionality"]]
#print(copiesT)
data = copiesT
#print(data)
B=DataFrameLoader(data,page_content_column="copy_message")
B2=DataFrameLoader(data,page_content_column="intentionality")
documents=B.load()
documents2=B2.load()
return documents,documents2
def makeFaissdb(documents,folder_path,embedding):
try:
db=FAISS.load_local(folder_path=folder_path,embeddings=embedding)
except:
db = FAISS.from_documents(documents, embedding)
FAISS.save_local(db,folder_path=folder_path)
return db
class Response(BaseModel):
query: str
filtred : Optional[float] = -9.0
def FinderDbs(query,dbs,filtred=0.4):
AllData={}
for dbt in dbs:
Sal = dbt.similarity_search_with_score(query,4)
for output in Sal:
if output[0].metadata["id"] in AllData.keys():
AllData[output[0].metadata["id"]]["d"]=min([AllData[output[0].metadata["id"]]["d"],output[1]])
else:
AllData[output[0].metadata["id"]]={"d":output[1],"page_content":output[0].page_content}
if filtred>0:
filtredData={}
for row in AllData.keys():
if AllData[row]["d"]<filtred:
filtredData[row]=AllData[row]
filtredData=dict(sorted(filtredData.items(), key=lambda item: item[1]["d"]))
return filtredData,filtredData.keys()
else:
AllData=dict(sorted(AllData.items(), key=lambda item: item[1]["d"]))
return AllData,AllData.keys()

171
main.py Normal file
View File

@@ -0,0 +1,171 @@
#import gradio as gr
from faiss import write_index, read_index
from langchain.vectorstores import FAISS
from typing import List
from pydantic import BaseModel
from typing import Optional
import re
from pathlib import Path
import time
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
import json
import pandas as pd
import sqlite3
from sentence_transformers import SentenceTransformer
from fastapi import FastAPI
from unidecode import unidecode
from nltk.corpus import stopwords
from langchain.schema.embeddings import Embeddings
from langchain.document_loaders import DataFrameLoader
from general import FinderDbs,loadCopysAndData,loadmodelEmb,makeFaissdb,extractConfig,Response,remove_unwanted
#from langchain import PromptTemplate
# from langchain.document_loaders import TextLoader
# from langchain.text_splitter import CharacterTextSplitter
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.document_loaders import UnstructuredFileLoader
# from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
# from langchain.document_loaders import UnstructuredURLLoader
# from langchain.document_loaders.csv_loader import CSVLoader
# #from langchain import LLMChain
# from langchain.embeddings import HuggingFaceEmbeddings
#from cleantext import clean
class CustomEmbedding(Embeddings, BaseModel):
"""embedding model with preprocessing"""
def _get_embedding(self,text) -> List[float]:
#print(text,"text")
text=remove_unwanted(text,punctuationOK=True,stopOK=True)
Sal=emb.encode(text)
return Sal
def embed_documents(self, texts: List[str]) -> List[List[float]]:
Sal=[]
for text in texts:
Sal.append(self._get_embedding(text))
return Sal
def embed_query(self, text: str) -> List[float]:
return self._get_embedding(text)
nameModel="Modelo_embedding_CIDITEL"
model=extractConfig(nameModel=nameModel,dataOut="path_model")+"/model"
print(model)
entrenamiento="V1.3"
pathsqlite=extractConfig(nameModel=nameModel,dataOut="pathsqlite")
keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics")
documents,documents2=loadCopysAndData(pathsqlite)
emb=loadmodelEmb(model_name = model)
emb2=CustomEmbedding()
db=makeFaissdb(documents,"Copies3",emb2)
db2=makeFaissdb(documents2,"Intentionality3",emb2)
app = FastAPI()
@app.get("/")
def read_main():
return {"message": "This is your main app"}
@app.post("/angela-api/")
def calculate_api(response: Response):
query = response.query
try:
filtred = response.filtred
except:
filtred = -9.0
AllData=FinderDbs(query,[db2,db],filtred)
#print(AllData)
versionL="_".join([model,entrenamiento])
#tt=time.time()
#if identifier.classify(query)[1]< 0.3:
#print(identifier.classify(query))
#print(time.time()-tt)
#return {"ids": [],"DC":[],"modelo":versionL}
#print(time.time()-tt)
if AllData:
AllData = list(AllData)
dis=[]
id=[]
for k,i in enumerate(AllData[0].items()):
dis.append(str(i[1]['d']))
id.append(i[0])
return {"ids": id,"DC":dis,"modelo":versionL}
@app.post("/angela-api-claude/")
def calculate_api_claude(response: Response):
anthropic = Anthropic(api_key=keyanthropic)
query = response.query
try:
filtred = response.filtred
except:
filtred = -9.0
AllData=FinderDbs(query,[db2,db],filtred)
versionL="_".join([model,entrenamiento])
if AllData:
AllData = list(AllData)
dis=[]
id=[]
for k,i in enumerate(AllData[0].items()):
dis.append(str(i[1]['d']))
id.append(i[0])
if len(id)<1:
return {"text": {"completion": "No tengo información sobre este tema",
"model": "claude-2.1",
"stop_reason": "stop_sequence",
"type": "completion",
"id": "1",
"stop": "\n\nHuman:",
"log_id": "1"
},"text2": {
"completion":"No tengo información sobre este tema",
"model": "claude-2.1",
"stop_reason": "stop_sequence",
"type": "completion",
"id": "1",
"stop": "\n\nHuman:",
"log_id": "1"
}
}
con = sqlite3.connect(pathsqlite)
copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con)
copie = copies_df[copies_df["id"]==id[0]]["copy_message"].values[0]
promptF=f"""{HUMAN_PROMPT} Tengo un contexto por favor generame un resumen, el resumen deben ser con lenguaje amable para un publico mexicano y como si fuera una conversacion con la persona.
"""
promptF3=promptF+f"""
<contexto>%s</contexto>
{AI_PROMPT}<resumen>"""%(copie)
completion = anthropic.completions.create(
model="claude-2",
max_tokens_to_sample=600,
prompt=promptF3,
)
pregunta=query
promptFv2=f"""Tu eres un asistente de IA en chatbot llamado Angela, como asistente tu labor es ayudar a los usuarios de la pagina web de la alcaldia de puebla respondiendo sus preguntas.
Aqui te dare las reglas que debes seguir durante la conversacion:
<reglas>
- Siempre te mantendras en el personaje Angela.
- Si no estas seguro de la respuesta basada en el contexto responde el suigiente texto: "Lo siento, podrias formular la pregunta de nuevo es que no entendi tu pregunta por que soy un sistema que esta en mejora en este momento".
- No menciones el contexto si la pregunta no puede ser contestada con el.
- Siempres responderas de manera amable pero formal.
</reglas>
<contexto>
%s
</contexto>
{HUMAN_PROMPT} Tengo la siguiente pregunta entre la etiqueta <pregunta></pregunta> y basandote en el contexto que esta en la etiqueta <contexto></contexto> responde la pregunta entre la etiqueta <respuesta></respuesta>:
<pregunta>
%s
</pregunta>
"""%(copie,pregunta)
promptF3v2=promptFv2+f"""
{AI_PROMPT}<respuesta>"""
completionv2 = anthropic.completions.create(
model="claude-2.1",
max_tokens_to_sample=600,
prompt=promptF3v2,
)
return {"text":completion,"text2":completionv2}

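For reference, a minimal usage sketch of the new /angela-api/ endpoint above once the service is running (started as in run.sh at the bottom of this diff). httpx is already in requirements; the sample query is taken from data/raw/validClass.json and the 0.4 threshold mirrors the FinderDbs default in general.py. This is an illustrative client call, not code from the repository.
import httpx

payload = {"query": "quien es el alcalde de la ciudad de puebla", "filtred": 0.4}
r = httpx.post("http://localhost:7860/angela-api/", json=payload, timeout=60.0)
print(r.json())  # expected shape: {"ids": [...], "DC": [...], "modelo": "..."}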
389
metrics.py Normal file
View File

@@ -0,0 +1,389 @@
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.pydantic_v1 import BaseModel
from langchain.schema.embeddings import Embeddings
from sentence_transformers import SentenceTransformer
from typing import List
import sqlite3
import pandas as pd
import shutil
import re
import numpy as np
import inspect
import time
from unidecode import unidecode
from nltk.corpus import stopwords
import seaborn as sns
import argparse
from scipy.spatial import distance
from pathlib import Path
import json
import os
from nltk.corpus import stopwords
import nltk
parser = argparse.ArgumentParser()
parser.add_argument("-f", "--file", help="Nombre de archivo a procesar")
parser.add_argument("-d", "--distance", default="distance")
parser.add_argument("-m", "--models", default="All")
args = parser.parse_args()
def extractConfig(nameModel="Modelo_embedding_Mexico_Puebla",relPath="./conf/experiment_config.json",dataOut="train_dataset_pos"):
configPath=Path(relPath)
with open(configPath, 'r', encoding='utf-8') as file:
config = json.load(file)[nameModel]
if type(dataOut) is list and len(dataOut)==2:
Output= config[dataOut[0]][dataOut[1]]
else:
Output= config[dataOut]
return Output
# if args.file:
# print ("El nombre de archivo a procesar es: ", )
class CustomEmbedding(Embeddings, BaseModel,):
"""embedding model with preprocessing"""
def _get_embedding(self,text) -> List[float]:
#print(text,"text")
text=remove_unwanted(text,punctuationOK=True,stopOK=True)
Sal=emb.encode(text)
return Sal
def embed_documents(self, texts: List[str]) -> List[List[float]]:
Sal=[]
for text in texts:
Sal.append(self._get_embedding(text))
return Sal
def embed_query(self, text: str) -> List[float]:
return self._get_embedding(text)
def remove_emoji(string):
emoji_pattern = re.compile("["
u"\U0001F600-\U0001F64F" # emoticons
u"\U0001F300-\U0001F5FF" # symbols & pictographs
u"\U0001F680-\U0001F6FF" # transport & map symbols
u"\U0001F1E0-\U0001F1FF" # flags (iOS)
u"\U00002702-\U000027B0"
u"\U000024C2-\U0001F251"
"]+", flags=re.UNICODE)
return emoji_pattern.sub(r' ', string)
def remove_unwanted(document,stopOK=False,punctuationOK=False,xtrasOK=False, emojiOk=False, unidecodeOK=False):
if punctuationOK:
# remove punctuation
for sig in [".",",","!","¿","?","=","(",")"]:
document=document.replace(sig," ")
if xtrasOK:
# remove user mentions
document = re.sub("@[A-Za-z0-9_]+"," ", document)
# remove URLS
document = re.sub(r'http\S+', ' ', document)
# remove hashtags
document = re.sub("#[A-Za-z0-9_]+","", document)
if emojiOk:
# remove emoji's
document = remove_emoji(document)
#document = re.sub("[^0-9A-Za-z ]", "" , document)
# remove double spaces
#print(document)
if unidecodeOK:
document=unidecode(document)
if stopOK:
words=document.split(" ")
stop_words = set(stopwords.words('spanish'))
words = [w for w in words if not w in stop_words]
document=" ".join(words)
document = document.replace(' ',"")
#print(document)
return document.strip().lower()
def loadmodelEmb(model_name = "embeddings/all-MiniLM-L6-v2",model_kwargs = {'device': 'cpu'}):
st = SentenceTransformer(model_name)
return st
pathsqlite=extractConfig(dataOut="pathsqlite")
def loadCopysAndData(pathsqlite=pathsqlite):
con = sqlite3.connect(pathsqlite)
copies_df = pd.read_sql_query("SELECT * from copies WHERE intentionality IS NOT NULL", con)
copiesT = copies_df
copiesT=copiesT[["copy_message","id","name","intentionality"]]
#print(copiesT)
data = copiesT
#print(data)
B=DataFrameLoader(data,page_content_column="copy_message")
B2=DataFrameLoader(data,page_content_column="intentionality")
documents=B.load()
documents2=B2.load()
return documents,documents2
def makeFaissdb(documents,folder_path,embedding):
try:
shutil.rmtree(folder_path)
except:
pass
db = FAISS.from_documents(documents, embedding)
FAISS.save_local(db,folder_path=folder_path)
return db
def FinderDbs(query,dbs,filtred=False,th=5000):
AllData={}
for dbt in dbs:
Sal = dbt.similarity_search_with_score(query,4)
for output in Sal:
#print(output)
if output[0].metadata["id"] in AllData.keys():
AllData[output[0].metadata["id"]]["d"]=min([AllData[output[0].metadata["id"]]["d"]-0.1,output[1]-0.1])
else:
AllData[output[0].metadata["id"]]={"d":output[1],"page_content":output[0].page_content}
#for item in AllData.items():
# print(item)
if filtred:
filtredData={}
for row in AllData.keys():
if AllData[row]["d"]<th:
filtredData[row]=AllData[row]
filtredData=dict(sorted(filtredData.items(), key=lambda item: item[1]["d"]))
return filtredData,filtredData.keys()
else:
AllData=dict(sorted(AllData.items(), key=lambda item: item[1]["d"]))
return AllData,AllData.keys()
nameModel="Modelo_embedding_Mexico_Puebla_hiiamasid"
valid_path = extractConfig(nameModel=nameModel,dataOut="valid_dataset")
baseModel= extractConfig(nameModel=nameModel,dataOut="base_model")
path_model=extractConfig(nameModel=nameModel,dataOut="path_model")
with open(valid_path, 'r', encoding='utf-8') as file:
queries_Categoricos = json.load(file)
models=["./"+path_model]
#print(1111,models)
copies_text=queries_Categoricos.keys()
try:
os.makedirs("./%s/Sta"%(path_model), exist_ok = True)
except OSError as error:
pass
def plotVioin(Sal,Listqueries):
NewData=pd.DataFrame.from_dict(Sal)
plt=sns.violinplot(data=NewData, x="model", y="distance", hue="copy_test",fill=False,inner=None,width=0.5)
plt.set_xticklabels(plt.get_xticklabels(), rotation=45,horizontalalignment='right')
fig=plt.get_figure()
fig.set_size_inches(17.7, 12.27)
fig.savefig('./%s/%s/Sta/sns_violin_plot%s.png'%(nameModel,baseModel,Listqueries), dpi=300)
NewData.to_csv("./%s/%s/Sta/NewData%s.csv"%(nameModel,baseModel,Listqueries))
def plotViointime(Sal,Listqueries):
NewData=pd.DataFrame.from_dict(Sal)
plt=sns.violinplot(data=NewData, x="model", y="time",fill=False,inner=None,width=0.5)
plt.set_xticklabels(plt.get_xticklabels(), rotation=45,horizontalalignment='right')
fig=plt.get_figure()
fig.set_size_inches(17.7, 12.27)
fig.savefig('./%s/%s/Sta/sns_violin_plot_time%s.png'%(nameModel,baseModel,Listqueries), dpi=300)
NewData.to_csv("./%s/%s/Sta/NewData%s.csv"%(nameModel,baseModel,Listqueries))
def queries_CatPlot(Listqueries):
Sal=[]
queries=queries_Categoricos[Listqueries]
for model in models:
for copy_text in copies_text:
global emb
emb=loadmodelEmb(model_name = model)
emb2=CustomEmbedding()
emb2.embed_query("test 123321")
sal=[]
for query in queries:
t=time.time()
A={"model":model,
"query":query,
"copy_test":copy_text,
"distance":distance.cosine(emb2.embed_query(query),emb2.embed_query(copy_text)),
"time":time.time()-t
}
Sal.append(A)
if args.distance=="distance":
plotVioin(Sal,Listqueries)
if args.distance=="time":
plotViointime(Sal,Listqueries)
def queries_CatSta():
Sal=[]
for model in models:
for copy_text in copies_text:
global emb
#print(2222,model)
emb=loadmodelEmb(model_name = model+"/model")
emb2=CustomEmbedding()
emb2.embed_query("test 123321")
Sal=[]
for query in queries_Categoricos[copy_text]:
t=time.time()
A={"model":model,
"query":query,
"type":"insider",
"copy_test":copy_text,
"distance":distance.cosine(emb2.embed_query(query),emb2.embed_query(copy_text)),
"time":time.time()-t
}
Sal.append(A)
outdata=set(queries_Categoricos.keys())
outdata.remove(copy_text)
for query in outdata:
t=time.time()
A={"model":model,
"query":query,
"type":"outsider_n1",
"copy_test":copy_text,
"distance":distance.cosine(emb2.embed_query(query),emb2.embed_query(copy_text)),
"time":time.time()-t
}
Sal.append(A)
outdata2=queries_Categoricos[query]
for query2 in outdata2:
t=time.time()
A={"model":model,
"query":query2,
"type":"outsider_n2",
"copy_test":copy_text,
"distance":distance.cosine(emb2.embed_query(query2),emb2.embed_query(copy_text)),
"time":time.time()-t
}
Sal.append(A)
df=pd.DataFrame(Sal)
df.to_csv("./%s/Sta/NewData%s.csv"%(path_model,copy_text[0:50]))
return Sal
def queries_CatSta_in(queries_Categoricos,model="embeddings/all-mpnet-base-v2"):
global emb
emb=loadmodelEmb(model_name = model)
emb2=CustomEmbedding()
emb2.embed_query("test 123321")
Sal=[]
for objetive in queries_Categoricos.keys():
for query in queries_Categoricos[objetive]:
t=time.time()
A={"model":model,
"query":query,
"type":"insider",
"objetive":objetive,
"distance":distance.cosine(emb2.embed_query(query),emb2.embed_query(objetive)),
"time":time.time()-t
}
Sal.append(A)
return Sal
def queries_CatSta_out(queries_Categoricos,model="embeddings/all-mpnet-base-v2"):
global emb
emb=loadmodelEmb(model_name = model)
emb2=CustomEmbedding()
emb2.embed_query("test 123321")
Sal=[]
for objetive in queries_Categoricos.keys():
outdata=set(queries_Categoricos.keys())
outdata.remove(objetive)
for outdataObj in list(outdata):
for query in queries_Categoricos[outdataObj]:
t=time.time()
A={"model":model,
"query":query,
"type":"outsider",
"objetive":objetive,
"distance":distance.cosine(emb2.embed_query(query),emb2.embed_query(objetive)),
"time":time.time()-t
}
Sal.append(A)
return Sal
queries_CatSta()
def evalDb(text,dbs):
AllData=FinderDbs(text,dbs,filtred=5)
print(AllData)
if AllData:
AllData = list(AllData)
dis=[]
id=[]
for k,i in enumerate(AllData[0].items()):
dis.append(str(i[1]['d']))
id.append(i[0])
return dis,id
def EvalClass(dbs):
valid_path = Path(extractConfig(dataOut="valid_dataset_Class"))
with open(valid_path, 'r', encoding='utf-8') as file:
queries_Categoricos = json.load(file)
Sal = []
for i in queries_Categoricos.keys():
for j in queries_Categoricos[i]:
i=unidecode(i).strip().lower()
j=unidecode(j).strip().lower()
score = 1.0
dis,id=evalDb(j,dbs)
try:
pass
#print(j,i,id, dis[0])
except:
pass
#print(j,i,id)
Top8=0
Top1=0
Distancia=99
if int(i) in id:
Top8=1
try:
if int(i)==id[0]:
Top1=1
Distancia=dis[0]
except:
pass
Sal.append([j,i,Top8,Top1,Distancia])
df=pd.DataFrame(Sal,columns=['query', 'IdDb', 'Top8',"Top1","dist"])
df.to_csv("./%s/Sta/EvalClass.csv"%(path_model))
#queries_CatPlot(copies_text)
nltk.download('stopwords')
#llm,emb=loadModels()
model=models[0]
#print(model)
documents,documents2=loadCopysAndData()
emb=loadmodelEmb(model_name = model+"/model")
emb2=CustomEmbedding()
db=makeFaissdb(documents,"Copies3",emb2)
db2=makeFaissdb(documents2,"Intentionality3",emb2)
EvalClass([db,db2])

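A small post-hoc sketch (not part of the diff) for summarizing the EvalClass.csv that metrics.py writes above; the column names come from the EvalClass() function, and path_model is a placeholder for the path_model value in experiment_config.json.
import pandas as pd

path_model = "Modelo_embedding_Mexico_Puebla_hiiamasid/base"  # placeholder: use path_model from experiment_config.json
# EvalClass() above writes columns: query, IdDb, Top8, Top1, dist (index column first)
df = pd.read_csv("./%s/Sta/EvalClass.csv" % path_model, index_col=0)
print("Top-1 accuracy:", df["Top1"].mean())
print("Top-8 accuracy:", df["Top8"].mean())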
34
models.py Normal file
View File

@@ -0,0 +1,34 @@
from pydal import DAL, Field
import datetime
dbcopies = DAL('sqlite://storage.sqlite',
pool_size=10,
migrate_enabled=False,
folder='data/raw/databases'
)
dbvotes = DAL('sqlite://storage.db',
pool_size=10,
migrate_enabled=False,
folder='data/raw/databases'
)
dbvotes.define_table('votes',
Field("id"),
Field("message"),
Field("copy_id"),
Field("vote"))
dbvotes.define_table('messages',
Field("id"),
Field("message"))
dbcopies.define_table('copies',
Field("id"),
Field("name"),
Field("copy_message"),
Field("copy_help"),
Field("display_name"),
Field("intentionality"),
Field("context"),
Field("more_info"))

View File

@@ -1,88 +1,62 @@
aiofiles==23.2.1 aiofiles==23.2.1
aiohttp==3.8.5 aiohttp==3.8.5
aiosignal==1.3.1 aiosignal==1.3.1
altair==5.1.1 altair==5.1.2
annotated-types==0.5.0 annotated-types==0.5.0
anyio==3.7.1 anyio==3.7.1
async-timeout==4.0.3 async-timeout==4.0.3
attrs==23.1.0 attrs==23.1.0
beautifulsoup4==4.12.2
certifi==2023.7.22 certifi==2023.7.22
chardet==5.2.0
charset-normalizer==3.2.0 charset-normalizer==3.2.0
clean-text==0.4.0
click==8.1.7 click==8.1.7
cmake==3.27.5 cmake==3.27.5
contourpy==1.1.1 contourpy==1.1.1
cycler==0.11.0 cycler==0.12.0
dataclasses-json==0.6.0 dataclasses-json==0.6.1
emoji==2.8.0
exceptiongroup==1.1.3
faiss-cpu==1.7.4 faiss-cpu==1.7.4
fastapi==0.103.1 fastapi==0.103.2
ffmpy==0.3.1 ffmpy==0.3.1
filelock==3.12.4 filelock==3.12.4
filetype==1.2.0 fonttools==4.43.0
fonttools==4.42.1
frozenlist==1.4.0 frozenlist==1.4.0
fsspec==2023.9.1 fsspec==2023.9.2
ftfy==6.1.1
gpt4all==1.0.12
gradio==3.44.4
gradio_client==0.5.1
greenlet==2.0.2 greenlet==2.0.2
h11==0.14.0 h11==0.14.0
httpcore==0.18.0 httpcore==0.18.0
httptools==0.6.0
httpx==0.25.0 httpx==0.25.0
huggingface-hub==0.17.2 huggingface-hub==0.17.3
idna==3.4 idna==3.4
importlib-metadata==6.8.0
importlib-resources==6.1.0 importlib-resources==6.1.0
Jinja2==3.1.2 Jinja2==3.1.2
joblib==1.3.2 joblib==1.3.2
jsonpatch==1.33
jsonpointer==2.4
jsonschema==4.19.1 jsonschema==4.19.1
jsonschema-specifications==2023.7.1 jsonschema-specifications==2023.7.1
kiwisolver==1.4.5 kiwisolver==1.4.5
langchain==0.0.297 langchain==0.0.304
langsmith==0.0.38 langsmith==0.0.41
lit==16.0.6 lit==17.0.1
lxml==4.9.3
Markdown==3.4.4 Markdown==3.4.4
MarkupSafe==2.1.3 MarkupSafe==2.1.3
marshmallow==3.20.1 marshmallow==3.20.1
matplotlib==3.7.3 matplotlib==3.8.0
mpmath==1.3.0 mpmath==1.3.0
multidict==6.0.4 multidict==6.0.4
mypy-extensions==1.0.0 mypy-extensions==1.0.0
networkx==3.1 networkx==3.1
nltk==3.8.1 nltk==3.8.1
numexpr==2.8.6 numexpr==2.8.7
numpy==1.24.4 numpy==1.26.0
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-cupti-cu11==11.7.101
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
nvidia-cufft-cu11==10.9.0.58
nvidia-curand-cu11==10.2.10.91
nvidia-cusolver-cu11==11.4.0.1
nvidia-cusparse-cu11==11.7.4.91
nvidia-nccl-cu11==2.14.3
nvidia-nvtx-cu11==11.7.91
orjson==3.9.7 orjson==3.9.7
packaging==23.1 packaging==23.1
pandas==2.0.3 pandas==2.1.1
Pillow==10.0.1 Pillow==10.0.1
pkgutil_resolve_name==1.3.10 pydantic==2.4.2
pydantic==2.3.0 pydantic_core==2.10.1
pydantic_core==2.6.3
pydub==0.25.1 pydub==0.25.1
pyparsing==3.1.1 pyparsing==3.1.1
python-dateutil==2.8.2 python-dateutil==2.8.2
python-dotenv==1.0.0
python-iso639==2023.6.15
python-magic==0.4.27
python-multipart==0.0.6 python-multipart==0.0.6
pytz==2023.3.post1 pytz==2023.3.post1
PyYAML==6.0.1 PyYAML==6.0.1
@ -91,18 +65,17 @@ regex==2023.8.8
requests==2.31.0 requests==2.31.0
rpds-py==0.10.3 rpds-py==0.10.3
safetensors==0.3.3 safetensors==0.3.3
scikit-learn==1.3.0 scikit-learn==1.3.1
scipy==1.10.1 scipy==1.11.3
seaborn==0.13.0
semantic-version==2.10.0 semantic-version==2.10.0
sentence-transformers==2.2.2 sentence-transformers==2.2.2
sentencepiece==0.1.99 sentencepiece==0.1.99
six==1.16.0 six==1.16.0
sniffio==1.3.0 sniffio==1.3.0
soupsieve==2.5
SQLAlchemy==2.0.21 SQLAlchemy==2.0.21
starlette==0.27.0 starlette==0.27.0
sympy==1.12 sympy==1.12
tabulate==0.9.0
tenacity==8.2.3 tenacity==8.2.3
threadpoolctl==3.2.0 threadpoolctl==3.2.0
tokenizers==0.13.3 tokenizers==0.13.3
@ -110,18 +83,13 @@ toolz==0.12.0
torch==2.0.1 torch==2.0.1
torchvision==0.15.2 torchvision==0.15.2
tqdm==4.66.1 tqdm==4.66.1
transformers==4.33.2 transformers==4.33.3
triton==2.0.0 triton==2.0.0
typing-inspect==0.9.0 typing-inspect==0.9.0
typing_extensions==4.8.0 typing_extensions==4.8.0
tzdata==2023.3 tzdata==2023.3
Unidecode==1.3.6 Unidecode==1.3.7
unstructured==0.10.16
urllib3==2.0.5 urllib3==2.0.5
uvicorn==0.23.2 uvicorn==0.23.2
uvloop==0.17.0
watchfiles==0.20.0
wcwidth==0.2.6
websockets==11.0.3 websockets==11.0.3
yarl==1.9.2 yarl==1.9.2
zipp==3.17.0

191
requirementsnlp.txt Normal file
View File

@@ -0,0 +1,191 @@
aiofiles==23.2.1
aiohttp==3.8.6
aiosignal==1.3.1
alembic==1.12.1
altair==5.1.2
annotated-types==0.6.0
antlr4-python3-runtime==4.9.3
anyio==3.7.1
asteroid-filterbanks==0.4.0
async-timeout==4.0.3
attrs==23.1.0
audioread==3.0.1
cachetools==5.3.2
certifi==2023.7.22
cffi==1.16.0
charset-normalizer==3.3.1
click==8.1.7
cmake==3.27.7
colorama==0.4.6
coloredlogs==15.0.1
colorlog==6.7.0
contourpy==1.1.1
cycler==0.12.1
dataclasses-json==0.6.1
datasets==2.14.6
decorator==5.1.1
dill==0.3.7
docopt==0.6.2
einops==0.7.0
faiss-cpu==1.7.4
fastapi==0.104.0
ffmpy==0.3.1
filelock==3.12.4
flatbuffers==23.5.26
fonttools==4.43.1
frozenlist==1.4.0
fsspec==2023.10.0
google-api-core==2.12.0
google-auth==2.23.4
google-cloud==0.34.0
google-cloud-speech==2.22.0
googleapis-common-protos==1.61.0
gradio==3.46.1
gradio_client==0.5.3
greenlet==3.0.1
grpcio==1.59.2
grpcio-status==1.59.2
h11==0.14.0
httpcore==0.18.0
httptools==0.6.0
httpx==0.25.0
huggingface-hub==0.17.3
humanfriendly==10.0
HyperPyYAML==1.2.2
idna==3.4
importlib-resources==6.1.0
Jinja2==3.1.2
joblib==1.3.2
jsonpatch==1.33
jsonpointer==2.4
jsonschema==4.19.1
jsonschema-specifications==2023.7.1
julius==0.2.7
kiwisolver==1.4.5
langchain==0.0.304
langsmith==0.0.41
lazy_loader==0.3
librosa==0.10.1
lightning==2.1.0
lightning-utilities==0.9.0
lit==17.0.3
llvmlite==0.41.1
Mako==1.2.4
Markdown==3.4.4
markdown-it-py==3.0.0
MarkupSafe==2.1.2
marshmallow==3.20.1
matplotlib==3.8.0
mdurl==0.1.2
more-itertools==10.1.0
mpmath==1.3.0
msgpack==1.0.7
multidict==6.0.4
multiprocess==0.70.15
mypy-extensions==1.0.0
networkx==3.0
nltk==3.8.1
numba==0.58.1
numexpr==2.8.7
numpy==1.26.1
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-cupti-cu11==11.7.101
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
nvidia-cufft-cu11==10.9.0.58
nvidia-curand-cu11==10.2.10.91
nvidia-cusolver-cu11==11.4.0.1
nvidia-cusparse-cu11==11.7.4.91
nvidia-nccl-cu11==2.14.3
nvidia-nvtx-cu11==11.7.91
omegaconf==2.3.0
onnxruntime-gpu==1.16.1
openai-whisper==20230918
optuna==3.4.0
orjson==3.9.7
packaging==23.2
pandas==2.1.1
pika==1.3.2
Pillow==9.3.0
platformdirs==3.11.0
pooch==1.8.0
primePy==1.3
proto-plus==1.22.3
protobuf==4.24.4
pyannote.audio==3.0.1
pyannote.core==5.0.0
pyannote.database==5.0.1
pyannote.metrics==3.2.1
pyannote.pipeline==3.0.1
pyarrow==13.0.0
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21
pydantic==2.4.2
pydantic_core==2.10.1
pydub==0.25.1
Pygments==2.16.1
pyparsing==3.1.1
python-dateutil==2.8.2
python-dotenv==1.0.0
python-multipart==0.0.6
pytorch-lightning==2.1.0
pytorch-metric-learning==2.3.0
pytz==2023.3.post1
PyYAML==6.0.1
referencing==0.30.2
regex==2023.10.3
requests==2.31.0
rich==13.6.0
rpds-py==0.10.3
rsa==4.9
ruamel.yaml==0.18.2
ruamel.yaml.clib==0.2.8
safetensors==0.4.0
scikit-learn==1.3.2
scipy==1.11.3
seaborn==0.13.0
semantic-version==2.10.0
semver==3.0.2
sentence-transformers==2.2.2
sentencepiece==0.1.99
shellingham==1.5.4
six==1.16.0
sniffio==1.3.0
sortedcontainers==2.4.0
soundfile==0.12.1
soxr==0.3.7
speechbrain==0.5.15
SQLAlchemy==2.0.22
starlette==0.27.0
sympy==1.12
tabulate==0.9.0
tenacity==8.2.3
tensorboardX==2.6.2.2
threadpoolctl==3.2.0
tiktoken==0.3.3
tokenizer==3.4.3
tokenizers==0.14.1
toolz==0.12.0
torch==2.0.1
torch-audiomentations==0.11.0
torch-pitch-shift==1.2.4
torchaudio==2.0.2+cpu
torchmetrics==1.2.0
torchvision==0.15.2
tqdm==4.66.1
transformers==4.34.1
triton==2.0.0
typer==0.9.0
typing-inspect==0.9.0
typing_extensions==4.8.0
tzdata==2023.3
Unidecode==1.3.7
urllib3==2.0.7
uvicorn==0.23.2
uvloop==0.17.0
watchfiles==0.20.0
websockets==11.0.3
xxhash==3.4.1
yarl==1.9.2

2
run.sh Normal file → Executable file
View File

@@ -1 +1 @@
-uvicorn FindinDB:app --reload --port 7860
+uvicorn main:app --reload --port 7860