From 09504d37441717c7f7e74043421dec4bac4dfdf8 Mon Sep 17 00:00:00 2001
From: marioggil
Date: Sat, 14 Oct 2023 13:03:08 -0500
Subject: [PATCH] feat: Only fastapy

---
 DownloadModels.py |  61 ----------------------------
 finetrainmodel.py |  71 ++++++++++++++++++++++++++++----
 main.py           | 100 +++++++++++++++++++---------------------------
 metrics.py        |   3 +-
 4 files changed, 108 insertions(+), 127 deletions(-)

diff --git a/DownloadModels.py b/DownloadModels.py
index b8a8218..fea9b3d 100644
--- a/DownloadModels.py
+++ b/DownloadModels.py
@@ -14,64 +14,3 @@ for model in models:
     modelST.save(save_path)
 
-# all-MiniLM-L12-v2
-
-
-
-
-# all-MiniLM-L12-v2 Saber sobre actividades culturales. 0.6535751457769086 0.05863175772626888 0.12278595510518776
-# all-MiniLM-L12-v2 Saber sobre talleres. 0.751629503845477 0.05310761464124975 0.1831973003891279
-# all-MiniLM-L12-v2 Información sobre talleres de literatura. 0.7224854452006415 0.05215076573219591 0.2247900827875677
-# all-MiniLM-L12-v2 Información sobre talleres de formación artistica. 0.7008979606232822 0.03950918605037314 0.2588270430294973
-# all-MiniLM-L12-v2 Obtener la certificación de uso de suelos. 0.6363654116990891 0.06126748264989437 0.2990496653430867
-# all-MiniLM-L12-v2 Reportar un bacheo. 0.5974184966305134 0.14056650047761457 0.33133007445425355
-# all-MiniLM-L12-v2 Saber dónde pago un parquímetro. 0.7286070458224445 0.04967551271473011 0.36476032688932597
-# paraphrase-MiniLM-L3-v2
-
-
-
-
-# paraphrase-MiniLM-L3-v2 Saber sobre actividades culturales. 0.7366279968758482 0.08893400433814432 0.011976916834993183
-# paraphrase-MiniLM-L3-v2 Saber sobre talleres. 0.8040920436803051 0.07181478379134668 0.02360300747853405
-# paraphrase-MiniLM-L3-v2 Información sobre talleres de literatura. 0.7437334052301269 0.04553266191552214 0.036959598649222894
-# paraphrase-MiniLM-L3-v2 Información sobre talleres de formación artistica. 0.743870036748493 0.06526662723048463 0.05061841460893739
-# paraphrase-MiniLM-L3-v2 Obtener la certificación de uso de suelos. 0.7717547355774438 0.06484008413761407 0.062440363865978316
-# paraphrase-MiniLM-L3-v2 Reportar un bacheo. 0.6655234266285941 0.12495720849140243 0.0751793069659539
-# paraphrase-MiniLM-L3-v2 Saber dónde pago un parquímetro. 0.7348896817507707 0.04065274263873351 0.09146604897840968
-# all-MiniLM-L6-v2
-
-
-
-
-# all-MiniLM-L6-v2 Saber sobre actividades culturales. 0.5873976949286881 0.054536409831093556 0.02166009399126161
-# all-MiniLM-L6-v2 Saber sobre talleres. 0.705393021384429 0.06415187629245482 0.040732748103591634
-# all-MiniLM-L6-v2 Información sobre talleres de literatura. 0.602608386747181 0.054022995767296696 0.06001406345727309
-# all-MiniLM-L6-v2 Información sobre talleres de formación artistica. 0.6445745034623189 0.05229467148751577 0.07957683869127957
-# all-MiniLM-L6-v2 Obtener la certificación de uso de suelos. 0.5708618561256799 0.0394827821548067 0.09872836436865465
-# all-MiniLM-L6-v2 Reportar un bacheo. 0.5741872079555271 0.13503311454160494 0.11794944529263478
-# all-MiniLM-L6-v2 Saber dónde pago un parquímetro. 0.6594361733956011 0.056983523732601314 0.13696542775855874
-# all-mpnet-base-v2
-
-
-
-
-# all-mpnet-base-v2 Saber sobre actividades culturales. 0.5534035540829121 0.06890411125329764 0.08799683372929411
-# all-mpnet-base-v2 Saber sobre talleres. 0.6346356305674484 0.05816374415416716 0.2692093669243579
-# all-mpnet-base-v2 Información sobre talleres de literatura. 0.733400957902919 0.062002638662933096 0.4069641581121481
-# all-mpnet-base-v2 Información sobre talleres de formación artistica. 0.7403190712146518 0.06573001180535122 0.5049155163315108
-# all-mpnet-base-v2 Obtener la certificación de uso de suelos. 0.5057200806006308 0.040962860644441684 0.5848623266759908
-# all-mpnet-base-v2 Reportar un bacheo. 0.4282261685120943 0.1256224113877856 0.6733528038240829
-# all-mpnet-base-v2 Saber dónde pago un parquímetro. 0.5096540066521769 0.06542826690229307 0.7576164344571671
-# multi-qa-mpnet-base-dot-v1
-
-
-
-
-# multi-qa-mpnet-base-dot-v1 Saber sobre actividades culturales. 0.5412514848207511 0.049426306929690425 0.08471853328201007
-# multi-qa-mpnet-base-dot-v1 Saber sobre talleres. 0.6004619942650676 0.04068730180147856 0.19998745198519724
-# multi-qa-mpnet-base-dot-v1 Información sobre talleres de literatura. 0.5422846411740877 0.035149354259768846 0.3127848022388962
-# multi-qa-mpnet-base-dot-v1 Información sobre talleres de formación artistica. 0.5810213727598411 0.033041479673933366 0.3915549134308437
-# multi-qa-mpnet-base-dot-v1 Obtener la certificación de uso de suelos. 0.5171735715348054 0.029578046799246076 0.4764851579126322
-# multi-qa-mpnet-base-dot-v1 Reportar un bacheo. 0.4249011819077356 0.12119208621320086 0.559126246650264
-# multi-qa-mpnet-base-dot-v1 Saber dónde pago un parquímetro. 0.49335939772807463 0.047415340138656205 0.65184190138331
-

diff --git a/finetrainmodel.py b/finetrainmodel.py
index 4cf5990..c41eb83 100644
--- a/finetrainmodel.py
+++ b/finetrainmodel.py
@@ -1,10 +1,67 @@
+"""
 from sentence_transformers import SentenceTransformer, models
+
+## Step 1: use an existing language model
+word_embedding_model = models.Transformer('distilroberta-base')
+
+## Step 2: use a pool function over the token embeddings
+pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+
+## Join steps 1 and 2 using the modules argument
+model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+
+from sentence_transformers import InputExample
+
+from datasets import load_dataset
+
+dataset_id = "embedding-data/QQP_triplets"
+# dataset_id = "embedding-data/sentence-compression"
+
+dataset = load_dataset(dataset_id)
+
+
+train_examples = []
+train_data = dataset['train']['set']
+# For agility we only 1/2 of our available data
+n_examples = dataset['train'].num_rows // 2
+
+for i in range(10):
+    example = train_data[i]
+    train_examples.append(InputExample(texts=[example['query'], example['pos'][0]]))
+
+
+
+
+
+
+from torch.utils.data import DataLoader
+
+train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
+
+
+from sentence_transformers import losses
+
+train_loss = losses.MultipleNegativesRankingLoss(model=model)
+
+num_epochs = 10
+
+warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) #10% of train data
+
+model.fit(train_objectives=[(train_dataloader, train_loss)],epochs=num_epochs,warmup_steps=2)
+
+
+"""
 from sentence_transformers import SentenceTransformer, losses, InputExample
 from torch.utils.data import DataLoader
+
+
+
+
 model="embeddings/all-mpnet-base-v2"
 modelST = SentenceTransformer(model)
-train_loss = losses.MultipleNegativesRankingLoss(model=model)
+train_loss = losses.MultipleNegativesRankingLoss(model=modelST)
 
 queries=["reportar un bache en mi comunidad",
     "¿Como reporto un bacheo en mi comunidad?",
@@ -62,15 +119,15 @@ queries=["reportar un bache en mi comunidad",
 
 
 
 train_examples = []
 for q in queries:
-    train_examples.append(InputExample(texts=[q, 'Reportar un bacheo']))
+    train_examples.append(InputExample(texts=[ 'Reportar un bacheo',q]))
 print(train_examples)
-train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=5)
+train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
 print(train_dataloader)
 num_epochs = 2
-
 warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) #10% of train data
-modelST.fit(train_objectives=[(train_dataloader, train_loss)],
-    epochs=num_epochs,
-    warmup_steps=warmup_steps)
+modelST.fit(train_objectives=[(train_dataloader, train_loss)],epochs=num_epochs,warmup_steps=2)
+save_path = './Finetuning/%s/'%(model)
+# Save the model
+modelST.save(save_path)
\ No newline at end of file
diff --git a/main.py b/main.py
index 7e5caf1..2fbd4cf 100644
--- a/main.py
+++ b/main.py
@@ -11,7 +11,7 @@ from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
 from langchain.document_loaders import UnstructuredURLLoader
 from langchain.document_loaders.csv_loader import CSVLoader
 #from langchain import LLMChain
-from langchain.pydantic_v1 import BaseModel
+from pydantic import BaseModel
 from langchain.schema.embeddings import Embeddings
 from langchain.document_loaders import DataFrameLoader
 from langchain.embeddings import HuggingFaceEmbeddings
@@ -25,6 +25,9 @@ from nltk.corpus import stopwords
 import re
 model="embeddings/all-mpnet-base-v2"
 entrenamiento="V0.0"
+
+
+
 class CustomEmbedding(Embeddings, BaseModel,):
     """embedding model with preprocessing"""
     def _get_embedding(self,text) -> List[float]:
@@ -92,14 +95,6 @@ def loadmodelEmb(model_name = "embeddings/all-MiniLM-L6-v2",model_kwargs = {'dev
     st = SentenceTransformer(model_name)
     return st
 
-#emb=loadmodelEmb()
-CUSTOM_PATH = "/angela"
-app = FastAPI()
-
-
-@app.get("/")
-def read_main():
-    return {"message": "This is your main app"}
 
 def loadCopysAndData(pathsqlite="motor.sqlite"):
     con = sqlite3.connect(pathsqlite)
@@ -132,6 +127,10 @@ db=makeFaissdb(documents,"Copies3",emb2)
 db2=makeFaissdb(documents2,"Intencionality3",emb2)
 #db3=makeFaissdb(documents2,"nameshf",hf)
 
+
+
+
+
 def FinderDbs(query,dbs,filtred=False,th=1.2):
     AllData={}
     for dbt in dbs:
@@ -148,7 +147,7 @@ def FinderDbs(query,dbs,filtred=False,th=1.2):
     if filtred:
         filtredData={}
         for row in AllData.keys():
-            if AllData[row]["d"]<1.2:
+            if AllData[row]["d"]<th:
                 filtredData[row]=AllData[row]
         return filtredData,True
     return AllData,False
@@ ... @@
-        titulo = f"Respuesta {k+1}"
-        to_append = markdown.markdown(i[1]['page_content'])
         dis.append(str(i[1]['d']))
         id.append(i[0])
-
-        #print("NNNN",i,k)
-        lista = lista + titulo + to_append + ''
-        #lista.append('')
-
-    AllData[0] = lista
-
-
-    return id, dis,versionL
+    return {"ids": id,"DC":dis,"modelo":versionL}
 
-with gr.Blocks() as demo:
-    gr.Image("logo.jpg",height=100)
-    gr.Markdown("Esta es la busqueda que hace el usuario")
-    Pregunta = gr.Textbox(label="Pregunta")
-    #Pregunta = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", Pregunta)
-    #Pregunta=Pregunta.strip().lower()
-
-    filtred=gr.Checkbox(label="filtrado")
-
-    gr.Markdown("Respuestas para orca desde los copys")
-    Respuesta = gr.Textbox(label="Respuesta")
-    id = gr.Textbox(label="id")
-    metrica=gr.Textbox(label="metrica")
-
-    version = gr.Textbox(label="version")
-    # id2 = gr.Textbox(label="id2")
-    # metrica2=gr.Textbox(label="metrica2")
-    # gr.Markdown("Respuestas para hf desde los names")
-    # Respuesta3 = gr.Textbox(label="Respuesta3")
-    # id3 = gr.Textbox(label="id3")
-    # metrica3=gr.Textbox(label="metrica3")
-    Enviar_btn = gr.Button("Responder")
-    Enviar_btn.click(fn=QARequest, inputs=[Pregunta,filtred], outputs=[id,metrica,version], api_name="api_angela") #
-
-#demo.launch(root_path="angela") #
-
-gradio_app = gr.routes.App.create_app(demo)
-
-app.mount(CUSTOM_PATH, gradio_app)
-
-#app = demo.mount_gradio_app(app, io, path=CUSTOM_PATH)
diff --git a/metrics.py b/metrics.py
index 4b407ae..ffcace9 100644
--- a/metrics.py
+++ b/metrics.py
@@ -143,7 +143,8 @@ def FinderDbs(query,dbs,filtred=False,th=1.2):
 if args.models=="All":
     models=["all-MiniLM-L12-v2","paraphrase-MiniLM-L3-v2" , "all-MiniLM-L6-v2","all-mpnet-base-v2","multi-qa-mpnet-base-dot-v1"]
 else:
-    models=["all-mpnet-base-v2"]
+    models=["embeddings/all-mpnet-base-v2","Finetuning/embeddings/all-mpnet-base-v2"]
+
 queries_bacheo=["Quiero reportar un bacheo",
     "reportar un bache en mi comunidad",
     "¿Como reporto un bacheo en mi comunidad?",
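
Note (reviewer addition, not part of the patch): with this change finetrainmodel.py saves the fine-tuned weights under ./Finetuning/embeddings/all-mpnet-base-v2/, which is exactly the second checkpoint metrics.py now evaluates next to the base embeddings/all-mpnet-base-v2. Below is a minimal sketch of how the two checkpoints could be compared on the trained intent. The two paths and the two texts come from the diff above; the script itself and the use of sentence_transformers.util.cos_sim are illustrative and are not added by this patch.

# compare_checkpoints.py -- hypothetical helper, not included in this patch
from sentence_transformers import SentenceTransformer, util

# Paths follow save_path = './Finetuning/%s/' % model with model = "embeddings/all-mpnet-base-v2"
base = SentenceTransformer("embeddings/all-mpnet-base-v2")
tuned = SentenceTransformer("Finetuning/embeddings/all-mpnet-base-v2")

copy = "Reportar un bacheo"                          # anchor text used in every InputExample
query = "¿Como reporto un bacheo en mi comunidad?"   # one of the training queries

for name, m in (("base", base), ("fine-tuned", tuned)):
    # cosine similarity between the query and the copy under each checkpoint
    sim = util.cos_sim(m.encode(query), m.encode(copy)).item()
    print(name, round(sim, 4))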