diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d910e66
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+env/*
+databases/storage.db
+4b751a4425c2884286a92fde2de6427f_trusted.table
+4b751a4425c2884286a92fde2de6427f_analitic.table
+4b751a4425c2884286a92fde2de6427f_analitic_voice.table
+4b751a4425c2884286a92fde2de6427f_analitic_llm.table
+4b751a4425c2884286a92fde2de6427f_analitic_ocr.table
+.vscode/*
+__pycache__/*
+
diff --git a/apis.py b/apis.py
new file mode 100644
index 0000000..1b548e0
--- /dev/null
+++ b/apis.py
@@ -0,0 +1,706 @@
+import fastapi
+from fastapi import FastAPI, Request
+from fastapi.responses import HTMLResponse
+from pydantic import BaseModel
+import time
+from fastapi.staticfiles import StaticFiles
+from fastapi import FastAPI, Query, File, UploadFile
+#from fastapi.middleware.cors import CORSMiddleware
+from starlette.middleware.cors import CORSMiddleware
+import main
+import os
+from databases import db
+import audioread
+import pandas as pd
+import statistics
+pwd = os.getcwd()
+pathAud="example/audio"
+pathFact="example/factura"
+app = FastAPI()
+#app.mount("/statics", StaticFiles(directory="statics"), name="statics")
+app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"],
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+)
+
+class Response(BaseModel):
+    """Request body for the speech-to-text evaluation endpoints.
+    """
+    path: str = Query("", description="Path to the audio file to evaluate")
+    Trusted: str = Query("", description="Trusted reference transcription")
+    model : str = Query("whisper", description="Speech model to use: 'whisper' or vosk")
+class Response1(BaseModel):
+ path: str = Query("", description="path file")
+ task_prompt: str = Query("", description="task of model")
+ model: str = Query("", description="model")
+    TrustedOCR: str = Query("", description="trusted OCR reference")
+ option: str = Query("", description="OCR model option")
+class Response2(BaseModel):
+ path: str = Query("", description="path file")
+ task_prompt: str = Query("", description="task of model")
+ system: str = Query("", description="prompt system LLM model with ocr and image claude")
+ content: str = Query("%s", description="prompt content LLM model with ocr")
+ max_tokens: int = Query(1024, description="maxtoken LLM OCR model")
+ model: str = Query("Claude-sonnet", description="model")
+ prompt: str = Query("", description="prompt in claude with image")
+    TrustedLLmjson: str = Query("", description="trusted LLM JSON reference")
+
+class Response3(BaseModel):
+    """Request body for registering/updating a trusted reference value.
+    """
+    path: str = Query("", description="Path of the evaluated file")
+    Trusted: str = Query("", description="Trusted reference value (transcription or OCR text)")
+    mode : str = Query("whisper", description="Evaluation mode, e.g. 'voice' or 'OCR'")
+
+
+@app.get("/addTrusted")
+@app.post("/addTrusted")
+def addTrusted(response:Response3):
+ path=response.path
+ Trusted=response.Trusted
+ mode=response.mode
+ file_stats = os.stat(path)
+ size=file_stats.st_size / (1024 * 1024)
+ if mode=="voice":
+ with audioread.audio_open(path) as f:
+ duration = f.duration
+ else:
+ duration = 0
+    if db((db.trusted.path == path) & (db.trusted.mode == mode)).count()==0:
+ db.trusted.insert(path=path,trusted=Trusted,mode=mode,size=size,duration =duration )
+ db.commit()
+ return "Add %s in mode %s"%(path,mode)
+ else:
+        db((db.trusted.path == path) & (db.trusted.mode == mode)).update(trusted=Trusted,size=size,duration =duration )
+ db.commit()
+ return "Update %s in mode %s"%(path,mode)
+
+def list2tablehtml(listdata,model):
+ html="""
Table of {0}
+
+
+ path |
+ time |
+ similarity |
+ similaritypartial |
+
""".format(model)
+
+ for i in listdata:
+ html=html+"""
+ %s |
+ %s |
+ %s |
+ %s |
+
+"""%(i["path"],i["time"],i["similarity"],i["similaritypartial"])
+ html=html+"""
+ """
+ return html
+
+
+def tableVoice(model):
+ rows = db(db.analitic_voice.model==model).select()
+ rows_list = rows.as_list()
+ data=pd.DataFrame(rows_list)
+ durationL=list()
+ for i in rows_list:
+ durationL.append(db(db.trusted.path == i["path"] ).select().last().duration)
+ duration=statistics.mean(durationL)
+ time=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['time'].values[0]
+ similarity=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similarity'].values[0]
+ similaritypartial=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similaritypartial'].values[0]
+ efectivetime=time/duration
+ card="""
+
+
+
+
time of process (sg)
+
{1}
+
similarity
+
{2}
+
similaritypartial
+
{3}
+
time of audio(sg)
+
{4}
+
time in process
+
{5}
+
+
+
""".format(model,time,similarity,similaritypartial,duration,efectivetime)
+ return {"duration":duration,"time":time,"similarity":similarity,"similaritypartial":similaritypartial,"card":card,"data":list2tablehtml(rows_list,model)}
+
+
+@app.get("/getmetricsvoice")
+def getMetricsVoice():
+    # collect the distinct models recorded in analitic_voice
+ models=list()
+ for row in db().select(db.analitic_voice.model, distinct=True):
+ models.append(row.model)
+ cards=""
+ dataAll=""
+ for model in models:
+
+ Sal=tableVoice(model)
+ cards=cards+Sal["card"]
+ dataAll=dataAll+Sal["data"]
+
+
+ htmlhead="""
+
+
+
+
+ Evaluacion de modelos voice2txt
+
+"""
+
+ htmlbody="""
+ Estadisticas modelos de voice
+
+ {0}
+
+ {1}
+
+
+ """.format(cards,dataAll)
+ html=htmlhead+htmlbody
+ return HTMLResponse(content=html, status_code=200)
+
+
+@app.get("/EvalVoice")
+@app.post("/EvalVoice")
+def EvalVoice(response:Response):
+ path=response.path
+ Trusted=response.Trusted
+ model=response.model
+ if Trusted=="":
+        row=db((db.trusted.path == path) & (db.trusted.mode == "voice")).select().first()
+ try:
+ Trusted=row.trusted
+ except:
+ pass
+
+ if model=="whisper":
+ Sal=main.EvalWhisper(path,Trusted)
+ else:
+ Sal=main.EvalVosk(path,Trusted)
+    if db((db.analitic_voice.path == Sal["path"]) & (db.analitic_voice.model == Sal["model"])).count()==0:
+ db.analitic_voice.insert(**Sal)
+ db.commit()
+ else:
+        db((db.analitic_voice.path == Sal["path"]) & (db.analitic_voice.model == Sal["model"])).update(similarity= Sal["similarity"],similaritypartial= Sal["similaritypartial"])
+ db.commit()
+ return Sal
+
+@app.get("/EvalFact")
+@app.post("/EvalFact")
+def EvalFact(response:Response1):
+ path=response.path
+ task_prompt=response.task_prompt
+    option=response.option
+ TrustedOCR=response.TrustedOCR
+ Trusted=TrustedOCR
+ if task_prompt=="":
+ if Trusted=="":
+            row=db((db.trusted.path == path) & (db.trusted.mode == "OCR")).select().first()
+ try:
+ Trusted=row.trusted
+ except:
+ pass
+    Sal=main.EvalFacturas(path,task_prompt,Trusted,option)
+ Sal["path"]=path
+    if db((db.analitic_ocr.path == Sal["path"]) & (db.analitic_ocr.model == Sal["model"])).count()==0:
+ db.analitic_ocr.insert(**Sal)
+ db.commit()
+ else:
+        db((db.analitic_ocr.path == Sal["path"]) & (db.analitic_ocr.model == Sal["model"])).update(similarity= Sal["similarity"],similaritypartial= Sal["similaritypartial"],jsonok=Sal["jsonok"])
+ db.commit()
+
+
+ return Sal
+
+@app.get("/EvalLLMFact")
+@app.post("/EvalLLMFact")
+def EvalLLMFact(response:Response2):
+ path=response.path
+ task_prompt=response.task_prompt
+ system=response.system
+ content=response.content
+ max_tokens=response.max_tokens
+ model=response.model
+ prompt=response.prompt
+ TrustedLLmjson=response.TrustedLLmjson
+
+ Sal=main.EvalllmFacturas(path,task_prompt,system,content,max_tokens,model,prompt,TrustedLLmjson)
+ return Sal
+
+@app.get("/evalvoicehtml")
+def EvalVoicehtml():
+ dir_list = os.listdir(pathAud)
+ Sal=""
+ t=1
+ for i in dir_list:
+
+ temp="""
+ """%(str(pwd+"/"+pathAud+"/"+i),str(t),str(i))
+ Sal=Sal+temp
+ t=t+1
+
+
+ html="""
+
+
+
+
+ Evaluacion de modelos voice2txt
+
+
+
+ Petición POST a API
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ """%(Sal)
+ return HTMLResponse(content=html, status_code=200)
+
+
+@app.get("/evalocrfactura")
+def EvalOCRFactura():
+ dir_list = os.listdir(pathFact)
+ Sal=""
+ t=1
+ for i in dir_list:
+ temp="""
+ """%(str(pwd+"/"+pathFact+"/"+i),str(t),str(i))
+ Sal=Sal+temp
+ t=t+1
+ html="""
+
+
+
+
+ Evaluacion de modelos OCR
+
+
+
+ Petición POST a API
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ """%(Sal)
+ return HTMLResponse(content=html, status_code=200)
+
+def list2tablehtmlOCR(listdata,model):
+ html="""Table of {0}
+
+
+ path |
+ time |
+ similarity |
+ similaritypartial |
+
""".format(model)
+
+ for i in listdata:
+ html=html+"""
+ %s |
+ %s |
+ %s |
+ %s |
+
+"""%(i["path"],i["time"],i["similarity"],i["similaritypartial"])
+ html=html+"""
+ """
+ return html
+
+
+def tableOCR(model):
+ rows = db(db.analitic_ocr.model==model).select()
+ rows_list = rows.as_list()
+ data=pd.DataFrame(rows_list)
+ time=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['time'].values[0]
+ similarity=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similarity'].values[0]
+ similaritypartial=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similaritypartial'].values[0]
+ card="""
+
+
+
+
time of process (sg)
+
{1}
+
similarity
+
{2}
+
similaritypartial
+
{3}
+
+
+
""".format(model,time,similarity,similaritypartial)
+ return {"time":time,"similarity":similarity,"similaritypartial":similaritypartial,"card":card,"data":list2tablehtmlOCR(rows_list,model)}
+
+
+
+@app.get("/getmetricsocr")
+def getMetricsOCR():
+ models=list()
+ for row in db().select(db.analitic_ocr.model, distinct=True):
+ models.append(row.model)
+ cards=""
+ dataAll=""
+ for model in models:
+ Sal=tableOCR(model)
+ cards=cards+Sal["card"]
+ dataAll=dataAll+Sal["data"]
+ htmlhead="""
+
+
+
+
+ Evaluacion de modelos voice2txt
+
+"""
+
+ htmlbody="""
+ Estadisticas modelos de OCR
+
+ {0}
+
+ {1}
+
+
+ """.format(cards,dataAll)
+ html=htmlhead+htmlbody
+ return HTMLResponse(content=html, status_code=200)
+
+
+
+@app.get("/evalllmfacturas")
+def EvalllmFacturas():
+ dir_list = os.listdir(pathFact)
+ Sal=""
+ t=1
+ for i in dir_list:
+ temp="""
+ """%(str(pwd+"/"+pathFact+"/"+i),str(t),str(i))
+ Sal=Sal+temp
+ t=t+1
+ html="""
+
+
+
+
+ Evaluacion modelos LLM
+
+
+
+ Petición POST a API
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ """%(Sal,"%s")
+ return HTMLResponse(content=html, status_code=200)
\ No newline at end of file
diff --git a/databases.py b/databases.py
new file mode 100644
index 0000000..6f24fef
--- /dev/null
+++ b/databases.py
@@ -0,0 +1,43 @@
+from pydal import DAL, Field
+db = DAL("sqlite://databases/storage.db")
+db.define_table(
+ "trusted",
+ Field("path"),
+ Field("mode"),
+ Field("trusted"),
+ Field("duration",type="double"),
+ Field("size",type="double")
+)
+db.define_table(
+ "analitic_voice",
+ Field("content"),
+ Field("trusted"),
+ Field("model"),
+ Field("time", type="double"),
+ Field("path"),
+ Field("similarity", type="double"),
+ Field("similaritypartial", type="double")
+)
+
+db.define_table(
+ "analitic_ocr",
+ Field("content"),
+ Field("trusted"),
+ Field("model"),
+ Field("time", type="double"),
+ Field("path"),
+ Field("similarity", type="double"),
+ Field("similaritypartial", type="double"),
+ Field("jsonok" ,type="integer")
+)
+
+db.define_table(
+ "analitic_llm",
+ Field("content"),
+ Field("trusted"),
+ Field("model"),
+ Field("time", type="double"),
+ Field("path"),
+ Field("similarity", type="double"),
+ Field("similaritypartial", type="double")
+)
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..fc6f87d
--- /dev/null
+++ b/main.py
@@ -0,0 +1,157 @@
+import requests
+import evaluate
+import deepdiff
+import json
+from fuzzywuzzy import fuzz
+from deepdiff import DeepDiff
+from deepdiff import Delta
+import databases
+#print(evaluate.list_evaluation_modules())
+urlAud="http://127.0.0.1:7870"
+urlText="http://127.0.0.1:7869"
+password="1223Aer*"
+def EvalVoice2Text(endpoint,datajson,Trusted):
+ """Evaluate Voice 2 text
+ """
+ apiUrl=urlAud+endpoint
+ response = requests.get(apiUrl, json=datajson)
+ print(datajson)
+ A=json.loads(response.content)
+ print(A)
+ time=A['time']
+
+ similarity=fuzz.ratio( Trusted.strip().lower(),A['message'].strip().lower())
+ similarityPartial=fuzz.partial_ratio( Trusted.strip().lower(),A['message'].strip().lower())
+ path=datajson["local"]
+ model=datajson["model"]
+ message=A['message']
+ return {"content":message,
+ "trusted":Trusted,
+ "model":model,
+ "time":time,
+ "similarity":similarity,
+ "similaritypartial":similarityPartial,
+ "path":path
+ }
+
+
+def EvalWhisper(path,Trusted=""):
+ endpoint="/voice2txt"
+ datajson={"url":"","password":password ,"model":"whisper","local":path}
+ return EvalVoice2Text(endpoint,datajson,Trusted)
+
+
+# EvalWhisper(path="example/AwACAgEAAxkBAAIBw2YX8o2vGGCNtZCXk7mY1Bm5w__lAAJmBAACxe7ARI1fUWAGcz_RNAQ.ogg",
+# Trusted="Hoy compre dos medicinas Tereleji en Cruz Verde por un monto de 494 mil 400 pesos colombianos.",
+# endpoint="/voice2txt")
+
+def EvalVosk(path,Trusted=""):
+ endpoint="/voice2txtlocal"
+ datajson={"url":"","password":password ,"model":"models/vosk-model-small-es-0.42","local":path}
+ return EvalVoice2Text(endpoint,datajson,Trusted)
+
+
+
+# EvalVosk(path="example/AwACAgEAAxkBAAIBw2YX8o2vGGCNtZCXk7mY1Bm5w__lAAJmBAACxe7ARI1fUWAGcz_RNAQ.ogg",
+# Trusted="Hoy compre dos medicinas Tereleji en Cruz Verde por un monto de 494 mil 400 pesos colombianos.",
+# endpoint="/voice2txtlocal")
+
+
+def ocrfacturas(path,task_prompt):
+ apiUrl=urlText+'/parsedimage3'
+ datajson={"path":path,"task_prompt":task_prompt,"password":password}
+ response = requests.get(apiUrl, json=datajson)
+ return response.content
+
+def llmFacturas(path,task_prompt,system,content,max_tokens,model):
+ apiUrl=urlText+'/parsedimage4'
+ datajson={"path":path,"task_prompt":task_prompt,"system":system,"content":content,"max_tokens":max_tokens,"model":model,"password":password}
+ response = requests.get(apiUrl, json=datajson)
+ return response.content
+
+def llmFacturas2(path,prompt,system,model):
+ apiUrl=urlText+'/parsedimage2'
+ datajson={"path":path,"prompt":prompt,"system":system,"model":model,"password":password}
+ response = requests.get(apiUrl, json=datajson)
+ return response.content
+
+def EvalParsedImage(path="/home/mario/Repositorios/EvalDataSetHugging/example/Gmail/20240530_112812.jpg"):
+ endpoint="/parsedimage"
+ jsonT={"path":path,"password":password}
+ response=requests.get(urlText+endpoint,json=jsonT)
+ return response.content
+
+def EvalParsedImage5(path="/home/mario/Repositorios/EvalDataSetHugging/example/Gmail/20240530_112812.jpg",option="teserac"):
+ endpoint="/parsedimage5"
+ jsonT={"path":path,"password":password,"option":option}
+ response=requests.get(urlText+endpoint,json=jsonT)
+ return response.content
+
+def EvalFacturas(path,task_prompt,TrustedOCR,option=""):
+ if task_prompt=="parsed":
+ OCR=EvalParsedImage(path)
+ if task_prompt=="More Detailed Caption" or task_prompt=='OCR':
+ OCR=ocrfacturas(path,task_prompt)
+ if task_prompt=="scan":
+ OCR=EvalParsedImage5(path,option)
+ model=json.loads(OCR)["model"]
+ content=json.loads(OCR)["content"]
+ time=json.loads(OCR)["time"]
+ try:
+ TrustedOCR=json.loads(TrustedOCR)
+ jsonok=1
+ except:
+ jsonok=0
+ pass
+ similarity=fuzz.ratio( str(TrustedOCR).strip().lower(),str(content).strip().lower())
+ similarityPartial=fuzz.partial_ratio( str(TrustedOCR).strip().lower(),str(content).strip().lower())
+ return {"content":content,
+ "trusted":TrustedOCR,
+ "similarity":similarity,
+ "similaritypartial":similarityPartial,
+ "model":model,
+ "time":time,
+ "jsonok":jsonok
+ }
+def changemodel(model):
+ if model=="Claude-sonnet":
+ model="claude-3-5-sonnet-20240620"
+ elif model=="Claude-opus":
+ model="claude-3-opus-20240229"
+ elif model=="Claude-haiku":
+ model="claude-3-haiku-20240307"
+ return model
+
+def EvalllmFacturas(path,task_prompt,system,content,max_tokens,model,prompt,TrustedLLmjson):
+ model=changemodel(model)
+ if model.count("claude")>0 and task_prompt=="":
+ LLmjson=llmFacturas2(path=path,prompt=prompt,system=system,model=model)
+ else:
+ LLmjson=llmFacturas(path=path,task_prompt=task_prompt,system=system,content=content,max_tokens=max_tokens,model=model)
+ TrustedLLmjson=json.loads(TrustedLLmjson)
+ return {"content":LLmjson,"trusted":TrustedLLmjson}
+
+
+
+
+
+#EvalFacturas(path="example/Factura2.jpg",task_prompt="OCR",system="",content="Analiza el siguiente texto: %s",max_tokens=200,model="claude-sonnet")
+
+def EvalClassImage(path="/home/mario/Repositorios/EvalDataSetHugging/example/Gmail/20240530_112812.jpg",):
+    endpoint="/classificateimage"
+ jsonT={"path":path,"password":password}
+ response=requests.get(urlText+endpoint,json=jsonT)
+ print(response.content)
+
+#To Do
+def EvalGeneratedText(prompt="",model="",):
+ pass
+
+def EvalGenerateVoice():
+ def GenerateVoice():
+ pass
+ def Voice2txt():
+ pass
+
+
+