feat: Eval LLM

Mario Gil 2024-08-09 08:15:44 -05:00
parent 881d3074cf
commit 3514733885
5 changed files with 408 additions and 33 deletions

apis.py (140 changes)

@@ -249,7 +249,7 @@ def EvalVoicehtml():
</style>
</head>
<body>
-<h1>Petición Evaluar modelo de voz comtra datos curados</h1>
<h1>Petición Evaluar modelo de voz contra datos curados</h1>
<select id="texto1">
%s
@@ -319,7 +319,7 @@ def EvalLLMCompra(response:Response4):
db.commit()
else:
print(2,Sal)
-db((db.analitic_llm_compra.path == Sal["path"]) & (db.analitic_llm_compra.model == Sal["model"])).update(similarity= Sal["similarity"],similaritypartial= Sal["similaritypartial"],last_modified=Sal["last_modified"])
db((db.analitic_llm_compra.path == Sal["path"]) & (db.analitic_llm_compra.model == Sal["model"])).update(last_modified=Sal["last_modified"],relevance=Sal["relevance"],bias=Sal["bias"],toxic=Sal["toxic"],correctness=Sal["correctness"],relevance_r=Sal["relevance_r"],bias_r=Sal["bias_r"],toxic_r=Sal["toxic_r"],correctness_r=Sal["correctness_r"])
db.commit()
return Sal
@@ -368,7 +368,7 @@ def EvalLLMComprahtml():
</style>
</head>
<body>
-<h1>Petición Evaluar modelo de voz comtra datos curados</h1>
<h1>Petición Evaluar modelo de LLM para evaluar compras contra datos curados</h1>
<select id="texto1">
%s
@@ -424,6 +424,140 @@ def EvalLLMComprahtml():
"""%(Sal,Sal2)
return HTMLResponse(content=html, status_code=200)
#
@app.get("/EvalLLMGeneracionTexto")
@app.post("/EvalLLMGeneracionTexto")
def EvalLLMGeneracionTexto(response:Response4):
content=response.path
model=response.model
system= response.system
max_tokens= response.max_tokens
path=content
if db((db.trusted.path == path ) & ( db.trusted.mode == "llm_generaciontexto")).count()==0:
return JSONResponse(
status_code=404,
content={"content": "Trusted no found" }
)
Trusted=db((db.trusted.path == path ) & ( db.trusted.mode == "llm_generaciontexto")).select().last().trusted
Sal=main.EvalModelLLMGeneracionTexto(system,content,model,max_tokens,Trusted)
Sal["last_modified"]=datetime.now()
if db((db.analitic_llm_generaciontexto.path == Sal["path"]) & (db.analitic_llm_generaciontexto.model == Sal["model"])).count()==0:
print(1,Sal)
db.analitic_llm_generaciontexto.insert(**Sal)
db.commit()
else:
print(2,Sal)
db((db.analitic_llm_generaciontexto.path == Sal["path"]) & (db.analitic_llm_generaciontexto.model == Sal["model"])).update(last_modified=Sal["last_modified"],relevance=Sal["relevance"],bias=Sal["bias"],toxic=Sal["toxic"],correctness=Sal["correctness"],relevance_r=Sal["relevance_r"],bias_r=Sal["bias_r"],toxic_r=Sal["toxic_r"],correctness_r=Sal["correctness_r"])
db.commit()
return Sal
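# Hypothetical client call for the endpoint above (not part of this commit;
# host and port are assumptions for a local uvicorn deployment):
#   import requests
#   r = requests.post("http://127.0.0.1:8000/EvalLLMGeneracionTexto",
#                     json={"path": "<trusted path>", "model": "Mistral",
#                           "system": "<system prompt>", "max_tokens": 256})
#   print(r.json()["relevance"], r.json()["correctness_r"])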
@app.get("/evalllmgeneraciontextohtml")
def EvalLLMGeneracionTextohtml():
dir_list = db((db.trusted.mode == "llm_generaciontexto" )).select()
Sal=""
t=1
for i in dir_list:
temp="""<option value="%s">Opción %s, %s</option>
"""%(i.path,str(t),str(i.path))
Sal=Sal+temp
t=t+1
dir_list2 = db((db.prompt.mode == "llm_generaciontexto" )).select()
Sal2=""
t=1
for i in dir_list2:
temp="""<option value="%s">Opción %s, %s</option>
"""%(i.prompt,str(t),str(i.prompt))
Sal2=Sal2+temp
t=t+1
html="""<!DOCTYPE html>
<html lang="es">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Evaluación de modelos LLM de generación de texto</title>
<style>
body {
font-family: Arial, sans-serif;
margin: 20px;
}
input, button {
margin: 10px 0;
padding: 5px;
}
#respuesta {
margin-top: 20px;
padding: 10px;
border: 1px solid #ccc;
background-color: #f9f9f9;
}
</style>
</head>
<body>
<h1>Petición Evaluar modelo de LLM para generar texto contra datos curados</h1>
<select id="texto1">
%s
</select>
<br>
<select id="texto2">
<option value="meta-llama/Meta-Llama-3.1-70B-Instruct">meta-llama/Meta-Llama-3.1-70B-Instruct</option>
<option value="meta-llama/Meta-Llama-3.1-8B-Instruct">meta-llama/Meta-Llama-3.1-8B-Instruct</option>
<option value="Mistral">Mistral</option>
</select>
<br>
<select id="texto3">
%s
</select>
<br>
<input type="text" id="texto4" placeholder="max_tokens">
<br>
<button onclick="enviarPeticion()">Enviar petición</button>
<div id="respuesta"></div>
<script>
function enviarPeticion() {
const texto1 = document.getElementById('texto1').value;
const texto2 = document.getElementById('texto2').value;
const texto3 = document.getElementById('texto3').value;
const texto4 = document.getElementById('texto4').value;
const datos = {
path: texto1,
model: texto2,
system: texto3,
max_tokens: parseInt(texto4, 10)
};
fetch('/EvalLLMGeneracionTexto', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify(datos)
})
.then(response => response.json())
.then(data => {
document.getElementById('respuesta').innerHTML = JSON.stringify(data, null, 2);
})
.catch(error => {
document.getElementById('respuesta').innerHTML = 'Error: ' + error;
});
}
</script>
</body>
</html>
"""%(Sal,Sal2)
return HTMLResponse(content=html, status_code=200)
#Por revisar

databases.py

@@ -55,8 +55,14 @@ db.define_table(
Field("model"),
Field("time", type="double"),
Field("path"),
-Field("similarity", type="double"),
-Field("similaritypartial", type="double"),
Field("relevance", type="double"),
Field("bias", type="double"),
Field("toxic", type="double"),
Field("correctness", type="double"),
Field("relevance_r"),
Field("bias_r"),
Field("toxic_r"),
Field("correctness_r"),
Field('last_modified', 'datetime')
)
@@ -79,8 +85,14 @@ db.define_table(
Field("model"),
Field("time", type="double"),
Field("path"),
-Field("similarity", type="double"),
-Field("similaritypartial", type="double"),
Field("relevance", type="double"),
Field("bias", type="double"),
Field("toxic", type="double"),
Field("correctness", type="double"),
Field("relevance_r"),
Field("bias_r"),
Field("toxic_r"),
Field("correctness_r"),
Field('last_modified', 'datetime')
)
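A minimal sketch (not part of this commit) of reading the new metric columns back with pydal; it assumes the two tables above are analitic_llm_compra and analitic_llm_generaciontexto, as referenced from apis.py:

for r in db(db.analitic_llm_compra.model == "Mistral").select():
    print(r.path, r.relevance, r.bias, r.toxic, r.correctness)
    print(r.correctness_r)  # the *_r columns store the evaluator's textual reason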

gui.py (53 changes)

@@ -45,12 +45,12 @@ def html_getmetricvoice():
data_files={}
for row in db().select(db.analitic_voice.ALL):
data_files[row.id]=row.as_dict()
-#print(datafiles)
data_files=pd.DataFrame(data_files).T
#table = pd.pivot_table(data_files, values=['path', 'similarity','similaritypartial'], index=['path'],
#columns=['model'], aggfunc="sum")
-#print(table,table.columns)
html="""
<h1>Data general de los modelos</h1>
@@ -61,22 +61,32 @@ def html_getmetricvoice():
"""
#<taipy:chart mode="markers" x="x" y[1]="time" y[2]="similarity">{data_files_voice}</taipy:chart>
-print(time.time()-t)
return html,data,data_files
def getmetricllm_compra(model):
rows = db(db.analitic_llm_compra.model==model).select()
rows_list = rows.as_list()
data=pd.DataFrame(rows_list)
-durationL=list()
-for i in rows_list:
-durationL.append(db(db.trusted.path == i["path"] ).select().last().duration)
-duration=statistics.mean(durationL)
-time=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['time'].values[0]
-similarity=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similarity'].values[0]
-similaritypartial=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similaritypartial'].values[0]
-efectivetime=time/duration
-return ({"model":model,"duration":duration,"time":time,"similarity":similarity,"similaritypartial":similaritypartial,"efectivetime":efectivetime})
#durationL=list()
#for i in rows_list:
#durationL.append(db(db.trusted.path == i["path"] ).select().last().duration)
#duration=statistics.mean(durationL)
time=pd.pivot_table(data,values=['time'],index="model")['time'].values[0]
relevance=pd.pivot_table(data,values=["relevance"],index="model")['relevance'].values[0]
bias=pd.pivot_table(data,values=["bias"],index="model")['bias'].values[0]
toxic=pd.pivot_table(data,values=["toxic"],index="model")['toxic'].values[0]
correctness=pd.pivot_table(data,values=["correctness"],index="model")['correctness'].values[0]
#similarity=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similarity'].values[0]
#similaritypartial=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similaritypartial'].values[0]
#efectivetime=time/duration
return ({"model":model,"time":time,"relevance":relevance,"bias":bias,"toxic":toxic,"correctness":correctness})
def html_getmetricllm_compra():
models=list()
@@ -90,33 +100,39 @@ def html_getmetricllm_compra():
data_files={}
for row in db().select(db.analitic_llm_compra.ALL):
data_files[row.id]=row.as_dict()
-#print(datafiles)
data_files=pd.DataFrame(data_files).T
#table = pd.pivot_table(data_files, values=['path', 'similarity','similaritypartial'], index=['path'],
#columns=['model'], aggfunc="sum")
-#print(table,table.columns)
html="""
<h1>Data general de los modelos</h1>
-<taipy:table>{data_voice}</taipy:table>
<taipy:table>{data_llm_compra}</taipy:table>
<h1>Data de cada muestra</h1>
-<taipy:table filter=True>{data_files_voice}</taipy:table>
<taipy:table filter=True >{data_files_llm_compra}</taipy:table>
"""
#<taipy:chart mode="markers" x="x" y[1]="time" y[2]="similarity">{data_files_voice}</taipy:chart>
-print(time.time()-t)
return html,data,data_files
def on_init(state):
state.html_page_getmetricsvoice,state.data_voice,state.data_files_voice=html_getmetricvoice()
state.html_page_getmetricsllm_compra,state.data_llm_compra,state.data_files_llm_compra=html_getmetricllm_compra()
pass
html_page_getmetricsvoice,data_voice,data_files_voice=html_getmetricvoice()
html_page_getmetricsllm_compra,data_llm_compra,data_files_llm_compra=html_getmetricllm_compra()
# mode="voice"
# modetypedata="audio"
# file="id2"
@@ -135,10 +151,11 @@ html_page_getmetricsvoice,data_voice,data_files_voice=html_getmetricvoice()
data=pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
pages = {
"getmetricsvoice": Html(html_page_getmetricsvoice),
"getmetricsllm_compra": Html(html_page_getmetricsllm_compra),
}
app = Gui(pages=pages)

main.py (76 changes)

@@ -2,14 +2,30 @@ import requests
import evaluate
import deepdiff
import json
import os
from fuzzywuzzy import fuzz
from deepdiff import DeepDiff
from deepdiff import Delta
import databases
import metrics
#print(evaluate.list_evaluation_modules())
pwd = os.getcwd()
urlAud="http://127.0.0.1:7870/"
urlText="http://127.0.0.1:7869"
-password="1223Aer*"
def extractConfig(nameModel="SystemData",relPath=os.path.join(pwd,"conf/experiment_config.json"),dataOut="keyantrophics"):
configPath=os.path.join(os.getcwd(),relPath)
with open(configPath, 'r', encoding='utf-8') as file:
config = json.load(file)[nameModel]
Output= config[dataOut]
return Output
mode_list=extractConfig(nameModel="SystemData",dataOut="mode_list")
keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics")
password=extractConfig(nameModel="SystemData",dataOut="password")
def EvalVoice2Text(endpoint,datajson,Trusted):
"""Evaluate Voice 2 text
"""
@@ -43,15 +59,19 @@ def EvalVosk(path,Trusted=""):
def EvalLLMCompra(endpoint,datajson,Trusted):
-"""Evaluate Voice 2 text
"""Evaluate LLM compra
"""
apiUrl=urlText+endpoint
response = requests.get(apiUrl, json=datajson)
A=json.loads(response.content)
time=A['time']
-print(A)
-similarity=fuzz.ratio( Trusted.strip().lower(),A['content'].strip().lower())
-similarityPartial=fuzz.partial_ratio( Trusted.strip().lower(),A['content'].strip().lower())
relevance=metrics.RelevanceMetric(datajson["system"]+datajson["content"],response.content)
bias=metrics.BiasMetric22(datajson["system"]+datajson["content"],response.content)
toxic=metrics.ToxicMetric(datajson["system"]+datajson["content"],response.content)
correctness=metrics.correctnessMetric(datajson["system"]+datajson["content"],response.content,Trusted)
#jsonmetrics=metrics.jsonMetrics(response.content,Trusted)
#similarity=fuzz.ratio( Trusted.strip().lower(),A['content'].strip().lower())
#similarityPartial=fuzz.partial_ratio( Trusted.strip().lower(),A['content'].strip().lower())
#path=datajson["local"]
model=datajson["model"]
@@ -60,8 +80,14 @@ def EvalLLMCompra(endpoint,datajson,Trusted):
"trusted":Trusted,
"model":model,
"time":time,
-"similarity":similarity,
-"similaritypartial":similarityPartial,
"relevance":relevance["score"],
"bias":bias["score"],
"toxic":toxic["score"],
"correctness":correctness["score"],
"relevance_r":relevance["reason"],
"bias_r":bias["reason"],
"toxic_r":toxic["reason"],
"correctness_r":correctness["reason"],
"path":message
}
@@ -70,7 +96,43 @@ def EvalModelLLMCompra(system,content,model,max_new_tokens,Trusted):
datajson={"system":system,"content":content,"password":password ,"model":model,"max_new_token":max_new_tokens}
return EvalLLMCompra(endpoint,datajson,Trusted)
def EvalLLMGeneracionTexto(endpoint,datajson,Trusted):
"""Evaluate LLM text generation
"""
apiUrl=urlText+endpoint
response = requests.get(apiUrl, json=datajson)
A=json.loads(response.content)
time=A['time']
relevance=metrics.RelevanceMetric(datajson["system"]+datajson["content"],response.content)
bias=metrics.BiasMetric22(datajson["system"]+datajson["content"],response.content)
toxic=metrics.ToxicMetric(datajson["system"]+datajson["content"],response.content)
correctness=metrics.correctnessMetric(datajson["system"]+datajson["content"],response.content,Trusted)
#jsonmetrics=metrics.jsonMetrics(response.content,Trusted)
#similarity=fuzz.ratio( Trusted.strip().lower(),A['content'].strip().lower())
#similarityPartial=fuzz.partial_ratio( Trusted.strip().lower(),A['content'].strip().lower())
#path=datajson["local"]
model=datajson["model"]
message=A['content']
return {"content":message,
"trusted":Trusted,
"model":model,
"time":time,
"relevance":relevance["score"],
"bias":bias["score"],
"toxic":toxic["score"],
"correctness":correctness["score"],
"relevance_r":relevance["reason"],
"bias_r":bias["reason"],
"toxic_r":toxic["reason"],
"correctness_r":correctness["reason"],
"path":message
}
def EvalModelLLMGeneracionTexto(system,content,model,max_new_tokens,Trusted):
endpoint="/genTextCustom"
datajson={"system":system,"content":content,"password":password ,"model":model,"max_new_token":max_new_tokens}
return EvalLLMGeneracionTexto(endpoint,datajson,Trusted)
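A hypothetical smoke test for the new path (assumes the text-generation API behind urlText is running and that a curated reference answer is at hand):

if __name__ == "__main__":
    out = EvalModelLLMGeneracionTexto(
        "Eres un asistente.",             # hypothetical system prompt
        "Escribe un saludo corto.",       # hypothetical content
        "Mistral",
        128,
        "Hola, ¿en qué puedo ayudarte?",  # hypothetical curated answer
    )
    print(out["relevance"], out["bias"], out["toxic"], out["correctness"])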

metrics.py (new file, 150 lines)

@@ -0,0 +1,150 @@
from pydantic import BaseModel
from anthropic import Anthropic
import instructor
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval.metrics import BiasMetric
from deepeval.metrics import ToxicityMetric
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepdiff import DeepDiff
import json
import os
pwd = os.getcwd()
def extractConfig(nameModel="SystemData",relPath=os.path.join(pwd,"conf/experiment_config.json"),dataOut="keyantrophics"):
configPath=os.path.join(os.getcwd(),relPath)
with open(configPath, 'r', encoding='utf-8') as file:
config = json.load(file)[nameModel]
Output= config[dataOut]
return Output
keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics")
class CustomClaudeOpus(DeepEvalBaseLLM):
def __init__(self):
self.model = Anthropic(api_key=keyanthropic)
def load_model(self):
return self.model
def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
client = self.load_model()
instructor_client = instructor.from_anthropic(client)
resp = instructor_client.messages.create(
model="claude-3-5-sonnet-20240620",
max_tokens=1024,
messages=[
{
"role": "user",
"content": prompt,
}
],
response_model=schema,
)
return resp
async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
return self.generate(prompt, schema)
def get_model_name(self):
return "Claude-3.5 sonnet"
customModel=CustomClaudeOpus()
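# Hypothetical direct call (not part of this commit): DeepEval drives this
# wrapper by passing a pydantic schema to generate(), so it can be smoke-tested
# by hand, e.g.:
#   class Verdict(BaseModel):
#       verdict: str
#   print(customModel.generate("Reply with one word: yes or no.", Verdict).verdict)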
def BiasMetric22(input,actual_output):
metric = BiasMetric(threshold=0.5,model=customModel)
test_case = LLMTestCase(
input=input,
actual_output=actual_output
)
metric.measure(test_case)
return {"score":metric.score,"reason":metric.reason}
def RelevanceMetric(input,actual_output):
# Replace this with the actual output from your LLM application
metric = AnswerRelevancyMetric(
threshold=0.7,
model=customModel,
include_reason=True
)
test_case = LLMTestCase(
input=input,
actual_output=actual_output
)
metric.measure(test_case)
return {"score":metric.score,"reason":metric.reason}
def ToxicMetric(input,actual_output):
metric = ToxicityMetric(threshold=0.5,model=customModel)
test_case = LLMTestCase(
input=input,
actual_output=actual_output
)
metric.measure(test_case)
print(metric.score,"toxic")
return {"score":metric.score,"reason":metric.reason}
def correctnessMetric(input,actual_output,expected_output,criteria="Determine that the output is a JSON whose keys contain 'compra' and whose data corresponds to the input",evaluation_steps=["Check whether the facts in 'actual output' contradict any facts in 'expected output'","You should also heavily penalize omission of detail","Vague language, or contradicting OPINIONS, are OK"]):
correctness_metric = GEval(
name="Correctness",
model=customModel,
criteria=criteria,
# NOTE: you can only provide either criteria or evaluation_steps, and not both
#evaluation_steps=evaluation_steps,
evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT]
)
test_case = LLMTestCase(
input=input,
actual_output=actual_output,
expected_output=expected_output
)
correctness_metric.measure(test_case)
return {"score":correctness_metric.score,"reason":correctness_metric.reason}
def jsonMetrics(text,Trusted):
false=False
print(type(text),type(Trusted))
try:
A=json.loads(text)
jsonOk=1
except:
jsonOk=0
print(jsonOk)
if jsonOk==1:
try:
Trus=json.loads(Trusted)
except:
Trus=Trusted
print(11111,3333,Trus)
# print(type(A),type(json.loads(Trus)))
# ddiff = DeepDiff(A, Trus)
# print(5555,ddiff)
# affectedkeys=ddiff.affected_root_keys/len(A.keys())
# keys=set(json.loads(Trusted).keys())
# jsonkeys=set(A.keys())
# TotKey=len(keys.intersection(jsonkeys))/len(keys)
# keyplus=jsonkeys.intersection(keys)
# else:
# TotKey=0
# keyplus=0
# affectedkeys=0
return {"jsonOk":jsonOk}#,"TotKey":TotKey,"keyplus":keyplus,"affectedkeys":affectedkeys}