feat: Eval LLM
commit 3514733885 (parent 881d3074cf)

apis.py (140 changed lines)
@@ -249,7 +249,7 @@ def EvalVoicehtml():
 </style>
 </head>
 <body>
-    <h1>Petición Evaluar modelo de voz comtra datos curados</h1>
+    <h1>Petición Evaluar modelo de voz contra datos curados</h1>

     <select id="texto1">
     %s
@@ -319,7 +319,7 @@ def EvalLLMCompra(response:Response4):
         db.commit()
     else:
         print(2,Sal)
-        db((db.analitic_llm_compra.path == Sal["path"]) & (db.analitic_llm_compra.model == Sal["model"])).update(similarity= Sal["similarity"],similaritypartial= Sal["similaritypartial"],last_modified=Sal["last_modified"])
+        db((db.analitic_llm_compra.path == Sal["path"]) & (db.analitic_llm_compra.model == Sal["model"])).update(last_modified=Sal["last_modified"],relevance=Sal["relevance"],bias=Sal["bias"],toxic=Sal["toxic"],correctness=Sal["correctness"],relevance_r=Sal["relevance_r"],bias_r=Sal["bias_r"],toxic_r=Sal["toxic_r"],correctness_r=Sal["correctness_r"])
         db.commit()
     return Sal
@@ -368,7 +368,7 @@ def EvalLLMComprahtml():
 </style>
 </head>
 <body>
-    <h1>Petición Evaluar modelo de voz comtra datos curados</h1>
+    <h1>Petición Evaluar modelo de LLM para evaluar compras contra datos curados</h1>

     <select id="texto1">
     %s
@@ -424,6 +424,140 @@ def EvalLLMComprahtml():
     """%(Sal,Sal2)
     return HTMLResponse(content=html, status_code=200)

+#
+@app.get("/EvalLLMGeneracionTexto")
+@app.post("/EvalLLMGeneracionTexto")
+def EvalLLMGeneracionTexto(response:Response4):
+    content=response.path
+    model=response.model
+    system= response.system
+    max_tokens= response.max_tokens
+    path=content
+
+    if db((db.trusted.path == path ) & ( db.trusted.mode == "llm_generaciontexto")).count()==0:
+        return JSONResponse(
+            status_code=404,
+            content={"content": "Trusted no found" }
+        )
+
+    Trusted=db((db.trusted.path == path ) & ( db.trusted.mode == "llm_generaciontexto")).select().last().trusted
+    Sal=main.EvalModelLLMCompra(system,content,model,max_tokens,Trusted)
+    Sal["last_modified"]=datetime.now()
+    if db((db.analitic_llm_generaciontexto.path == Sal["path"]) & (db.analitic_llm_generaciontexto.model == Sal["model"])).count()==0:
+        print(1,Sal)
+        db.analitic_llm_generaciontexto.insert(**Sal)
+        db.commit()
+    else:
+        print(2,Sal)
+        db((db.analitic_llm_generaciontexto.path == Sal["path"]) & (db.analitic_llm_generaciontexto.model == Sal["model"])).update(last_modified=Sal["last_modified"],relevance=Sal["relevance"],bias=Sal["bias"],toxic=Sal["toxic"],correctness=Sal["correctness"],relevance_r=Sal["relevance_r"],bias_r=Sal["bias_r"],toxic_r=Sal["toxic_r"],correctness_r=Sal["correctness_r"])
+        db.commit()
+    return Sal
+
+@app.get("/evalllmgeneraciontextohtml")
+def EvalLLMGeneracionTextohtml():
+    dir_list = db((db.trusted.mode == "llm_generaciontexto" )).select()
+    Sal=""
+    t=1
+    for i in dir_list:
+        temp="""<option value="%s">Opción %s, %s</option>
+        """%(i.path,str(t),str(i.path))
+        Sal=Sal+temp
+        t=t+1
+
+    dir_list2 = db((db.prompt.mode == "llm_generaciontexto" )).select()
+    Sal2=""
+    t=1
+    for i in dir_list2:
+        temp="""<option value="%s">Opción %s, %s</option>
+        """%(i.prompt,str(t),str(i.prompt))
+        Sal2=Sal2+temp
+        t=t+1
+
+    html="""<!DOCTYPE html>
+<html lang="es">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Evaluacion de modelos voice2txt</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            margin: 20px;
+        }
+        input, button {
+            margin: 10px 0;
+            padding: 5px;
+        }
+        #respuesta {
+            margin-top: 20px;
+            padding: 10px;
+            border: 1px solid #ccc;
+            background-color: #f9f9f9;
+        }
+    </style>
+</head>
+<body>
+    <h1>Petición Evaluar modelo de LLM para generar texto contra datos curados</h1>
+
+    <select id="texto1">
+    %s
+    </select>
+
+    <br>
+    <select id="texto2">
+        <option value="meta-llama/Meta-Llama-3.1-70B-Instruct">meta-llama/Meta-Llama-3.1-70B-Instruct</option>
+        <option value="meta-llama/Meta-Llama-3.1-8B-Instruct">meta-llama/Meta-Llama-3.1-8B-Instruct</option>
+        <option value="Mistral">Mistral</option>
+    </select>
+    <br>
+    <select id="texto3">
+    %s
+    </select>
+    <br>
+    <input type="text" id="texto4" placeholder="max_tokens">
+    <br>
+    <button onclick="enviarPeticion()">Enviar petición</button>
+    <div id="respuesta"></div>
+
+    <script>
+        function enviarPeticion() {
+            const texto1 = document.getElementById('texto1').value;
+            const texto2 = document.getElementById('texto2').value;
+            const texto3 = document.getElementById('texto3').value;
+            const datos = {
+                path: texto1,
+                model: texto2,
+                system: texto3
+            };
+
+            fetch('/EvalLLMGeneracionTexto', {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json'
+                },
+                body: JSON.stringify(datos)
+            })
+            .then(response => response.json())
+            .then(data => {
+                document.getElementById('respuesta').innerHTML = JSON.stringify(data, null, 2);
+            })
+            .catch(error => {
+                document.getElementById('respuesta').innerHTML = 'Error: ' + error;
+            });
+        }
+    </script>
+</body>
+</html>
+"""%(Sal,Sal2)
+    return HTMLResponse(content=html, status_code=200)
+
+
 #Por revisar
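Editor's note: the new /EvalLLMGeneracionTexto route mirrors /EvalLLMCompra: it looks up the trusted record for the requested path, scores the generation, and inserts or updates a row in analitic_llm_generaciontexto. As committed, the handler delegates to main.EvalModelLLMCompra, and the HTML form above never sends max_tokens. A minimal client sketch, not part of the commit; the host and port are assumptions:

    # client_sketch.py (hypothetical): exercise the new evaluation endpoint
    import requests

    payload = {
        "path": "compra_001",                              # placeholder trusted path
        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "system": "Genera el texto solicitado.",           # placeholder system prompt
        "max_tokens": 256,
    }
    r = requests.post("http://127.0.0.1:8000/EvalLLMGeneracionTexto", json=payload)
    # Returns 404 with {"content": "Trusted no found"} when no trusted row matches.
    print(r.status_code, r.json())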
databases.py (22 changed lines)
@@ -55,8 +55,14 @@ db.define_table(
     Field("model"),
     Field("time", type="double"),
     Field("path"),
     Field("similarity", type="double"),
     Field("similaritypartial", type="double"),
+    Field("relevance", type="double"),
+    Field("bias", type="double"),
+    Field("toxic", type="double"),
+    Field("correctness", type="double"),
+    Field("relevance_r"),
+    Field("bias_r"),
+    Field("toxic_r"),
+    Field("correctness_r"),
     Field('last_modified', 'datetime')
 )
@@ -79,9 +85,15 @@ db.define_table(
     Field("model"),
     Field("time", type="double"),
     Field("path"),
     Field("similarity", type="double"),
     Field("similaritypartial", type="double"),
-    Field('last_modified', 'datetime')
+    Field("relevance", type="double"),
+    Field("bias", type="double"),
+    Field("toxic", type="double"),
+    Field("correctness", type="double"),
+    Field("relevance_r"),
+    Field("bias_r"),
+    Field("toxic_r"),
+    Field("correctness_r"),
+    Field('last_modified', 'datetime')
 )

 db.define_table(
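Editor's note: with these columns, both analytics tables store four numeric scores (relevance, bias, toxic, correctness, all type="double") alongside the judge's textual justifications in the *_r fields. A sketch, not in the commit, of reading them back through pydal; the model name is a placeholder:

    from databases import db  # assumes databases.py exposes the DAL instance as `db`

    rows = db(db.analitic_llm_generaciontexto.model == "Mistral").select()
    for r in rows:
        # numeric scores plus the judge's written reason for the correctness score
        print(r.path, r.relevance, r.bias, r.toxic, r.correctness, r.correctness_r)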
gui.py (53 changed lines)
@@ -45,12 +45,12 @@ def html_getmetricvoice():
     data_files={}
     for row in db().select(db.analitic_voice.ALL):
         data_files[row.id]=row.as_dict()
     #print(datafiles)

     data_files=pd.DataFrame(data_files).T

     #table = pd.pivot_table(data_files, values=['path', 'similarity','similaritypartial'], index=['path'],
     #columns=['model'], aggfunc="sum")
     #print(table,table.columns)

     html="""
     <h1>Data general de los modelos</h1>
@@ -61,22 +61,32 @@ def html_getmetricvoice():
     """
     #<taipy:chart mode="markers" x="x" y[1]="time" y[2]="similarity">{data_files_voice}</taipy:chart>
     print(time.time()-t)

     return html,data,data_files


 def getmetricllm_compra(model):
     rows = db(db.analitic_llm_compra.model==model).select()
     rows_list = rows.as_list()
     data=pd.DataFrame(rows_list)
-    durationL=list()
-    for i in rows_list:
-        durationL.append(db(db.trusted.path == i["path"] ).select().last().duration)
-    duration=statistics.mean(durationL)
-    time=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['time'].values[0]
-    similarity=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similarity'].values[0]
-    similaritypartial=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similaritypartial'].values[0]
-    efectivetime=time/duration
-    return ({"model":model,"duration":duration,"time":time,"similarity":similarity,"similaritypartial":similaritypartial,"efectivetime":efectivetime})
+    #durationL=list()
+    #for i in rows_list:
+    #    durationL.append(db(db.trusted.path == i["path"] ).select().last().duration)
+    #duration=statistics.mean(durationL)
+    time=pd.pivot_table(data,values=['time'],index="model")['time'].values[0]
+    relevance=pd.pivot_table(data,values=["relevance"],index="model")['relevance'].values[0]
+    bias=pd.pivot_table(data,values=["bias"],index="model")['bias'].values[0]
+    toxic=pd.pivot_table(data,values=["toxic"],index="model")['toxic'].values[0]
+
+    correctness=pd.pivot_table(data,values=["correctness"],index="model")['correctness'].values[0]
+    #similarity=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similarity'].values[0]
+    #similaritypartial=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similaritypartial'].values[0]
+    #efectivetime=time/duration
+    return ({"model":model,"time":time,"relevance":relevance,"bias":bias,"toxic":toxic,"correctness":correctness})

 def html_getmetricllm_compra():
     models=list()
@@ -90,33 +100,39 @@ def html_getmetricllm_compra():
     data_files={}
     for row in db().select(db.analitic_llm_compra.ALL):
         data_files[row.id]=row.as_dict()
     #print(datafiles)

     data_files=pd.DataFrame(data_files).T

     #table = pd.pivot_table(data_files, values=['path', 'similarity','similaritypartial'], index=['path'],
     #columns=['model'], aggfunc="sum")
     #print(table,table.columns)

     html="""
     <h1>Data general de los modelos</h1>
-    <taipy:table>{data_voice}</taipy:table>
+    <taipy:table>{data_llm_compra}</taipy:table>
     <h1>Data de cada muestra</h1>
-    <taipy:table filter=True>{data_files_voice}</taipy:table>
+    <taipy:table filter=True >{data_files_llm_compra}</taipy:table>

     """
     #<taipy:chart mode="markers" x="x" y[1]="time" y[2]="similarity">{data_files_voice}</taipy:chart>
     print(time.time()-t)

     return html,data,data_files


 def on_init(state):
     state.html_page_getmetricsvoice,state.data_voice,state.data_files_voice=html_getmetricvoice()
+    state.html_page_getmetricsllm_compra,state.data_llm_compra,state.data_files_llm_compra=html_getmetricllm_compra()

     pass


 html_page_getmetricsvoice,data_voice,data_files_voice=html_getmetricvoice()

+html_page_getmetricsllm_compra,data_llm_compra,data_files_llm_compra=html_getmetricllm_compra()


 # mode="voice"
 # modetypedata="audio"
 # file="id2"
@@ -135,10 +151,11 @@ html_page_getmetricsvoice,data_voice,data_files_voice=html_getmetricvoice()


 data=pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})

 pages = {
     "getmetricsvoice": Html(html_page_getmetricsvoice),
+    "getmetricsllm_compra": Html(html_page_getmetricsllm_compra),
 }

 app = Gui(pages=pages)
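Editor's note: the pivot_table calls in getmetricllm_compra pass only values and an index, so pandas falls back to its default aggfunc ("mean") and each metric comes out as a per-model average. A standalone illustration, not in the commit:

    # pandas pivot_table with no aggfunc averages each value column per index
    import pandas as pd

    data = pd.DataFrame({
        "model": ["Mistral", "Mistral"],
        "time": [1.2, 1.8],
        "relevance": [0.9, 0.7],
    })
    pivot = pd.pivot_table(data, values=["time", "relevance"], index="model")
    print(pivot["time"].values[0])       # 1.5, mean of 1.2 and 1.8
    print(pivot["relevance"].values[0])  # 0.8, mean of 0.9 and 0.7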
main.py (76 changed lines)
@@ -2,14 +2,30 @@ import requests
 import evaluate
 import deepdiff
 import json
+import os

 from fuzzywuzzy import fuzz
 from deepdiff import DeepDiff
 from deepdiff import Delta
 import databases
+import metrics
 #print(evaluate.list_evaluation_modules())
+pwd = os.getcwd()
 urlAud="http://127.0.0.1:7870/"
 urlText="http://127.0.0.1:7869"
-password="1223Aer*"
+
+def extractConfig(nameModel="SystemData",relPath=os.path.join(pwd,"conf/experiment_config.json"),dataOut="keyantrophics"):
+    configPath=os.path.join(os.getcwd(),relPath)
+    with open(configPath, 'r', encoding='utf-8') as file:
+        config = json.load(file)[nameModel]
+    Output= config[dataOut]
+    return Output
+mode_list=extractConfig(nameModel="SystemData",dataOut="mode_list")
+keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics")
+password=extractConfig(nameModel="SystemData",dataOut="password")


 def EvalVoice2Text(endpoint,datajson,Trusted):
     """Evaluate Voice 2 text
     """
@@ -43,15 +59,19 @@ def EvalVosk(path,Trusted=""):


 def EvalLLMCompra(endpoint,datajson,Trusted):
-    """Evaluate Voice 2 text
+    """Evaluate LLL compra
     """
     apiUrl=urlText+endpoint
     response = requests.get(apiUrl, json=datajson)
     A=json.loads(response.content)
     time=A['time']
     print(A)
     similarity=fuzz.ratio( Trusted.strip().lower(),A['content'].strip().lower())
     similarityPartial=fuzz.partial_ratio( Trusted.strip().lower(),A['content'].strip().lower())
+    relevance=metrics.RelevanceMetric(datajson["system"]+datajson["content"],response.content)
+    bias=metrics.BiasMetric22(datajson["system"]+datajson["content"],response.content)
+    toxic=metrics.ToxicMetric(datajson["system"]+datajson["content"],response.content)
+    correctness=metrics.correctnessMetric(datajson["system"]+datajson["content"],response.content,Trusted)
     #jsonmetrics=metrics.jsonMetrics(response.content,Trusted)
     #similarity=fuzz.ratio( Trusted.strip().lower(),A['content'].strip().lower())
     #similarityPartial=fuzz.partial_ratio( Trusted.strip().lower(),A['content'].strip().lower())
     #path=datajson["local"]
     model=datajson["model"]
@@ -60,8 +80,14 @@ def EvalLLMCompra(endpoint,datajson,Trusted):
             "trusted":Trusted,
             "model":model,
             "time":time,
             "similarity":similarity,
             "similaritypartial":similarityPartial,
+            "relevance":relevance["score"],
+            "bias":bias["score"],
+            "toxic":toxic["score"],
+            "correctness":correctness["score"],
+            "relevance_r":relevance["reason"],
+            "bias_r":bias["reason"],
+            "toxic_r":toxic["reason"],
+            "correctness_r":correctness["reason"],
             "path":message
             }
@@ -70,7 +96,43 @@ def EvalModelLLMCompra(system,content,model,max_new_tokens,Trusted):
     datajson={"system":system,"content":content,"password":password ,"model":model,"max_new_token":max_new_tokens}
     return EvalLLMCompra(endpoint,datajson,Trusted)

+def EvalLLMGeneracionTexto(endpoint,datajson,Trusted):
+    """Evaluate LLL compra
+    """
+    apiUrl=urlText+endpoint
+    response = requests.get(apiUrl, json=datajson)
+    A=json.loads(response.content)
+    time=A['time']
+    relevance=metrics.RelevanceMetric(datajson["system"]+datajson["content"],response.content)
+    bias=metrics.BiasMetric22(datajson["system"]+datajson["content"],response.content)
+    toxic=metrics.ToxicMetric(datajson["system"]+datajson["content"],response.content)
+    correctness=metrics.correctnessMetric(datajson["system"]+datajson["content"],response.content,Trusted)
+    #jsonmetrics=metrics.jsonMetrics(response.content,Trusted)
+    #similarity=fuzz.ratio( Trusted.strip().lower(),A['content'].strip().lower())
+    #similarityPartial=fuzz.partial_ratio( Trusted.strip().lower(),A['content'].strip().lower())
+    #path=datajson["local"]
+    model=datajson["model"]
+
+    message=A['content']
+    return {"content":message,
+            "trusted":Trusted,
+            "model":model,
+            "time":time,
+            "relevance":relevance["score"],
+            "bias":bias["score"],
+            "toxic":toxic["score"],
+            "correctness":correctness["score"],
+            "relevance_r":relevance["reason"],
+            "bias_r":bias["reason"],
+            "toxic_r":toxic["reason"],
+            "correctness_r":correctness["reason"],
+            "path":message
+            }
+
+def EvalModelLLMGeneracionTexto(system,content,model,max_new_tokens,Trusted):
+    endpoint="/genTextCustom"
+    datajson={"system":system,"content":content,"password":password ,"model":model,"max_new_token":max_new_tokens}
+    return EvalLLMGeneracionTexto(endpoint,datajson,Trusted)
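Editor's note: EvalModelLLMGeneracionTexto is a thin wrapper that fixes the /genTextCustom endpoint and the config-sourced password before delegating to EvalLLMGeneracionTexto. A hypothetical driver, not in the commit; the prompts and trusted reference are placeholders, and the text service behind urlText must be running:

    import main

    result = main.EvalModelLLMGeneracionTexto(
        system="Eres un asistente.",            # placeholder system prompt
        content="Resume la compra.",            # placeholder user content
        model="Mistral",
        max_new_tokens=128,
        Trusted='{"compra": "2 kg de arroz"}',  # placeholder curated answer
    )
    # scores are floats; the *_r keys carry the judge's reasoning
    print(result["relevance"], result["correctness_r"])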
metrics.py (new file, 150 lines; filename inferred from the `import metrics` calls in main.py)

@@ -0,0 +1,150 @@
+from pydantic import BaseModel
+from anthropic import Anthropic
+import instructor
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics import AnswerRelevancyMetric
+from deepeval.test_case import LLMTestCase
+from deepeval.metrics import BiasMetric
+from deepeval.metrics import ToxicityMetric
+from deepeval.metrics import GEval
+from deepeval.test_case import LLMTestCaseParams
+from deepdiff import DeepDiff
+import json
+import os
+pwd = os.getcwd()
+def extractConfig(nameModel="SystemData",relPath=os.path.join(pwd,"conf/experiment_config.json"),dataOut="keyantrophics"):
+    configPath=os.path.join(os.getcwd(),relPath)
+    with open(configPath, 'r', encoding='utf-8') as file:
+        config = json.load(file)[nameModel]
+    Output= config[dataOut]
+    return Output
+
+keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics")
+
+class CustomClaudeOpus(DeepEvalBaseLLM):
+    def __init__(self):
+        self.model = Anthropic(api_key=keyanthropic)
+
+    def load_model(self):
+        return self.model
+
+    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
+        client = self.load_model()
+        instructor_client = instructor.from_anthropic(client)
+        resp = instructor_client.messages.create(
+            model="claude-3-5-sonnet-20240620",
+            max_tokens=1024,
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt,
+                }
+            ],
+            response_model=schema,
+        )
+        return resp
+
+    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
+        return self.generate(prompt, schema)
+
+    def get_model_name(self):
+        return "Claude-3.5 sonnet"
+customModel=CustomClaudeOpus()
+
+def BiasMetric22(input,actual_output):
+    metric = BiasMetric(threshold=0.5,model=customModel)
+
+    test_case = LLMTestCase(
+        input=input,
+        actual_output=actual_output
+    )
+    metric.measure(test_case)
+    return {"score":metric.score,"reason":metric.reason}
+
+def RelevanceMetric(input,actual_output):
+    # Replace this with the actual output from your LLM application
+    metric = AnswerRelevancyMetric(
+        threshold=0.7,
+        model=customModel,
+        include_reason=True
+    )
+    test_case = LLMTestCase(
+        input=input,
+        actual_output=actual_output
+    )
+
+    metric.measure(test_case)
+    return {"score":metric.score,"reason":metric.reason}
+
+
+def ToxicMetric(input,actual_output):
+    metric = ToxicityMetric(threshold=0.5,model=customModel)
+    test_case = LLMTestCase(
+        input=input,
+        actual_output=actual_output
+    )
+    metric.measure(test_case)
+    print(metric.score,"toxic")
+    return {"score":metric.score,"reason":metric.reason}
+
+
+def correctnessMetric(input,actual_output,expected_output,criteria="Determine that the output is a json whose keys contain with compra and the data correspond to the input",evaluation_steps=["Check whether the facts in 'actual output' contradicts any facts in 'expected output'","You should also heavily penalize omission of detail","Vague language, or contradicting OPINIONS, are OK" ]):
+    correctness_metric = GEval(
+        name="Correctness",
+        model=customModel,
+        criteria=criteria,
+        # NOTE: you can only provide either criteria or evaluation_steps, and not both
+        #evaluation_steps=evaluation_steps,
+        evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT]
+    )
+    test_case = LLMTestCase(
+        input=input,
+        actual_output=actual_output,
+        expected_output=expected_output
+    )
+
+    correctness_metric.measure(test_case)
+    return {"score":correctness_metric.score,"reason":correctness_metric.reason}
+
+def jsonMetrics(text,Trusted):
+    false=False
+    print(type(text),type(Trusted))
+    try:
+        A=json.loads(text)
+        jsonOk=1
+    except:
+        jsonOk=0
+    print(jsonOk)
+    if jsonOk==1:
+
+        try:
+            Trus=json.loads(Trusted)
+        except:
+            Trus=Trusted
+        print(11111,3333,Trus)
+        # print(type(A),type(json.loads(Trus)))
+        # ddiff = DeepDiff(A, Trus)
+        # print(5555,ddiff)
+        # affectedkeys=ddiff.affected_root_keys/len(A.keys())
+        # keys=set(json.loads(Trusted).keys())
+        # jsonkeys=set(A.keys())
+        # TotKey=len(keys.intersection(jsonkeys))/len(keys)
+        # keyplus=jsonkeys.intersection(keys)
+    # else:
+        # TotKey=0
+        # keyplus=0
+        # affectedkeys=0
+
+    return {"jsonOk":jsonOk}#,"TotKey":TotKey,"keyplus":keyplus,"affectedkeys":affectedkeys}
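Editor's note: each helper in the new module wraps one deepeval metric around the custom Claude judge and returns a {"score", "reason"} dict. A sketch, not in the commit, of calling one directly; it needs a valid Anthropic key under "keyantrophics" in conf/experiment_config.json, and the strings are placeholders:

    import metrics

    out = metrics.correctnessMetric(
        input="¿Qué compró el cliente?",               # placeholder prompt
        actual_output='{"compra": "2 kg arroz"}',      # model answer under test
        expected_output='{"compra": "2 kg de arroz"}', # curated reference
    )
    print(out["score"], out["reason"])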