From 35147338850c50551afdf78d0a5340425cfcd9f8 Mon Sep 17 00:00:00 2001
From: marioggil
Date: Fri, 9 Aug 2024 08:15:44 -0500
Subject: [PATCH] feat: Eval LLM

---
 apis.py      | 140 +++++++++++++++++++++++++++++++++++++++++++++--
 databases.py |  22 ++++++--
 gui.py       |  53 +++++++++++-------
 main.py      |  76 +++++++++++++++++++++++---
 metrics.py   | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 408 insertions(+), 33 deletions(-)
 create mode 100644 metrics.py

diff --git a/apis.py b/apis.py
index e29be7d..5b53bc4 100644
--- a/apis.py
+++ b/apis.py
@@ -249,7 +249,7 @@ def EvalVoicehtml():
-

Petición Evaluar modelo de voz comtra datos curados

+

Petición Evaluar modelo de voz contra datos curados

     %s
@@ -424,6 +424,140 @@ def EvalLLMComprahtml():
     """%(Sal,Sal2)
     return HTMLResponse(content=html, status_code=200)
 
+#
+@app.get("/EvalLLMGeneracionTexto")
+@app.post("/EvalLLMGeneracionTexto")
+def EvalLLMGeneracionTexto(response:Response4):
+    content=response.path
+    model=response.model
+    system= response.system
+    max_tokens= response.max_tokens
+    path=content
+
+    if db((db.trusted.path == path ) & ( db.trusted.mode == "llm_generaciontexto")).count()==0:
+        return JSONResponse(
+            status_code=404,
+            content={"content": "Trusted not found" }
+        )
+
+    Trusted=db((db.trusted.path == path ) & ( db.trusted.mode == "llm_generaciontexto")).select().last().trusted
+    Sal=main.EvalModelLLMGeneracionTexto(system,content,model,max_tokens,Trusted)
+    Sal["last_modified"]=datetime.now()
+    if db((db.analitic_llm_generaciontexto.path == Sal["path"]) & (db.analitic_llm_generaciontexto.model == Sal["model"])).count()==0:
+        print(1,Sal)
+        db.analitic_llm_generaciontexto.insert(**Sal)
+        db.commit()
+    else:
+        print(2,Sal)
+        db((db.analitic_llm_generaciontexto.path == Sal["path"]) & (db.analitic_llm_generaciontexto.model == Sal["model"])).update(last_modified=Sal["last_modified"],relevance=Sal["relevance"],bias=Sal["bias"],toxic=Sal["toxic"],correctness=Sal["correctness"],relevance_r=Sal["relevance_r"],bias_r=Sal["bias_r"],toxic_r=Sal["toxic_r"],correctness_r=Sal["correctness_r"])
+        db.commit()
+    return Sal
+
+@app.get("/evalllmgeneraciontextohtml")
+def EvalLLMGeneracionTextohtml():
+    dir_list = db((db.trusted.mode == "llm_generaciontexto" )).select()
+    Sal=""
+    t=1
+    for i in dir_list:
+        temp="""
+        """%(i.path,str(t),str(i.path))
+        Sal=Sal+temp
+        t=t+1
+
+    dir_list2 = db((db.prompt.mode == "llm_generaciontexto" )).select()
+    Sal2=""
+    t=1
+    for i in dir_list2:
+        temp="""
+        """%(i.prompt,str(t),str(i.prompt))
+        Sal2=Sal2+temp
+        t=t+1
+
+
+    html="""
+
+
+
+
+    Evaluacion de modelos LLM generacion de texto
+
+
+
+

Petición Evaluar modelo de LLM para generar texto contra datos curados

+ + + +
+ +
+ +
+ +
+ +
+ + + + + """%(Sal,Sal2) + return HTMLResponse(content=html, status_code=200) + + + + + #Por revisar diff --git a/databases.py b/databases.py index d19b055..8fca7e7 100644 --- a/databases.py +++ b/databases.py @@ -55,8 +55,14 @@ db.define_table( Field("model"), Field("time", type="double"), Field("path"), - Field("similarity", type="double"), - Field("similaritypartial", type="double"), + Field("relevance", type="double"), + Field("bias", type="double"), + Field("toxic", type="double"), + Field("correctness", type="double"), + Field("relevance_r"), + Field("bias_r"), + Field("toxic_r"), + Field("correctness_r"), Field('last_modified', 'datetime') ) @@ -79,9 +85,15 @@ db.define_table( Field("model"), Field("time", type="double"), Field("path"), - Field("similarity", type="double"), - Field("similaritypartial", type="double"), - Field('last_modified', 'datetime') + Field("relevance", type="double"), + Field("bias", type="double"), + Field("toxic", type="double"), + Field("correctness", type="double"), + Field("relevance_r"), + Field("bias_r"), + Field("toxic_r"), + Field("correctness_r"), + Field('last_modified', 'datetime') ) db.define_table( diff --git a/gui.py b/gui.py index eeb8233..01c9e86 100644 --- a/gui.py +++ b/gui.py @@ -45,12 +45,12 @@ def html_getmetricvoice(): data_files={} for row in db().select(db.analitic_voice.ALL): data_files[row.id]=row.as_dict() - #print(datafiles) + data_files=pd.DataFrame(data_files).T #table = pd.pivot_table(data_files, values=['path', 'similarity','similaritypartial'], index=['path'], #columns=['model'], aggfunc="sum") - #print(table,table.columns) + html="""

Data general de los modelos

@@ -61,22 +61,32 @@ def html_getmetricvoice(): """ #{data_files_voice} - print(time.time()-t) + return html,data,data_files + + + + def getmetricllm_compra(model): rows = db(db.analitic_llm_compra.model==model).select() rows_list = rows.as_list() data=pd.DataFrame(rows_list) - durationL=list() - for i in rows_list: - durationL.append(db(db.trusted.path == i["path"] ).select().last().duration) - duration=statistics.mean(durationL) - time=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['time'].values[0] - similarity=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similarity'].values[0] - similaritypartial=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similaritypartial'].values[0] - efectivetime=time/duration - return ({"model":model,"duration":duration,"time":time,"similarity":similarity,"similaritypartial":similaritypartial,"efectivetime":efectivetime}) + + #durationL=list() + #for i in rows_list: + #durationL.append(db(db.trusted.path == i["path"] ).select().last().duration) + #duration=statistics.mean(durationL) + time=pd.pivot_table(data,values=['time'],index="model")['time'].values[0] + relevance=pd.pivot_table(data,values=["relevance"],index="model")['relevance'].values[0] + bias=pd.pivot_table(data,values=["bias"],index="model")['bias'].values[0] + toxic=pd.pivot_table(data,values=["toxic"],index="model")['toxic'].values[0] + + correctness=pd.pivot_table(data,values=["correctness"],index="model")['correctness'].values[0] + #similarity=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similarity'].values[0] + #similaritypartial=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similaritypartial'].values[0] + #efectivetime=time/duration + return ({"model":model,"time":time,"relevance":relevance,"bias":bias,"toxic":toxic,"correctness":correctness}) def html_getmetricllm_compra(): models=list() @@ -90,33 +100,39 @@ def html_getmetricllm_compra(): data_files={} for row in db().select(db.analitic_llm_compra.ALL): data_files[row.id]=row.as_dict() - #print(datafiles) + data_files=pd.DataFrame(data_files).T #table = pd.pivot_table(data_files, values=['path', 'similarity','similaritypartial'], index=['path'], #columns=['model'], aggfunc="sum") - #print(table,table.columns) + html="""

Data general de los modelos

-        {data_voice}
+        {data_llm_compra}

Data de cada muestra

- {data_files_voice} + {data_files_llm_compra} """ #{data_files_voice} - print(time.time()-t) + return html,data,data_files def on_init(state): state.html_page_getmetricsvoice,state.data_voice,state.data_files_voice=html_getmetricvoice() + state.html_page_getmetricsllm_compra,state.data_llm_compra,state.data_files_llm_compra=html_getmetricllm_compra() + pass html_page_getmetricsvoice,data_voice,data_files_voice=html_getmetricvoice() + +html_page_getmetricsllm_compra,data_llm_compra,data_files_llm_compra=html_getmetricllm_compra() + + # mode="voice" # modetypedata="audio" # file="id2" @@ -135,10 +151,11 @@ html_page_getmetricsvoice,data_voice,data_files_voice=html_getmetricvoice() -data=pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + pages = { "getmetricsvoice": Html(html_page_getmetricsvoice), + "getmetricsllm_compra": Html(html_page_getmetricsllm_compra), } app = Gui(pages=pages) diff --git a/main.py b/main.py index 47aef3a..38c2c38 100644 --- a/main.py +++ b/main.py @@ -2,14 +2,30 @@ import requests import evaluate import deepdiff import json +import os + from fuzzywuzzy import fuzz from deepdiff import DeepDiff from deepdiff import Delta import databases +import metrics #print(evaluate.list_evaluation_modules()) +pwd = os.getcwd() urlAud="http://127.0.0.1:7870/" urlText="http://127.0.0.1:7869" -password="1223Aer*" + + +def extractConfig(nameModel="SystemData",relPath=os.path.join(pwd,"conf/experiment_config.json"),dataOut="keyantrophics"): + configPath=os.path.join(os.getcwd(),relPath) + with open(configPath, 'r', encoding='utf-8') as file: + config = json.load(file)[nameModel] + Output= config[dataOut] + return Output +mode_list=extractConfig(nameModel="SystemData",dataOut="mode_list") +keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics") +password=extractConfig(nameModel="SystemData",dataOut="password") + + def EvalVoice2Text(endpoint,datajson,Trusted): """Evaluate Voice 2 text """ @@ -43,15 +59,19 @@ def EvalVosk(path,Trusted=""): def EvalLLMCompra(endpoint,datajson,Trusted): - """Evaluate Voice 2 text + """Evaluate LLL compra """ apiUrl=urlText+endpoint response = requests.get(apiUrl, json=datajson) A=json.loads(response.content) time=A['time'] - print(A) - similarity=fuzz.ratio( Trusted.strip().lower(),A['content'].strip().lower()) - similarityPartial=fuzz.partial_ratio( Trusted.strip().lower(),A['content'].strip().lower()) + relevance=metrics.RelevanceMetric(datajson["system"]+datajson["content"],response.content) + bias=metrics.BiasMetric22(datajson["system"]+datajson["content"],response.content) + toxic=metrics.ToxicMetric(datajson["system"]+datajson["content"],response.content) + correctness=metrics.correctnessMetric(datajson["system"]+datajson["content"],response.content,Trusted) + #jsonmetrics=metrics.jsonMetrics(response.content,Trusted) + #similarity=fuzz.ratio( Trusted.strip().lower(),A['content'].strip().lower()) + #similarityPartial=fuzz.partial_ratio( Trusted.strip().lower(),A['content'].strip().lower()) #path=datajson["local"] model=datajson["model"] @@ -60,8 +80,14 @@ def EvalLLMCompra(endpoint,datajson,Trusted): "trusted":Trusted, "model":model, "time":time, - "similarity":similarity, - "similaritypartial":similarityPartial, + "relevance":relevance["score"], + "bias":bias["score"], + "toxic":toxic["score"], + "correctness":correctness["score"], + "relevance_r":relevance["reason"], + "bias_r":bias["reason"], + "toxic_r":toxic["reason"], + "correctness_r":correctness["reason"], "path":message } @@ -70,7 +96,43 @@ def 
EvalModelLLMCompra(system,content,model,max_new_tokens,Trusted): datajson={"system":system,"content":content,"password":password ,"model":model,"max_new_token":max_new_tokens} return EvalLLMCompra(endpoint,datajson,Trusted) +def EvalLLMGeneracionTexto(endpoint,datajson,Trusted): + """Evaluate LLL compra + """ + apiUrl=urlText+endpoint + response = requests.get(apiUrl, json=datajson) + A=json.loads(response.content) + time=A['time'] + relevance=metrics.RelevanceMetric(datajson["system"]+datajson["content"],response.content) + bias=metrics.BiasMetric22(datajson["system"]+datajson["content"],response.content) + toxic=metrics.ToxicMetric(datajson["system"]+datajson["content"],response.content) + correctness=metrics.correctnessMetric(datajson["system"]+datajson["content"],response.content,Trusted) + #jsonmetrics=metrics.jsonMetrics(response.content,Trusted) + #similarity=fuzz.ratio( Trusted.strip().lower(),A['content'].strip().lower()) + #similarityPartial=fuzz.partial_ratio( Trusted.strip().lower(),A['content'].strip().lower()) + #path=datajson["local"] + model=datajson["model"] + message=A['content'] + return {"content":message, + "trusted":Trusted, + "model":model, + "time":time, + "relevance":relevance["score"], + "bias":bias["score"], + "toxic":toxic["score"], + "correctness":correctness["score"], + "relevance_r":relevance["reason"], + "bias_r":bias["reason"], + "toxic_r":toxic["reason"], + "correctness_r":correctness["reason"], + "path":message + } + +def EvalModelLLMGeneracionTexto(system,content,model,max_new_tokens,Trusted): + endpoint="/genTextCustom" + datajson={"system":system,"content":content,"password":password ,"model":model,"max_new_token":max_new_tokens} + return EvalLLMGeneracionTexto(endpoint,datajson,Trusted) diff --git a/metrics.py b/metrics.py new file mode 100644 index 0000000..3dadee3 --- /dev/null +++ b/metrics.py @@ -0,0 +1,150 @@ +from pydantic import BaseModel +from anthropic import Anthropic +import instructor +from deepeval.models import DeepEvalBaseLLM +from deepeval.metrics import AnswerRelevancyMetric +from deepeval.test_case import LLMTestCase +from deepeval.metrics import BiasMetric +from deepeval.metrics import ToxicityMetric +from deepeval.metrics import GEval +from deepeval.test_case import LLMTestCaseParams +from deepdiff import DeepDiff +import json +import os +pwd = os.getcwd() +def extractConfig(nameModel="SystemData",relPath=os.path.join(pwd,"conf/experiment_config.json"),dataOut="keyantrophics"): + configPath=os.path.join(os.getcwd(),relPath) + with open(configPath, 'r', encoding='utf-8') as file: + config = json.load(file)[nameModel] + Output= config[dataOut] + return Output + +keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics") + +class CustomClaudeOpus(DeepEvalBaseLLM): + def __init__(self): + self.model = Anthropic(api_key=keyanthropic) + + def load_model(self): + return self.model + + def generate(self, prompt: str, schema: BaseModel) -> BaseModel: + client = self.load_model() + instructor_client = instructor.from_anthropic(client) + resp = instructor_client.messages.create( + model="claude-3-5-sonnet-20240620", + max_tokens=1024, + messages=[ + { + "role": "user", + "content": prompt, + } + ], + response_model=schema, + ) + return resp + + async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel: + return self.generate(prompt, schema) + + def get_model_name(self): + return "Claude-3.5 sonnet" +customModel=CustomClaudeOpus() + +def BiasMetric22(input,actual_output): + metric = 
BiasMetric(threshold=0.5,model=customModel) + + test_case = LLMTestCase( + input=input, + actual_output=actual_output + ) + metric.measure(test_case) + return {"score":metric.score,"reason":metric.reason} + +def RelevanceMetric(input,actual_output): +# Replace this with the actual output from your LLM application + metric = AnswerRelevancyMetric( + threshold=0.7, + model=customModel, + include_reason=True + ) + test_case = LLMTestCase( + input=input, + actual_output=actual_output + ) + + metric.measure(test_case) + return {"score":metric.score,"reason":metric.reason} + + + + + +def ToxicMetric(input,actual_output): + metric = ToxicityMetric(threshold=0.5,model=customModel) + test_case = LLMTestCase( + input=input, + actual_output=actual_output + ) + metric.measure(test_case) + print(metric.score,"toxic") + return {"score":metric.score,"reason":metric.reason} + + + +def correctnessMetric(input,actual_output,expected_output,criteria="Determine that the output is a json whose keys contain with compra and the data correspond to the input",evaluation_steps=["Check whether the facts in 'actual output' contradicts any facts in 'expected output'","You should also heavily penalize omission of detail","Vague language, or contradicting OPINIONS, are OK" ]): + correctness_metric = GEval( + name="Correctness", + model=customModel, + criteria=criteria, + # NOTE: you can only provide either criteria or evaluation_steps, and not both + #evaluation_steps=evaluation_steps, + evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT] + ) + test_case = LLMTestCase( + input=input, + actual_output=actual_output, + expected_output=expected_output + ) + + correctness_metric.measure(test_case) + return {"score":correctness_metric.score,"reason":correctness_metric.reason} + +def jsonMetrics(text,Trusted): + false=False + print(type(text),type(Trusted)) + try: + A=json.loads(text) + jsonOk=1 + except: + jsonOk=0 + print(jsonOk) + if jsonOk==1: + + try: + Trus=json.loads(Trusted) + except: + Trus=Trusted + print(11111,3333,Trus) + # print(type(A),type(json.loads(Trus))) + # ddiff = DeepDiff(A, Trus) + # print(5555,ddiff) + # affectedkeys=ddiff.affected_root_keys/len(A.keys()) + # keys=set(json.loads(Trusted).keys()) + # jsonkeys=set(A.keys()) + # TotKey=len(keys.intersection(jsonkeys))/len(keys) + # keyplus=jsonkeys.intersection(keys) + # else: + # TotKey=0 + # keyplus=0 + # affectedkeys=0 + + return {"jsonOk":jsonOk}#,"TotKey":TotKey,"keyplus":keyplus,"affectedkeys":affectedkeys} + + + + + + + +
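
Example usage (a minimal sketch, not part of the patch): it assumes the FastAPI app in apis.py is running locally (the base URL below is a placeholder), that conf/experiment_config.json provides the Anthropic key used by metrics.py, and that a trusted record with mode "llm_generaciontexto" already exists for the chosen path. The request fields (path, model, system, max_tokens) mirror how the new handler reads Response4; the concrete values are hypothetical.

# sketch: exercise the new /EvalLLMGeneracionTexto endpoint and the metrics helpers
import requests

EVAL_API = "http://127.0.0.1:8000"  # assumed base URL where apis.py is served

payload = {
    "path": "prompts/ejemplo_generacion.txt",   # hypothetical trusted path
    "model": "claude-3-5-sonnet-20240620",      # hypothetical model name
    "system": "Eres un asistente que redacta respuestas breves.",
    "max_tokens": 512,
}

resp = requests.post(EVAL_API + "/EvalLLMGeneracionTexto", json=payload)
resp.raise_for_status()
scores = resp.json()
# the handler returns the scores and the reasons produced by metrics.py
print(scores["relevance"], scores["bias"], scores["toxic"], scores["correctness"])

# The helpers added in metrics.py can also be called directly
# (requires the Anthropic key in conf/experiment_config.json):
import metrics

relevance = metrics.RelevanceMetric(
    "¿Qué productos aparecen en el pedido?",
    "Se piden 3 kg de arroz y 2 litros de aceite.",
)
correctness = metrics.correctnessMetric(
    "¿Qué productos aparecen en el pedido?",
    '{"compra": ["arroz", "aceite"]}',
    '{"compra": ["arroz 3kg", "aceite 2l"]}',
)
print(relevance["score"], correctness["score"])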