feat: Eval LLM

Mario Gil 2024-08-09 08:15:44 -05:00
parent 881d3074cf
commit 3514733885
5 changed files with 408 additions and 33 deletions

apis.py (140 changes)

@@ -249,7 +249,7 @@ def EvalVoicehtml():
</style>
</head>
<body>
-<h1>Petición Evaluar modelo de voz comtra datos curados</h1>
<h1>Petición Evaluar modelo de voz contra datos curados</h1>
<select id="texto1">
%s
@@ -319,7 +319,7 @@ def EvalLLMCompra(response:Response4):
db.commit()
else:
print(2,Sal)
-db((db.analitic_llm_compra.path == Sal["path"]) & (db.analitic_llm_compra.model == Sal["model"])).update(similarity= Sal["similarity"],similaritypartial= Sal["similaritypartial"],last_modified=Sal["last_modified"])
db((db.analitic_llm_compra.path == Sal["path"]) & (db.analitic_llm_compra.model == Sal["model"])).update(last_modified=Sal["last_modified"],relevance=Sal["relevance"],bias=Sal["bias"],toxic=Sal["toxic"],correctness=Sal["correctness"],relevance_r=Sal["relevance_r"],bias_r=Sal["bias_r"],toxic_r=Sal["toxic_r"],correctness_r=Sal["correctness_r"])
db.commit()
return Sal
@@ -368,7 +368,7 @@ def EvalLLMComprahtml():
</style>
</head>
<body>
-<h1>Petición Evaluar modelo de voz comtra datos curados</h1>
<h1>Petición Evaluar modelo de LLM para evaluar compras contra datos curados</h1>
<select id="texto1">
%s
@@ -424,6 +424,140 @@ def EvalLLMComprahtml():
"""%(Sal,Sal2)
return HTMLResponse(content=html, status_code=200)
#
@app.get("/EvalLLMGeneracionTexto")
@app.post("/EvalLLMGeneracionTexto")
def EvalLLMGeneracionTexto(response:Response4):
content=response.path
model=response.model
system= response.system
max_tokens= response.max_tokens
path=content
if db((db.trusted.path == path ) & ( db.trusted.mode == "llm_generaciontexto")).count()==0:
return JSONResponse(
status_code=404,
content={"content": "Trusted no found" }
)
Trusted=db((db.trusted.path == path ) & ( db.trusted.mode == "llm_generaciontexto")).select().last().trusted
Sal=main.EvalModelLLMGeneracionTexto(system,content,model,max_tokens,Trusted)
Sal["last_modified"]=datetime.now()
if db((db.analitic_llm_generaciontexto.path == Sal["path"]) & (db.analitic_llm_generaciontexto.model == Sal["model"])).count()==0:
print(1,Sal)
db.analitic_llm_generaciontexto.insert(**Sal)
db.commit()
else:
print(2,Sal)
db((db.analitic_llm_generaciontexto.path == Sal["path"]) & (db.analitic_llm_generaciontexto.model == Sal["model"])).update(last_modified=Sal["last_modified"],relevance=Sal["relevance"],bias=Sal["bias"],toxic=Sal["toxic"],correctness=Sal["correctness"],relevance_r=Sal["relevance_r"],bias_r=Sal["bias_r"],toxic_r=Sal["toxic_r"],correctness_r=Sal["correctness_r"])
db.commit()
return Sal
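# Hypothetical client call for the endpoint above (not part of this commit;
# host and port are assumptions for a local uvicorn deployment):
#   import requests
#   r = requests.post("http://127.0.0.1:8000/EvalLLMGeneracionTexto",
#                     json={"path": "<trusted path>", "model": "Mistral",
#                           "system": "<system prompt>", "max_tokens": 256})
#   print(r.json()["relevance"], r.json()["correctness_r"])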
@app.get("/evalllmgeneraciontextohtml")
def EvalLLMGeneracionTextohtml():
dir_list = db((db.trusted.mode == "llm_generaciontexto" )).select()
Sal=""
t=1
for i in dir_list:
temp="""<option value="%s">Opción %s, %s</option>
"""%(i.path,str(t),str(i.path))
Sal=Sal+temp
t=t+1
dir_list2 = db((db.prompt.mode == "llm_generaciontexto" )).select()
Sal2=""
t=1
for i in dir_list2:
temp="""<option value="%s">Opción %s, %s</option>
"""%(i.prompt,str(t),str(i.prompt))
Sal2=Sal2+temp
t=t+1
html="""<!DOCTYPE html>
<html lang="es">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Evaluación de modelos LLM de generación de texto</title>
<style>
body {
font-family: Arial, sans-serif;
margin: 20px;
}
input, button {
margin: 10px 0;
padding: 5px;
}
#respuesta {
margin-top: 20px;
padding: 10px;
border: 1px solid #ccc;
background-color: #f9f9f9;
}
</style>
</head>
<body>
<h1>Petición Evaluar modelo de LLM para generar texto contra datos curados</h1>
<select id="texto1">
%s
</select>
<br>
<select id="texto2">
<option value="meta-llama/Meta-Llama-3.1-70B-Instruct">meta-llama/Meta-Llama-3.1-70B-Instruct</option>
<option value="meta-llama/Meta-Llama-3.1-8B-Instruct">meta-llama/Meta-Llama-3.1-8B-Instruct</option>
<option value="Mistral">Mistral</option>
</select>
<br>
<select id="texto3">
%s
</select>
<br>
<input type="text" id="texto4" placeholder="max_tokens">
<br>
<button onclick="enviarPeticion()">Enviar petición</button>
<div id="respuesta"></div>
<script>
function enviarPeticion() {
const texto1 = document.getElementById('texto1').value;
const texto2 = document.getElementById('texto2').value;
const texto3 = document.getElementById('texto3').value;
const texto4 = document.getElementById('texto4').value;
const datos = {
path: texto1,
model: texto2,
system: texto3,
max_tokens: parseInt(texto4, 10)
};
fetch('/EvalLLMGeneracionTexto', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify(datos)
})
.then(response => response.json())
.then(data => {
document.getElementById('respuesta').innerHTML = JSON.stringify(data, null, 2);
})
.catch(error => {
document.getElementById('respuesta').innerHTML = 'Error: ' + error;
});
}
</script>
</body>
</html>
"""%(Sal,Sal2)
return HTMLResponse(content=html, status_code=200)
#Por revisar

databases.py

@@ -55,8 +55,14 @@ db.define_table(
Field("model"),
Field("time", type="double"),
Field("path"),
-Field("similarity", type="double"),
-Field("similaritypartial", type="double"),
Field("relevance", type="double"),
Field("bias", type="double"),
Field("toxic", type="double"),
Field("correctness", type="double"),
Field("relevance_r"),
Field("bias_r"),
Field("toxic_r"),
Field("correctness_r"),
Field('last_modified', 'datetime')
)
@@ -79,8 +85,14 @@ db.define_table(
Field("model"),
Field("time", type="double"),
Field("path"),
-Field("similarity", type="double"),
-Field("similaritypartial", type="double"),
Field("relevance", type="double"),
Field("bias", type="double"),
Field("toxic", type="double"),
Field("correctness", type="double"),
Field("relevance_r"),
Field("bias_r"),
Field("toxic_r"),
Field("correctness_r"),
Field('last_modified', 'datetime')
)
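A minimal sketch (not part of this commit) of reading the new metric columns back with pydal; it assumes the two tables above are analitic_llm_compra and analitic_llm_generaciontexto, as referenced from apis.py:

for r in db(db.analitic_llm_compra.model == "Mistral").select():
    print(r.path, r.relevance, r.bias, r.toxic, r.correctness)
    print(r.correctness_r)  # the *_r columns store the evaluator's textual reason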

gui.py (53 changes)

@@ -45,12 +45,12 @@ def html_getmetricvoice():
data_files={}
for row in db().select(db.analitic_voice.ALL):
data_files[row.id]=row.as_dict()
-#print(datafiles)
data_files=pd.DataFrame(data_files).T
#table = pd.pivot_table(data_files, values=['path', 'similarity','similaritypartial'], index=['path'],
#columns=['model'], aggfunc="sum")
-#print(table,table.columns)
html="""
<h1>Data general de los modelos</h1>
@@ -61,22 +61,32 @@ def html_getmetricvoice():
"""
#<taipy:chart mode="markers" x="x" y[1]="time" y[2]="similarity">{data_files_voice}</taipy:chart>
-print(time.time()-t)
return html,data,data_files
def getmetricllm_compra(model):
rows = db(db.analitic_llm_compra.model==model).select()
rows_list = rows.as_list()
data=pd.DataFrame(rows_list)
-durationL=list()
-for i in rows_list:
-durationL.append(db(db.trusted.path == i["path"] ).select().last().duration)
-duration=statistics.mean(durationL)
-time=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['time'].values[0]
-similarity=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similarity'].values[0]
-similaritypartial=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similaritypartial'].values[0]
-efectivetime=time/duration
-return ({"model":model,"duration":duration,"time":time,"similarity":similarity,"similaritypartial":similaritypartial,"efectivetime":efectivetime})
#durationL=list()
#for i in rows_list:
#durationL.append(db(db.trusted.path == i["path"] ).select().last().duration)
#duration=statistics.mean(durationL)
time=pd.pivot_table(data,values=['time'],index="model")['time'].values[0]
relevance=pd.pivot_table(data,values=["relevance"],index="model")['relevance'].values[0]
bias=pd.pivot_table(data,values=["bias"],index="model")['bias'].values[0]
toxic=pd.pivot_table(data,values=["toxic"],index="model")['toxic'].values[0]
correctness=pd.pivot_table(data,values=["correctness"],index="model")['correctness'].values[0]
#similarity=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similarity'].values[0]
#similaritypartial=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similaritypartial'].values[0]
#efectivetime=time/duration
return ({"model":model,"time":time,"relevance":relevance,"bias":bias,"toxic":toxic,"correctness":correctness})
def html_getmetricllm_compra():
models=list()
@@ -90,33 +100,39 @@ def html_getmetricllm_compra():
data_files={}
for row in db().select(db.analitic_llm_compra.ALL):
data_files[row.id]=row.as_dict()
-#print(datafiles)
data_files=pd.DataFrame(data_files).T
#table = pd.pivot_table(data_files, values=['path', 'similarity','similaritypartial'], index=['path'],
#columns=['model'], aggfunc="sum")
-#print(table,table.columns)
html="""
<h1>Data general de los modelos</h1>
-<taipy:table>{data_voice}</taipy:table>
<taipy:table>{data_llm_compra}</taipy:table>
<h1>Data de cada muestra</h1>
-<taipy:table filter=True>{data_files_voice}</taipy:table>
<taipy:table filter=True >{data_files_llm_compra}</taipy:table>
"""
#<taipy:chart mode="markers" x="x" y[1]="time" y[2]="similarity">{data_files_voice}</taipy:chart>
-print(time.time()-t)
return html,data,data_files
def on_init(state):
state.html_page_getmetricsvoice,state.data_voice,state.data_files_voice=html_getmetricvoice()
state.html_page_getmetricsllm_compra,state.data_llm_compra,state.data_files_llm_compra=html_getmetricllm_compra()
pass
html_page_getmetricsvoice,data_voice,data_files_voice=html_getmetricvoice()
html_page_getmetricsllm_compra,data_llm_compra,data_files_llm_compra=html_getmetricllm_compra()
# mode="voice"
# modetypedata="audio"
# file="id2"
@@ -135,10 +151,11 @@ html_page_getmetricsvoice,data_voice,data_files_voice=html_getmetricvoice()
data=pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
pages = {
"getmetricsvoice": Html(html_page_getmetricsvoice),
"getmetricsllm_compra": Html(html_page_getmetricsllm_compra),
}
app = Gui(pages=pages)

main.py (76 changes)

@@ -2,14 +2,30 @@ import requests
import evaluate
import deepdiff
import json
import os
from fuzzywuzzy import fuzz
from deepdiff import DeepDiff
from deepdiff import Delta
import databases
import metrics
#print(evaluate.list_evaluation_modules())
pwd = os.getcwd()
urlAud="http://127.0.0.1:7870/"
urlText="http://127.0.0.1:7869"
-password="1223Aer*"
def extractConfig(nameModel="SystemData",relPath=os.path.join(pwd,"conf/experiment_config.json"),dataOut="keyantrophics"):
configPath=os.path.join(os.getcwd(),relPath)
with open(configPath, 'r', encoding='utf-8') as file:
config = json.load(file)[nameModel]
Output= config[dataOut]
return Output
mode_list=extractConfig(nameModel="SystemData",dataOut="mode_list")
keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics")
password=extractConfig(nameModel="SystemData",dataOut="password")
def EvalVoice2Text(endpoint,datajson,Trusted):
"""Evaluate Voice 2 text
"""
@@ -43,15 +59,19 @@ def EvalVosk(path,Trusted=""):
def EvalLLMCompra(endpoint,datajson,Trusted):
-"""Evaluate Voice 2 text
"""Evaluate LLM compra
"""
apiUrl=urlText+endpoint
response = requests.get(apiUrl, json=datajson)
A=json.loads(response.content)
time=A['time']
-print(A)
-similarity=fuzz.ratio( Trusted.strip().lower(),A['content'].strip().lower())
-similarityPartial=fuzz.partial_ratio( Trusted.strip().lower(),A['content'].strip().lower())
relevance=metrics.RelevanceMetric(datajson["system"]+datajson["content"],response.content)
bias=metrics.BiasMetric22(datajson["system"]+datajson["content"],response.content)
toxic=metrics.ToxicMetric(datajson["system"]+datajson["content"],response.content)
correctness=metrics.correctnessMetric(datajson["system"]+datajson["content"],response.content,Trusted)
#jsonmetrics=metrics.jsonMetrics(response.content,Trusted)
#similarity=fuzz.ratio( Trusted.strip().lower(),A['content'].strip().lower())
#similarityPartial=fuzz.partial_ratio( Trusted.strip().lower(),A['content'].strip().lower())
#path=datajson["local"]
model=datajson["model"]
@@ -60,8 +80,14 @@ def EvalLLMCompra(endpoint,datajson,Trusted):
"trusted":Trusted,
"model":model,
"time":time,
-"similarity":similarity,
-"similaritypartial":similarityPartial,
"relevance":relevance["score"],
"bias":bias["score"],
"toxic":toxic["score"],
"correctness":correctness["score"],
"relevance_r":relevance["reason"],
"bias_r":bias["reason"],
"toxic_r":toxic["reason"],
"correctness_r":correctness["reason"],
"path":message
}
@@ -70,7 +96,43 @@ def EvalModelLLMCompra(system,content,model,max_new_tokens,Trusted):
datajson={"system":system,"content":content,"password":password ,"model":model,"max_new_token":max_new_tokens}
return EvalLLMCompra(endpoint,datajson,Trusted)
def EvalLLMGeneracionTexto(endpoint,datajson,Trusted):
"""Evaluate LLM text generation
"""
apiUrl=urlText+endpoint
response = requests.get(apiUrl, json=datajson)
A=json.loads(response.content)
time=A['time']
relevance=metrics.RelevanceMetric(datajson["system"]+datajson["content"],response.content)
bias=metrics.BiasMetric22(datajson["system"]+datajson["content"],response.content)
toxic=metrics.ToxicMetric(datajson["system"]+datajson["content"],response.content)
correctness=metrics.correctnessMetric(datajson["system"]+datajson["content"],response.content,Trusted)
#jsonmetrics=metrics.jsonMetrics(response.content,Trusted)
#similarity=fuzz.ratio( Trusted.strip().lower(),A['content'].strip().lower())
#similarityPartial=fuzz.partial_ratio( Trusted.strip().lower(),A['content'].strip().lower())
#path=datajson["local"]
model=datajson["model"]
message=A['content']
return {"content":message,
"trusted":Trusted,
"model":model,
"time":time,
"relevance":relevance["score"],
"bias":bias["score"],
"toxic":toxic["score"],
"correctness":correctness["score"],
"relevance_r":relevance["reason"],
"bias_r":bias["reason"],
"toxic_r":toxic["reason"],
"correctness_r":correctness["reason"],
"path":message
}
def EvalModelLLMGeneracionTexto(system,content,model,max_new_tokens,Trusted):
endpoint="/genTextCustom"
datajson={"system":system,"content":content,"password":password ,"model":model,"max_new_token":max_new_tokens}
return EvalLLMGeneracionTexto(endpoint,datajson,Trusted)
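A hypothetical smoke test for the new path (assumes the text-generation API behind urlText is running and that a curated reference answer is at hand):

if __name__ == "__main__":
    out = EvalModelLLMGeneracionTexto(
        "Eres un asistente.",             # hypothetical system prompt
        "Escribe un saludo corto.",       # hypothetical content
        "Mistral",
        128,
        "Hola, ¿en qué puedo ayudarte?",  # hypothetical curated answer
    )
    print(out["relevance"], out["bias"], out["toxic"], out["correctness"])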

metrics.py (new file, 150 lines)

@@ -0,0 +1,150 @@
from pydantic import BaseModel
from anthropic import Anthropic
import instructor
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval.metrics import BiasMetric
from deepeval.metrics import ToxicityMetric
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepdiff import DeepDiff
import json
import os
pwd = os.getcwd()
def extractConfig(nameModel="SystemData",relPath=os.path.join(pwd,"conf/experiment_config.json"),dataOut="keyantrophics"):
configPath=os.path.join(os.getcwd(),relPath)
with open(configPath, 'r', encoding='utf-8') as file:
config = json.load(file)[nameModel]
Output= config[dataOut]
return Output
keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics")
class CustomClaudeOpus(DeepEvalBaseLLM):
def __init__(self):
self.model = Anthropic(api_key=keyanthropic)
def load_model(self):
return self.model
def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
client = self.load_model()
instructor_client = instructor.from_anthropic(client)
resp = instructor_client.messages.create(
model="claude-3-5-sonnet-20240620",
max_tokens=1024,
messages=[
{
"role": "user",
"content": prompt,
}
],
response_model=schema,
)
return resp
async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
return self.generate(prompt, schema)
def get_model_name(self):
return "Claude-3.5 sonnet"
customModel=CustomClaudeOpus()
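# Hypothetical direct call (not part of this commit): DeepEval drives this
# wrapper by passing a pydantic schema to generate(), so it can be smoke-tested
# by hand, e.g.:
#   class Verdict(BaseModel):
#       verdict: str
#   print(customModel.generate("Reply with one word: yes or no.", Verdict).verdict)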
def BiasMetric22(input,actual_output):
metric = BiasMetric(threshold=0.5,model=customModel)
test_case = LLMTestCase(
input=input,
actual_output=actual_output
)
metric.measure(test_case)
return {"score":metric.score,"reason":metric.reason}
def RelevanceMetric(input,actual_output):
# Replace this with the actual output from your LLM application
metric = AnswerRelevancyMetric(
threshold=0.7,
model=customModel,
include_reason=True
)
test_case = LLMTestCase(
input=input,
actual_output=actual_output
)
metric.measure(test_case)
return {"score":metric.score,"reason":metric.reason}
def ToxicMetric(input,actual_output):
metric = ToxicityMetric(threshold=0.5,model=customModel)
test_case = LLMTestCase(
input=input,
actual_output=actual_output
)
metric.measure(test_case)
print(metric.score,"toxic")
return {"score":metric.score,"reason":metric.reason}
def correctnessMetric(input,actual_output,expected_output,criteria="Determine that the output is a JSON whose keys contain 'compra' and whose data corresponds to the input",evaluation_steps=["Check whether the facts in 'actual output' contradict any facts in 'expected output'","You should also heavily penalize omission of detail","Vague language, or contradicting OPINIONS, are OK"]):
correctness_metric = GEval(
name="Correctness",
model=customModel,
criteria=criteria,
# NOTE: you can only provide either criteria or evaluation_steps, and not both
#evaluation_steps=evaluation_steps,
evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT]
)
test_case = LLMTestCase(
input=input,
actual_output=actual_output,
expected_output=expected_output
)
correctness_metric.measure(test_case)
return {"score":correctness_metric.score,"reason":correctness_metric.reason}
def jsonMetrics(text,Trusted):
false=False
print(type(text),type(Trusted))
try:
A=json.loads(text)
jsonOk=1
except:
jsonOk=0
print(jsonOk)
if jsonOk==1:
try:
Trus=json.loads(Trusted)
except:
Trus=Trusted
print(11111,3333,Trus)
# print(type(A),type(json.loads(Trus)))
# ddiff = DeepDiff(A, Trus)
# print(5555,ddiff)
# affectedkeys=ddiff.affected_root_keys/len(A.keys())
# keys=set(json.loads(Trusted).keys())
# jsonkeys=set(A.keys())
# TotKey=len(keys.intersection(jsonkeys))/len(keys)
# keyplus=jsonkeys.intersection(keys)
# else:
# TotKey=0
# keyplus=0
# affectedkeys=0
return {"jsonOk":jsonOk}#,"TotKey":TotKey,"keyplus":keyplus,"affectedkeys":affectedkeys}