From 35147338850c50551afdf78d0a5340425cfcd9f8 Mon Sep 17 00:00:00 2001
From: marioggil
Date: Fri, 9 Aug 2024 08:15:44 -0500
Subject: [PATCH] feat: Eval LLM

---
 apis.py      | 140 +++++++++++++++++++++++++++++++++++++++++++++--
 databases.py |  22 ++++++--
 gui.py       |  53 +++++++++++-------
 main.py      |  76 +++++++++++++++++++++++---
 metrics.py   | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 408 insertions(+), 33 deletions(-)
 create mode 100644 metrics.py

diff --git a/apis.py b/apis.py
index e29be7d..5b53bc4 100644
--- a/apis.py
+++ b/apis.py
@@ -249,7 +249,7 @@ def EvalVoicehtml():
-

Petición Evaluar modelo de voz comtra datos curados

+

Petición Evaluar modelo de voz contra datos curados

     %s
@@ -424,6 +424,140 @@ def EvalLLMComprahtml():
     """%(Sal,Sal2)
     return HTMLResponse(content=html, status_code=200)
 
+#
+@app.get("/EvalLLMGeneracionTexto")
+@app.post("/EvalLLMGeneracionTexto")
+def EvalLLMGeneracionTexto(response:Response4):
+    content=response.path
+    model=response.model
+    system= response.system
+    max_tokens= response.max_tokens
+    path=content
+
+    if db((db.trusted.path == path ) & ( db.trusted.mode == "llm_generaciontexto")).count()==0:
+        return JSONResponse(
+            status_code=404,
+            content={"content": "Trusted not found" }
+        )
+
+    Trusted=db((db.trusted.path == path ) & ( db.trusted.mode == "llm_generaciontexto")).select().last().trusted
+    Sal=main.EvalModelLLMGeneracionTexto(system,content,model,max_tokens,Trusted)
+    Sal["last_modified"]=datetime.now()
+    if db((db.analitic_llm_generaciontexto.path == Sal["path"]) & (db.analitic_llm_generaciontexto.model == Sal["model"])).count()==0:
+        print(1,Sal)
+        db.analitic_llm_generaciontexto.insert(**Sal)
+        db.commit()
+    else:
+        print(2,Sal)
+        db((db.analitic_llm_generaciontexto.path == Sal["path"]) & (db.analitic_llm_generaciontexto.model == Sal["model"])).update(last_modified=Sal["last_modified"],relevance=Sal["relevance"],bias=Sal["bias"],toxic=Sal["toxic"],correctness=Sal["correctness"],relevance_r=Sal["relevance_r"],bias_r=Sal["bias_r"],toxic_r=Sal["toxic_r"],correctness_r=Sal["correctness_r"])
+        db.commit()
+    return Sal
+
+@app.get("/evalllmgeneraciontextohtml")
+def EvalLLMGeneracionTextohtml():
+    dir_list = db((db.trusted.mode == "llm_generaciontexto" )).select()
+    Sal=""
+    t=1
+    for i in dir_list:
+        temp="""
+        """%(i.path,str(t),str(i.path))
+        Sal=Sal+temp
+        t=t+1
+
+    dir_list2 = db((db.prompt.mode == "llm_generaciontexto" )).select()
+    Sal2=""
+    t=1
+    for i in dir_list2:
+        temp="""
+        """%(i.prompt,str(t),str(i.prompt))
+        Sal2=Sal2+temp
+        t=t+1
+
+
+    html="""
+
+
+
+
+    Evaluacion de modelos LLM generacion de texto
+
+
+
+

Petición Evaluar modelo de LLM para generar texto contra datos curados

+ + + +
+ +
+ +
+ +
+ +
+ + + + + """%(Sal,Sal2) + return HTMLResponse(content=html, status_code=200) + + + + + #Por revisar diff --git a/databases.py b/databases.py index d19b055..8fca7e7 100644 --- a/databases.py +++ b/databases.py @@ -55,8 +55,14 @@ db.define_table( Field("model"), Field("time", type="double"), Field("path"), - Field("similarity", type="double"), - Field("similaritypartial", type="double"), + Field("relevance", type="double"), + Field("bias", type="double"), + Field("toxic", type="double"), + Field("correctness", type="double"), + Field("relevance_r"), + Field("bias_r"), + Field("toxic_r"), + Field("correctness_r"), Field('last_modified', 'datetime') ) @@ -79,9 +85,15 @@ db.define_table( Field("model"), Field("time", type="double"), Field("path"), - Field("similarity", type="double"), - Field("similaritypartial", type="double"), - Field('last_modified', 'datetime') + Field("relevance", type="double"), + Field("bias", type="double"), + Field("toxic", type="double"), + Field("correctness", type="double"), + Field("relevance_r"), + Field("bias_r"), + Field("toxic_r"), + Field("correctness_r"), + Field('last_modified', 'datetime') ) db.define_table( diff --git a/gui.py b/gui.py index eeb8233..01c9e86 100644 --- a/gui.py +++ b/gui.py @@ -45,12 +45,12 @@ def html_getmetricvoice(): data_files={} for row in db().select(db.analitic_voice.ALL): data_files[row.id]=row.as_dict() - #print(datafiles) + data_files=pd.DataFrame(data_files).T #table = pd.pivot_table(data_files, values=['path', 'similarity','similaritypartial'], index=['path'], #columns=['model'], aggfunc="sum") - #print(table,table.columns) + html="""

Data general de los modelos

@@ -61,22 +61,32 @@ def html_getmetricvoice(): """ #{data_files_voice} - print(time.time()-t) + return html,data,data_files + + + + def getmetricllm_compra(model): rows = db(db.analitic_llm_compra.model==model).select() rows_list = rows.as_list() data=pd.DataFrame(rows_list) - durationL=list() - for i in rows_list: - durationL.append(db(db.trusted.path == i["path"] ).select().last().duration) - duration=statistics.mean(durationL) - time=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['time'].values[0] - similarity=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similarity'].values[0] - similaritypartial=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similaritypartial'].values[0] - efectivetime=time/duration - return ({"model":model,"duration":duration,"time":time,"similarity":similarity,"similaritypartial":similaritypartial,"efectivetime":efectivetime}) + + #durationL=list() + #for i in rows_list: + #durationL.append(db(db.trusted.path == i["path"] ).select().last().duration) + #duration=statistics.mean(durationL) + time=pd.pivot_table(data,values=['time'],index="model")['time'].values[0] + relevance=pd.pivot_table(data,values=["relevance"],index="model")['relevance'].values[0] + bias=pd.pivot_table(data,values=["bias"],index="model")['bias'].values[0] + toxic=pd.pivot_table(data,values=["toxic"],index="model")['toxic'].values[0] + + correctness=pd.pivot_table(data,values=["correctness"],index="model")['correctness'].values[0] + #similarity=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similarity'].values[0] + #similaritypartial=pd.pivot_table(data,values=['time','similarity', 'similaritypartial'],index="model")['similaritypartial'].values[0] + #efectivetime=time/duration + return ({"model":model,"time":time,"relevance":relevance,"bias":bias,"toxic":toxic,"correctness":correctness}) def html_getmetricllm_compra(): models=list() @@ -90,33 +100,39 @@ def html_getmetricllm_compra(): data_files={} for row in db().select(db.analitic_llm_compra.ALL): data_files[row.id]=row.as_dict() - #print(datafiles) + data_files=pd.DataFrame(data_files).T #table = pd.pivot_table(data_files, values=['path', 'similarity','similaritypartial'], index=['path'], #columns=['model'], aggfunc="sum") - #print(table,table.columns) + html="""

Data general de los modelos

-        {data_voice}
+        {data_llm_compra}

Data de cada muestra

- {data_files_voice} + {data_files_llm_compra} """ #{data_files_voice} - print(time.time()-t) + return html,data,data_files def on_init(state): state.html_page_getmetricsvoice,state.data_voice,state.data_files_voice=html_getmetricvoice() + state.html_page_getmetricsllm_compra,state.data_llm_compra,state.data_files_llm_compra=html_getmetricllm_compra() + pass html_page_getmetricsvoice,data_voice,data_files_voice=html_getmetricvoice() + +html_page_getmetricsllm_compra,data_llm_compra,data_files_llm_compra=html_getmetricllm_compra() + + # mode="voice" # modetypedata="audio" # file="id2" @@ -135,10 +151,11 @@ html_page_getmetricsvoice,data_voice,data_files_voice=html_getmetricvoice() -data=pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + pages = { "getmetricsvoice": Html(html_page_getmetricsvoice), + "getmetricsllm_compra": Html(html_page_getmetricsllm_compra), } app = Gui(pages=pages) diff --git a/main.py b/main.py index 47aef3a..38c2c38 100644 --- a/main.py +++ b/main.py @@ -2,14 +2,30 @@ import requests import evaluate import deepdiff import json +import os + from fuzzywuzzy import fuzz from deepdiff import DeepDiff from deepdiff import Delta import databases +import metrics #print(evaluate.list_evaluation_modules()) +pwd = os.getcwd() urlAud="http://127.0.0.1:7870/" urlText="http://127.0.0.1:7869" -password="1223Aer*" + + +def extractConfig(nameModel="SystemData",relPath=os.path.join(pwd,"conf/experiment_config.json"),dataOut="keyantrophics"): + configPath=os.path.join(os.getcwd(),relPath) + with open(configPath, 'r', encoding='utf-8') as file: + config = json.load(file)[nameModel] + Output= config[dataOut] + return Output +mode_list=extractConfig(nameModel="SystemData",dataOut="mode_list") +keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics") +password=extractConfig(nameModel="SystemData",dataOut="password") + + def EvalVoice2Text(endpoint,datajson,Trusted): """Evaluate Voice 2 text """ @@ -43,15 +59,19 @@ def EvalVosk(path,Trusted=""): def EvalLLMCompra(endpoint,datajson,Trusted): - """Evaluate Voice 2 text + """Evaluate LLL compra """ apiUrl=urlText+endpoint response = requests.get(apiUrl, json=datajson) A=json.loads(response.content) time=A['time'] - print(A) - similarity=fuzz.ratio( Trusted.strip().lower(),A['content'].strip().lower()) - similarityPartial=fuzz.partial_ratio( Trusted.strip().lower(),A['content'].strip().lower()) + relevance=metrics.RelevanceMetric(datajson["system"]+datajson["content"],response.content) + bias=metrics.BiasMetric22(datajson["system"]+datajson["content"],response.content) + toxic=metrics.ToxicMetric(datajson["system"]+datajson["content"],response.content) + correctness=metrics.correctnessMetric(datajson["system"]+datajson["content"],response.content,Trusted) + #jsonmetrics=metrics.jsonMetrics(response.content,Trusted) + #similarity=fuzz.ratio( Trusted.strip().lower(),A['content'].strip().lower()) + #similarityPartial=fuzz.partial_ratio( Trusted.strip().lower(),A['content'].strip().lower()) #path=datajson["local"] model=datajson["model"] @@ -60,8 +80,14 @@ def EvalLLMCompra(endpoint,datajson,Trusted): "trusted":Trusted, "model":model, "time":time, - "similarity":similarity, - "similaritypartial":similarityPartial, + "relevance":relevance["score"], + "bias":bias["score"], + "toxic":toxic["score"], + "correctness":correctness["score"], + "relevance_r":relevance["reason"], + "bias_r":bias["reason"], + "toxic_r":toxic["reason"], + "correctness_r":correctness["reason"], "path":message } @@ -70,7 +96,43 @@ def 
EvalModelLLMCompra(system,content,model,max_new_tokens,Trusted): datajson={"system":system,"content":content,"password":password ,"model":model,"max_new_token":max_new_tokens} return EvalLLMCompra(endpoint,datajson,Trusted) +def EvalLLMGeneracionTexto(endpoint,datajson,Trusted): + """Evaluate LLL compra + """ + apiUrl=urlText+endpoint + response = requests.get(apiUrl, json=datajson) + A=json.loads(response.content) + time=A['time'] + relevance=metrics.RelevanceMetric(datajson["system"]+datajson["content"],response.content) + bias=metrics.BiasMetric22(datajson["system"]+datajson["content"],response.content) + toxic=metrics.ToxicMetric(datajson["system"]+datajson["content"],response.content) + correctness=metrics.correctnessMetric(datajson["system"]+datajson["content"],response.content,Trusted) + #jsonmetrics=metrics.jsonMetrics(response.content,Trusted) + #similarity=fuzz.ratio( Trusted.strip().lower(),A['content'].strip().lower()) + #similarityPartial=fuzz.partial_ratio( Trusted.strip().lower(),A['content'].strip().lower()) + #path=datajson["local"] + model=datajson["model"] + message=A['content'] + return {"content":message, + "trusted":Trusted, + "model":model, + "time":time, + "relevance":relevance["score"], + "bias":bias["score"], + "toxic":toxic["score"], + "correctness":correctness["score"], + "relevance_r":relevance["reason"], + "bias_r":bias["reason"], + "toxic_r":toxic["reason"], + "correctness_r":correctness["reason"], + "path":message + } + +def EvalModelLLMGeneracionTexto(system,content,model,max_new_tokens,Trusted): + endpoint="/genTextCustom" + datajson={"system":system,"content":content,"password":password ,"model":model,"max_new_token":max_new_tokens} + return EvalLLMGeneracionTexto(endpoint,datajson,Trusted) diff --git a/metrics.py b/metrics.py new file mode 100644 index 0000000..3dadee3 --- /dev/null +++ b/metrics.py @@ -0,0 +1,150 @@ +from pydantic import BaseModel +from anthropic import Anthropic +import instructor +from deepeval.models import DeepEvalBaseLLM +from deepeval.metrics import AnswerRelevancyMetric +from deepeval.test_case import LLMTestCase +from deepeval.metrics import BiasMetric +from deepeval.metrics import ToxicityMetric +from deepeval.metrics import GEval +from deepeval.test_case import LLMTestCaseParams +from deepdiff import DeepDiff +import json +import os +pwd = os.getcwd() +def extractConfig(nameModel="SystemData",relPath=os.path.join(pwd,"conf/experiment_config.json"),dataOut="keyantrophics"): + configPath=os.path.join(os.getcwd(),relPath) + with open(configPath, 'r', encoding='utf-8') as file: + config = json.load(file)[nameModel] + Output= config[dataOut] + return Output + +keyanthropic=extractConfig(nameModel="SystemData",dataOut="keyantrophics") + +class CustomClaudeOpus(DeepEvalBaseLLM): + def __init__(self): + self.model = Anthropic(api_key=keyanthropic) + + def load_model(self): + return self.model + + def generate(self, prompt: str, schema: BaseModel) -> BaseModel: + client = self.load_model() + instructor_client = instructor.from_anthropic(client) + resp = instructor_client.messages.create( + model="claude-3-5-sonnet-20240620", + max_tokens=1024, + messages=[ + { + "role": "user", + "content": prompt, + } + ], + response_model=schema, + ) + return resp + + async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel: + return self.generate(prompt, schema) + + def get_model_name(self): + return "Claude-3.5 sonnet" +customModel=CustomClaudeOpus() + +def BiasMetric22(input,actual_output): + metric = 
BiasMetric(threshold=0.5,model=customModel) + + test_case = LLMTestCase( + input=input, + actual_output=actual_output + ) + metric.measure(test_case) + return {"score":metric.score,"reason":metric.reason} + +def RelevanceMetric(input,actual_output): +# Replace this with the actual output from your LLM application + metric = AnswerRelevancyMetric( + threshold=0.7, + model=customModel, + include_reason=True + ) + test_case = LLMTestCase( + input=input, + actual_output=actual_output + ) + + metric.measure(test_case) + return {"score":metric.score,"reason":metric.reason} + + + + + +def ToxicMetric(input,actual_output): + metric = ToxicityMetric(threshold=0.5,model=customModel) + test_case = LLMTestCase( + input=input, + actual_output=actual_output + ) + metric.measure(test_case) + print(metric.score,"toxic") + return {"score":metric.score,"reason":metric.reason} + + + +def correctnessMetric(input,actual_output,expected_output,criteria="Determine that the output is a json whose keys contain with compra and the data correspond to the input",evaluation_steps=["Check whether the facts in 'actual output' contradicts any facts in 'expected output'","You should also heavily penalize omission of detail","Vague language, or contradicting OPINIONS, are OK" ]): + correctness_metric = GEval( + name="Correctness", + model=customModel, + criteria=criteria, + # NOTE: you can only provide either criteria or evaluation_steps, and not both + #evaluation_steps=evaluation_steps, + evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT] + ) + test_case = LLMTestCase( + input=input, + actual_output=actual_output, + expected_output=expected_output + ) + + correctness_metric.measure(test_case) + return {"score":correctness_metric.score,"reason":correctness_metric.reason} + +def jsonMetrics(text,Trusted): + false=False + print(type(text),type(Trusted)) + try: + A=json.loads(text) + jsonOk=1 + except: + jsonOk=0 + print(jsonOk) + if jsonOk==1: + + try: + Trus=json.loads(Trusted) + except: + Trus=Trusted + print(11111,3333,Trus) + # print(type(A),type(json.loads(Trus))) + # ddiff = DeepDiff(A, Trus) + # print(5555,ddiff) + # affectedkeys=ddiff.affected_root_keys/len(A.keys()) + # keys=set(json.loads(Trusted).keys()) + # jsonkeys=set(A.keys()) + # TotKey=len(keys.intersection(jsonkeys))/len(keys) + # keyplus=jsonkeys.intersection(keys) + # else: + # TotKey=0 + # keyplus=0 + # affectedkeys=0 + + return {"jsonOk":jsonOk}#,"TotKey":TotKey,"keyplus":keyplus,"affectedkeys":affectedkeys} + + + + + + + +
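
Example usage (a minimal sketch, not part of the patch): it assumes the FastAPI app in apis.py is running locally (the base URL below is a placeholder), that conf/experiment_config.json provides the Anthropic key used by metrics.py, and that a trusted record with mode "llm_generaciontexto" already exists for the chosen path. The request fields (path, model, system, max_tokens) mirror how the new handler reads Response4; the concrete values are hypothetical.

# sketch: exercise the new /EvalLLMGeneracionTexto endpoint and the metrics helpers
import requests

EVAL_API = "http://127.0.0.1:8000"  # assumed base URL where apis.py is served

payload = {
    "path": "prompts/ejemplo_generacion.txt",   # hypothetical trusted path
    "model": "claude-3-5-sonnet-20240620",      # hypothetical model name
    "system": "Eres un asistente que redacta respuestas breves.",
    "max_tokens": 512,
}

resp = requests.post(EVAL_API + "/EvalLLMGeneracionTexto", json=payload)
resp.raise_for_status()
scores = resp.json()
# the handler returns the scores and the reasons produced by metrics.py
print(scores["relevance"], scores["bias"], scores["toxic"], scores["correctness"])

# The helpers added in metrics.py can also be called directly
# (requires the Anthropic key in conf/experiment_config.json):
import metrics

relevance = metrics.RelevanceMetric(
    "¿Qué productos aparecen en el pedido?",
    "Se piden 3 kg de arroz y 2 litros de aceite.",
)
correctness = metrics.correctnessMetric(
    "¿Qué productos aparecen en el pedido?",
    '{"compra": ["arroz", "aceite"]}',
    '{"compra": ["arroz 3kg", "aceite 2l"]}',
)
print(relevance["score"], correctness["score"])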