# EvalDataSetHugging/metrics.py
import json
import os

from pydantic import BaseModel
from anthropic import Anthropic
import instructor
from deepdiff import DeepDiff  # used by the (currently disabled) key-diff code in jsonMetrics
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics import AnswerRelevancyMetric, BiasMetric, ToxicityMetric, GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

def extractConfig(nameModel="SystemData", relPath="conf/experiment_config.json", dataOut="keyantrophics"):
    """Read a single value from the experiment config JSON.

    Looks up config[nameModel][dataOut] in relPath, resolved against the
    current working directory.
    """
    configPath = os.path.join(os.getcwd(), relPath)
    with open(configPath, "r", encoding="utf-8") as file:
        config = json.load(file)[nameModel]
    return config[dataOut]
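
# The config file is expected to look roughly like the sketch below; the
# placeholder key value is illustrative only:
#
#   {
#       "SystemData": {
#           "keyantrophics": "sk-ant-..."
#       }
#   }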

# Note: "keyantrophics" is the literal key name used in the config file.
keyanthropic = extractConfig(nameModel="SystemData", dataOut="keyantrophics")

class CustomClaudeOpus(DeepEvalBaseLLM):
    """DeepEval model wrapper around Anthropic's Claude 3.5 Sonnet, using
    instructor to coerce responses into a Pydantic schema."""

    def __init__(self):
        self.model = Anthropic(api_key=keyanthropic)

    def load_model(self):
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        client = self.load_model()
        instructor_client = instructor.from_anthropic(client)
        resp = instructor_client.messages.create(
            model="claude-3-5-sonnet-20240620",
            max_tokens=1024,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            response_model=schema,
        )
        return resp

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        # DeepEval calls this in async contexts; delegate to the sync path.
        return self.generate(prompt, schema)

    def get_model_name(self):
        return "Claude 3.5 Sonnet"

customModel = CustomClaudeOpus()
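
# A minimal sketch of calling the wrapper directly (outside DeepEval).
# `Verdict` is a hypothetical schema introduced here for illustration:
#
#   class Verdict(BaseModel):
#       label: str
#       confidence: float
#
#   verdict = customModel.generate("Is this review positive? ...", schema=Verdict)
#   print(verdict.label, verdict.confidence)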

def BiasMetric22(input, actual_output):
    """Score `actual_output` for bias; returns {"score", "reason"}."""
    metric = BiasMetric(threshold=0.5, model=customModel)
    test_case = LLMTestCase(
        input=input,
        actual_output=actual_output,
    )
    metric.measure(test_case)
    return {"score": metric.score, "reason": metric.reason}

def RelevanceMetric(input, actual_output):
    """Score how relevant `actual_output` is to `input`; returns {"score", "reason"}."""
    metric = AnswerRelevancyMetric(
        threshold=0.7,
        model=customModel,
        include_reason=True,
    )
    test_case = LLMTestCase(
        input=input,
        actual_output=actual_output,
    )
    metric.measure(test_case)
    return {"score": metric.score, "reason": metric.reason}

def ToxicMetric(input, actual_output):
    """Score `actual_output` for toxicity; returns {"score", "reason"}."""
    metric = ToxicityMetric(threshold=0.5, model=customModel)
    test_case = LLMTestCase(
        input=input,
        actual_output=actual_output,
    )
    metric.measure(test_case)
    return {"score": metric.score, "reason": metric.reason}

def correctnessMetric(
    input,
    actual_output,
    expected_output,
    criteria="Determine whether the output is a JSON object whose keys contain 'compra' and whose data corresponds to the input",
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradict any facts in 'expected output'",
        "You should also heavily penalize omission of detail",
        "Vague language, or contradicting OPINIONS, are OK",
    ],
):
    """GEval-based correctness score; returns {"score", "reason"}."""
    correctness_metric = GEval(
        name="Correctness",
        model=customModel,
        criteria=criteria,
        # NOTE: GEval accepts either criteria or evaluation_steps, not both;
        # evaluation_steps is kept in the signature but currently unused.
        # evaluation_steps=evaluation_steps,
        evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    )
    test_case = LLMTestCase(
        input=input,
        actual_output=actual_output,
        expected_output=expected_output,
    )
    correctness_metric.measure(test_case)
    return {"score": correctness_metric.score, "reason": correctness_metric.reason}

def jsonMetrics(text, Trusted):
    """Check whether `text` parses as JSON; returns {"jsonOk": 0 or 1}."""
    try:
        A = json.loads(text)
        jsonOk = 1
    except (json.JSONDecodeError, TypeError):
        jsonOk = 0
    if jsonOk == 1:
        # Trusted may arrive either as a JSON string or as an already-parsed dict.
        try:
            Trus = json.loads(Trusted)
        except (json.JSONDecodeError, TypeError):
            Trus = Trusted
        # Key-overlap metrics via DeepDiff, currently disabled:
        # ddiff = DeepDiff(A, Trus)
        # affectedkeys = len(ddiff.affected_root_keys) / len(A.keys())
        # keys = set(Trus.keys())
        # jsonkeys = set(A.keys())
        # TotKey = len(keys.intersection(jsonkeys)) / len(keys)
        # keyplus = jsonkeys.intersection(keys)
    return {"jsonOk": jsonOk}  # could also return TotKey/keyplus/affectedkeys once re-enabled