from pydantic import BaseModel
from anthropic import Anthropic
import instructor
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics import AnswerRelevancyMetric, BiasMetric, ToxicityMetric, GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepdiff import DeepDiff
import json
import os

pwd = os.getcwd()


def extractConfig(nameModel="SystemData", relPath=os.path.join(pwd, "conf/experiment_config.json"), dataOut="keyantrophics"):
    """Read a single value from the experiment config file."""
    configPath = os.path.join(os.getcwd(), relPath)
    with open(configPath, "r", encoding="utf-8") as file:
        config = json.load(file)[nameModel]
    return config[dataOut]


keyanthropic = extractConfig(nameModel="SystemData", dataOut="keyantrophics")


class CustomClaudeOpus(DeepEvalBaseLLM):
    """Custom DeepEval model wrapper around the Anthropic API, using instructor for structured output."""

    def __init__(self):
        self.model = Anthropic(api_key=keyanthropic)

    def load_model(self):
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        client = self.load_model()
        instructor_client = instructor.from_anthropic(client)
        resp = instructor_client.messages.create(
            model="claude-3-5-sonnet-20240620",
            max_tokens=1024,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            response_model=schema,
        )
        return resp

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        return self.generate(prompt, schema)

    def get_model_name(self):
        return "Claude-3.5 sonnet"


customModel = CustomClaudeOpus()


def BiasMetric22(input, actual_output):
    """Score the output for bias using DeepEval's BiasMetric."""
    metric = BiasMetric(threshold=0.5, model=customModel)
    test_case = LLMTestCase(input=input, actual_output=actual_output)
    metric.measure(test_case)
    return {"score": metric.score, "reason": metric.reason}


def RelevanceMetric(input, actual_output):
    """Score how relevant the actual output is to the input question."""
    metric = AnswerRelevancyMetric(
        threshold=0.7,
        model=customModel,
        include_reason=True,
    )
    test_case = LLMTestCase(input=input, actual_output=actual_output)
    metric.measure(test_case)
    return {"score": metric.score, "reason": metric.reason}


def ToxicMetric(input, actual_output):
    """Score the output for toxicity using DeepEval's ToxicityMetric."""
    metric = ToxicityMetric(threshold=0.5, model=customModel)
    test_case = LLMTestCase(input=input, actual_output=actual_output)
    metric.measure(test_case)
    return {"score": metric.score, "reason": metric.reason}


def correctnessMetric(
    input,
    actual_output,
    expected_output,
    criteria="Determine that the output is a json whose keys contain compra and the data correspond to the input",
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradict any facts in 'expected output'",
        "You should also heavily penalize omission of detail",
        "Vague language, or contradicting OPINIONS, are OK",
    ],
):
    """Score correctness with GEval against the given criteria."""
    correctness_metric = GEval(
        name="Correctness",
        model=customModel,
        criteria=criteria,
        # NOTE: you can only provide either criteria or evaluation_steps, not both
        # evaluation_steps=evaluation_steps,
        evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    )
    test_case = LLMTestCase(
        input=input,
        actual_output=actual_output,
        expected_output=expected_output,
    )
    correctness_metric.measure(test_case)
    return {"score": correctness_metric.score, "reason": correctness_metric.reason}


def jsonMetrics(text, Trusted):
    """Check whether `text` parses as JSON; key-overlap metrics against `Trusted` are currently disabled."""
    try:
        A = json.loads(text)
        jsonOk = 1
    except (json.JSONDecodeError, TypeError):
        jsonOk = 0
    if jsonOk == 1:
        try:
            Trus = json.loads(Trusted)
        except (json.JSONDecodeError, TypeError):
            Trus = Trusted
        # ddiff = DeepDiff(A, Trus)
        # affectedkeys = ddiff.affected_root_keys / len(A.keys())
        # keys = set(json.loads(Trusted).keys())
        # jsonkeys = set(A.keys())
        # TotKey = len(keys.intersection(jsonkeys)) / len(keys)
        # keyplus = jsonkeys.intersection(keys)
    # else:
    #     TotKey = 0
    #     keyplus = 0
    #     affectedkeys = 0
    return {"jsonOk": jsonOk}  # , "TotKey": TotKey, "keyplus": keyplus, "affectedkeys": affectedkeys}
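

# --- Illustrative usage sketch ---
# The sample strings below are hypothetical placeholders, not part of the original module.
# The commented-out metric calls require a valid Anthropic key in conf/experiment_config.json
# and make network requests; jsonMetrics runs fully offline.
if __name__ == "__main__":
    sample_input = "List the items in the purchase order"
    sample_output = '{"compra": ["item1", "item2"]}'

    # Offline check: does the model output parse as JSON?
    print(jsonMetrics(sample_output, sample_output))

    # Claude-backed metrics (uncomment to run against the API):
    # print(RelevanceMetric(sample_input, sample_output))
    # print(BiasMetric22(sample_input, sample_output))
    # print(ToxicMetric(sample_input, sample_output))
    # print(correctnessMetric(sample_input, sample_output, expected_output=sample_output))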