diff --git a/main.py b/main.py index 59aedab..d58a398 100644 --- a/main.py +++ b/main.py @@ -12,6 +12,8 @@ def read_main(): return {"message": "Ok"} class Response(BaseModel): path: str + ratehertz:int + encoding:str model = whisper.load_model("medium") @@ -26,8 +28,10 @@ def calculate_api(response: Response): @app.post("/voice2txtGoogle/") def calculate_api_g(response: Response): path = response.path + ratehertz = response.ratehertz + encoding = response.encoding t=time.time() - result = transcribe_ogg_audio(path) + result = transcribe_audio(audio_file_path=path,ratehertz=ratehertz,encoding=encoding) return {"message": result,"time":time.time()-t} @@ -37,16 +41,20 @@ import os os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "token.json" -def transcribe_ogg_audio(audio_file_path, language_code='es-US'): +def transcribe_audio(audio_file_path, language_code='es-US',ratehertz=48000,encoding="OGG"): client = speech_v1.SpeechClient() - + if encoding=="OGG": + confEncoding=types.RecognitionConfig.AudioEncoding.OGG_OPUS + elif encoding=="WEBM": + confEncoding=types.RecognitionConfig.AudioEncoding.WEBM_OPUS + else: + confEncoding=types.RecognitionConfig.AudioEncoding.OGG_OPUS with open(audio_file_path, 'rb') as audio_file: content = audio_file.read() - audio = speech_v1.RecognitionAudio(content=content) config = speech_v1.RecognitionConfig( - encoding=types.RecognitionConfig.AudioEncoding.OGG_OPUS,#.FLAC, # Use OGG encoding - sample_rate_hertz=48000, # Update this to match your audio file + encoding=confEncoding,#.FLAC, # Use OGG encoding + sample_rate_hertz=ratehertz, language_code=language_code, ) diff --git a/requirements.txt b/requirements.txt index 99b91ca..51757a4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -109,13 +109,7 @@ tabulate==0.9.0 tensorboardX==2.6.2.2 threadpoolctl==3.2.0 tiktoken==0.3.3 -tokenizers==0.14.1 -torch==2.1.0+cpu -torch-audiomentations==0.11.0 -torch-pitch-shift==1.2.4 -torchaudio==2.1.0+cpu -torchmetrics==1.2.0 
-torchvision==0.16.0+cpu +tokenizers==0.14.1 tqdm==4.66.1 transformers==4.34.1 triton==2.0.0