V2

2023-11-27 17:06:34 +01:00 · 2023-11-27 17:06:34 +01:00 · 4514592453
parent b94be5a668
commit 4514592453
2 changed files with 15 additions and 13 deletions
--- a/main.py
+++ b/main.py
@@ -12,6 +12,8 @@ def read_main():
    return {"message": "Ok"}
 class Response(BaseModel):
    path: str
+    ratehertz:int
+    encoding:str


 model = whisper.load_model("medium")
@@ -26,8 +28,10 @@ def calculate_api(response: Response):
@app.post("/voice2txtGoogle/")
 def calculate_api_g(response: Response):
    path = response.path
+    ratehertz = response.ratehertz
+    encoding = response.encoding
    t=time.time()
-    result = transcribe_ogg_audio(path)
+    result = transcribe_audio(audio_file_path=path,ratehertz=ratehertz,encoding=encoding)
    return {"message": result,"time":time.time()-t}


@@ -37,16 +41,20 @@ import os

 os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "token.json"

-def transcribe_ogg_audio(audio_file_path, language_code='es-US'):
+def transcribe_audio(audio_file_path, language_code='es-US',ratehertz=48000,encoding="OGG"):
    client = speech_v1.SpeechClient()
-   
+    if encoding=="OGG":
+        confEncoding=types.RecognitionConfig.AudioEncoding.OGG_OPUS
+    elif encoding=="WEBM":
+        confEncoding=types.RecognitionConfig.AudioEncoding.WEBM_OPUS
+    else:
+        confEncoding=types.RecognitionConfig.AudioEncoding.OGG_OPUS
    with open(audio_file_path, 'rb') as audio_file:
        content = audio_file.read()
-   
    audio = speech_v1.RecognitionAudio(content=content)
    config = speech_v1.RecognitionConfig(
-        encoding=types.RecognitionConfig.AudioEncoding.OGG_OPUS,#.FLAC,  # Use OGG encoding
-        sample_rate_hertz=48000,  # Update this to match your audio file
+        encoding=confEncoding,#.FLAC,  # Use OGG encoding
+        sample_rate_hertz=ratehertz,
        language_code=language_code,
    )
   
--- a/requirements.txt
+++ b/requirements.txt
@@ -109,13 +109,7 @@ tabulate==0.9.0
 tensorboardX==2.6.2.2
 threadpoolctl==3.2.0
 tiktoken==0.3.3
-tokenizers==0.14.1
-torch==2.1.0+cpu
-torch-audiomentations==0.11.0
-torch-pitch-shift==1.2.4
-torchaudio==2.1.0+cpu
-torchmetrics==1.2.0
-torchvision==0.16.0+cpu
+tokenizer
 tqdm==4.66.1
 transformers==4.34.1
 triton==2.0.0