-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgladia_batch_transcriber_slow.py
More file actions
154 lines (123 loc) · 4.5 KB
/
gladia_batch_transcriber_slow.py
File metadata and controls
154 lines (123 loc) · 4.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"""
Gladia Batch Transcriber (https://github.com/robomustib/gladia_batch_transcriber)
Copyright (c) 2025 Mustafa Bilgin
Licensed under Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
"""
import os
import time
import requests
from dotenv import load_dotenv
# ===========================
# CONFIGURATION
# ===========================
# load_dotenv() (python-dotenv) runs first so that settings kept in a local
# .env file can be read through os.getenv() below.
load_dotenv()

# API key sent with every Gladia request; there is no default, it must be set.
API_KEY = os.getenv("GLADIA_API_KEY")
# Folder names for the audio input and the transcript output; both have
# defaults and can be overridden through the environment.
INPUT_FOLDER = os.getenv("INPUT_FOLDER", "audio_files")
OUTPUT_FOLDER = os.getenv("OUTPUT_FOLDER", "transkripte_output")

if not API_KEY:
    print("ERROR: Please check .env file for API Key.")
    # Stop with a non-zero status so shells and schedulers can see the run
    # failed. The bare exit() helper is injected by the site module for
    # interactive use and would report success (status 0) here.
    raise SystemExit(1)
# ===========================
# PREPARATION
# ===========================
# Folder settings are joined onto the current working directory.
base_path = os.getcwd()
input_path = os.path.join(base_path, INPUT_FOLDER)
output_path = os.path.join(base_path, OUTPUT_FOLDER)

# Fallback: look in the current folder if the input folder is missing.
# isdir() (rather than exists()) also covers the case where the name refers
# to a regular file, which os.listdir() below could not read.
if not os.path.isdir(input_path):
    input_path = base_path

# exist_ok=True avoids the check-then-create race and fails right here, with a
# clear FileExistsError, if the output path is occupied by a regular file.
os.makedirs(output_path, exist_ok=True)

# Every entry ending in ".mp3" (any letter case), in sorted name order.
files = sorted(f for f in os.listdir(input_path) if f.lower().endswith(".mp3"))
# ===========================
# START PROCESSING
# ===========================
# For each audio file: upload it, start a transcription job, poll the job's
# result URL until it reports "done" or "error", and write the transcript to
# a .txt file in the output folder. Files that already have a transcript are
# skipped, so an interrupted run can simply be restarted.

# Upper bound on consecutive failed polling requests for one file. Without a
# bound, a permanent failure would keep the script sleeping and retrying
# forever without any visible output.
MAX_POLL_FAILURES = 20

headers = {"x-gladia-key": API_KEY}

print(f"--> Found {len(files)} files.")
print(f"--> Saving to: {OUTPUT_FOLDER}\n")

skipped_count = 0

for filename in files:
    mp3_path = os.path.join(input_path, filename)
    # splitext() swaps only the final extension and works for any letter
    # case. The file list is filtered case-insensitively, so a plain
    # str.replace(".mp3", ".txt") would leave "FOO.MP3" unchanged and the
    # transcript would be written under the audio file's own name.
    txt_name = os.path.splitext(filename)[0] + ".txt"
    txt_path = os.path.join(output_path, txt_name)

    # RESUME FUNCTION: skip if the transcript already exists
    if os.path.exists(txt_path):
        skipped_count += 1
        continue

    # Report the run of skipped files once, right before real work resumes.
    if skipped_count > 0:
        print(f"--> Skipped {skipped_count} files (already done). Resuming...\n")
        skipped_count = 0  # Reset counter

    print(f"--- Processing: {filename} ---")

    # 1. UPLOAD
    print(" Uploading...", end=" ", flush=True)
    try:
        with open(mp3_path, 'rb') as f:
            payload = {'audio': (filename, f, 'audio/mpeg')}
            # Timeout prevents the script from freezing on a dead connection
            response = requests.post(
                'https://api.gladia.io/v2/upload/',
                headers=headers,
                files=payload,
                timeout=60
            )

        if response.status_code == 429:
            print("\n\n STOP: Hourly Limit Reached (429)!")
            print(" The script will stop now. Please wait 1 hour.")
            print(" Restart the script later to continue exactly here.")
            break

        if response.status_code != 200:
            print(f"\n UPLOAD ERROR: {response.text}")
            continue

        audio_url = response.json().get("audio_url")
        if not audio_url:
            # Nothing to transcribe without the uploaded file's URL.
            print(f"\n UPLOAD ERROR: {response.text}")
            continue
        print("OK.")
    except Exception as e:
        # Best effort: report, pause briefly, and move on to the next file.
        print(f"\n Network error during upload: {e}")
        time.sleep(5)
        continue

    # 2. START TRANSCRIPTION
    print(" Starting...", end=" ", flush=True)
    try:
        response = requests.post(
            'https://api.gladia.io/v2/pre-recorded/',
            headers=headers,
            json={"audio_url": audio_url},
            timeout=30
        )

        if response.status_code == 429:
            print("\n\n STOP: Hourly Limit Reached!")
            break

        if response.status_code != 201:
            print(f"\n START ERROR: {response.text}")
            continue

        result_url = response.json().get("result_url")
        if not result_url:
            # Polling a missing URL could never succeed.
            print(f"\n START ERROR: {response.text}")
            continue
        print("Running.", end=" ", flush=True)
    except Exception as e:
        print(f"\n Network error during start: {e}")
        continue

    # 3. POLLING (Waiting)
    poll_failures = 0
    try:
        while True:
            try:
                poll_response = requests.get(result_url, headers=headers, timeout=30)
                # Turn 4xx/5xx answers into exceptions so they count as
                # failed attempts instead of being read as "still running".
                poll_response.raise_for_status()
                poll = poll_response.json()
                status = poll.get("status")
            except Exception as e:
                # On a network hiccup wait a bit and try again, but give up
                # on this file once the failures pile up.
                poll_failures += 1
                if poll_failures >= MAX_POLL_FAILURES:
                    print(f"\n POLLING ERROR: giving up after {poll_failures} failed attempts: {e}")
                    break
                time.sleep(3)
                continue

            poll_failures = 0

            if status == "done":
                # A malformed payload or an unwritable file will not fix
                # itself on retry, so report it once and move on.
                try:
                    text = poll["result"]["transcription"]["full_transcript"]
                    with open(txt_path, "w", encoding="utf-8") as tf:
                        tf.write(text)
                except (KeyError, TypeError, OSError) as e:
                    print(f"\n SAVE ERROR: {e!r}")
                else:
                    print("\n DONE! Saved.")
                break

            if status == "error":
                print(f"\n GLADIA ERROR: {poll}")
                break

            # Job still in progress: show a progress dot and wait.
            print(".", end="", flush=True)
            time.sleep(3)
    except KeyboardInterrupt:
        print("\n\n Script interrupted by user. Exiting safely.")
        # SystemExit instead of the interactive-only exit() helper.
        raise SystemExit(0)

    # Polite pause
    time.sleep(1)

# Files skipped at the very end of the list (or a run in which everything was
# already done) would otherwise never be reported.
if skipped_count > 0:
    print(f"--> Skipped {skipped_count} files (already done).")

print("\n--- Script finished or stopped ---")