Modificar main.py

Usar Whisper
2024-11-26 07:31:50 -08:00
parent 76669c0ed3
commit d23aadcd0e
1 changed file with 301 additions and 482 deletions
--- a/main.py
+++ b/main.py
@@ -1,483 +1,302 @@
-import tkinter as tk
-from tkinter import ttk, filedialog, scrolledtext
-from tkinter import messagebox
-import torch
-from transformers import AutoProcessor, WhisperForConditionalGeneration
-import cv2
-from datetime import timedelta
-import os
-import threading
-import subprocess
-import time
-import re
-import numpy as np
-
-class VideoSubtitleApp:
-    def __init__(self, root):
-        self.root = root
-        self.root.title("Extrator de Legendas")
-        self.root.geometry("900x700")
-        
-        # Variáveis
-        self.video_path = tk.StringVar()
-        self.video_info = tk.StringVar()
-        self.selected_language = tk.StringVar(value='pt-BR')
-        self.subtitles_list = []
-        
-        # Inicializar modelo Whisper e processador
-        self.initialize_whisper()
-        
-        # Dicionário de línguas disponíveis
-        self.languages = {
-            'Português (Brasil)': 'pt',
-            'Português (Portugal)': 'pt',
-            'English': 'en',
-            'Español': 'es',
-            'Français': 'fr',
-            'Deutsch': 'de',
-            'Italiano': 'it'
-        }
-        
-        # Criar interface
-        self.create_widgets()
-        
-        # Variável para armazenar o vídeo
-        self.video = None
-        
-    def create_widgets(self):
-        # Frame principal
-        main_frame = ttk.Frame(self.root, padding="10")
-        main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
-        
-        # Configurar expansão da grade
-        self.root.grid_rowconfigure(0, weight=1)
-        self.root.grid_columnconfigure(0, weight=1)
-        main_frame.grid_columnconfigure(1, weight=1)
-        
-        # Frame para seleção de arquivo e idioma
-        file_frame = ttk.Frame(main_frame)
-        file_frame.grid(row=0, column=0, columnspan=2, sticky=(tk.W, tk.E), pady=5)
-        
-        # Botão para selecionar arquivo
-        ttk.Button(file_frame, text="Selecionar Vídeo", command=self.select_file).pack(side=tk.LEFT, padx=5)
-        
-        # Seleção de idioma
-        ttk.Label(file_frame, text="Idioma:").pack(side=tk.LEFT, padx=5)
-        language_combo = ttk.Combobox(file_frame, 
-                                    values=list(self.languages.keys()),
-                                    textvariable=self.selected_language,
-                                    state='readonly',
-                                    width=20)
-        language_combo.pack(side=tk.LEFT, padx=5)
-        language_combo.set('Português (Brasil)')
-        
-        # Label para mostrar caminho do arquivo
-        ttk.Label(main_frame, textvariable=self.video_path, wraplength=500).grid(row=1, column=0, columnspan=2, pady=5)
-        
-        # Frame para informações do vídeo
-        info_frame = ttk.LabelFrame(main_frame, text="Informações do Vídeo", padding="5")
-        info_frame.grid(row=2, column=0, columnspan=2, sticky=(tk.W, tk.E), pady=5)
-        
-        ttk.Label(info_frame, textvariable=self.video_info).grid(row=0, column=0, sticky=tk.W)
-        
-        # Frame para botões de ação
-        button_frame = ttk.Frame(main_frame)
-        button_frame.grid(row=3, column=0, columnspan=2, pady=5)
-        
-        ttk.Button(button_frame, text="Gerar Legendas", command=self.generate_subtitles).pack(side=tk.LEFT, padx=5)
-        ttk.Button(button_frame, text="Salvar Alterações", command=self.save_subtitles).pack(side=tk.LEFT, padx=5)
-        
-        # Progress bar
-        self.progress = ttk.Progressbar(main_frame, mode='indeterminate')
-        self.progress.grid(row=4, column=0, columnspan=2, sticky=(tk.W, tk.E), pady=5)
-        
-        # Frame para edição de legendas
-        subtitle_frame = ttk.LabelFrame(main_frame, text="Editor de Legendas", padding="5")
-        subtitle_frame.grid(row=5, column=0, columnspan=2, sticky=(tk.W, tk.E, tk.N, tk.S), pady=5)
-        subtitle_frame.grid_rowconfigure(0, weight=1)
-        subtitle_frame.grid_columnconfigure(0, weight=1)
-        
-        # Área de texto editável para legendas
-        self.subtitle_text = scrolledtext.ScrolledText(subtitle_frame, height=20, width=80, wrap=tk.WORD)
-        self.subtitle_text.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S), padx=5, pady=5)
-        
-        # Instruções de uso
-        instructions = """Instruções:
-        1. Selecione o idioma do áudio do vídeo
-        2. Clique em 'Selecionar Vídeo' e escolha o arquivo
-        3. Aguarde o processamento do modelo Whisper
-        4. Edite as legendas se necessário
-        5. Clique em 'Salvar Alterações' para gerar o arquivo .srt"""
-        
-        ttk.Label(main_frame, text=instructions, justify=tk.LEFT, wraplength=600).grid(
-            row=6, column=0, columnspan=2, pady=5, sticky=tk.W)
-
-    def select_file(self):
-        filetypes = (
-            ('Arquivos de vídeo', '*.mp4 *.avi *.mkv'),
-            ('Todos os arquivos', '*.*')
-        )
-        
-        filename = filedialog.askopenfilename(
-            title='Selecione um vídeo',
-            filetypes=filetypes
-        )
-        
-        if filename:
-            self.video_path.set(filename)
-            self.load_video_info(filename)
-
-    def load_video_info(self, filename):
-        try:
-            self.video = cv2.VideoCapture(filename)
-            
-            # Obter informações do vídeo
-            fps = self.video.get(cv2.CAP_PROP_FPS)
-            frame_count = int(self.video.get(cv2.CAP_PROP_FRAME_COUNT))
-            duration = frame_count / fps
-            width = int(self.video.get(cv2.CAP_PROP_FRAME_WIDTH))
-            height = int(self.video.get(cv2.CAP_PROP_FRAME_HEIGHT))
-            
-            info = f"""
-            Duração: {str(timedelta(seconds=int(duration)))}
-            Resolução: {width}x{height}
-            FPS: {fps:.2f}
-            Formato: {os.path.splitext(filename)[1]}
-            """
-            self.video_info.set(info)
-            
-        except Exception as e:
-            messagebox.showerror("Erro", f"Erro ao carregar o vídeo: {str(e)}")
-
-    def generate_subtitles(self):
-        if not self.video_path.get():
-            messagebox.showwarning("Aviso", "Por favor, selecione um vídeo primeiro.")
-            return
-        
-        # Iniciar processamento em thread separada
-        self.progress.start()
-        thread = threading.Thread(target=self.process_video)
-        thread.start()
-    
-    def initialize_whisper(self):
-        """Inicializa o modelo Whisper e o processador com configurações otimizadas"""
-        try:
-            # Usar o modelo maior para melhor qualidade
-            model_name = "openai/whisper-large-v3"
-            self.processor = AutoProcessor.from_pretrained(model_name)
-            self.model = WhisperForConditionalGeneration.from_pretrained(
-                model_name,
-                device_map="auto",  # Usar a melhor dispositivo disponível
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                low_cpu_mem_usage=True
-            )
-            
-            if torch.cuda.is_available():
-                print("Usando GPU para processamento")
-            else:
-                print("Usando CPU para processamento")
-                
-        except Exception as e:
-            messagebox.showerror("Erro", f"Erro ao carregar modelo Whisper: {str(e)}")
-
-    def extract_audio(self, video_path, audio_path):
-        """Extrai o áudio do vídeo com configurações otimizadas"""
-        try:
-            print(f"Extraindo áudio de {video_path}")
-            
-            # Primeiro comando - qualidade máxima
-            command = [
-                'ffmpeg',
-                '-i', video_path,
-                '-vn',  # Não processar vídeo
-                '-acodec', 'pcm_s16le',  # Codec PCM 16-bit
-                '-ac', '1',  # Mono
-                '-ar', '16000',  # Taxa de amostragem para Whisper
-                '-af', 'volume=2.0,highpass=f=200,lowpass=f=3000,areverse,silenceremove=start_periods=1:start_duration=1:start_threshold=-60dB,areverse',  # Filtros de áudio
-                '-y',  # Sobrescrever arquivo
-                audio_path
-            ]
-            
-            print("Tentando primeira extração de áudio...")
-            process = subprocess.run(
-                command,
-                capture_output=True,
-                text=True,
-                encoding='utf-8'
-            )
-            
-            if process.returncode != 0:
-                print("Primeira tentativa falhou, tentando método alternativo...")
-                # Comando alternativo - mais simples
-                alt_command = [
-                    'ffmpeg',
-                    '-i', video_path,
-                    '-vn',
-                    '-acodec', 'pcm_s16le',
-                    '-ac', '1',
-                    '-ar', '16000',
-                    '-y',
-                    audio_path
-                ]
-                process = subprocess.run(
-                    alt_command,
-                    capture_output=True,
-                    text=True,
-                    encoding='utf-8'
-                )
-            
-            if os.path.exists(audio_path) and os.path.getsize(audio_path) > 0:
-                print(f"Áudio extraído com sucesso: {os.path.getsize(audio_path)} bytes")
-                return True
-            else:
-                raise Exception("Arquivo de áudio não foi criado ou está vazio")
-                
-        except Exception as e:
-            print(f"Erro detalhado na extração de áudio: {str(e)}")
-            if process and process.stderr:
-                print(f"Erro FFmpeg: {process.stderr}")
-            return False
-
-    def process_audio_with_whisper(self, audio_path, language_code):
-        try:
-            import soundfile as sf
-            print(f"Processando áudio em {language_code}...")
-            
-            # Carregar áudio
-            audio, sample_rate = sf.read(audio_path)
-            print(f"Áudio carregado: {len(audio)} amostras, taxa de amostragem: {sample_rate}Hz")
-            
-            # Normalizar áudio
-            if audio.dtype == np.int16:
-                audio = audio.astype(np.float32) / 32768.0
-            elif audio.dtype == np.int32:
-                audio = audio.astype(np.float32) / 2147483648.0
-            
-            # Garantir que o áudio esteja entre -1 e 1
-            max_abs = np.max(np.abs(audio))
-            if max_abs > 1.0:
-                audio = audio / max_abs
-            
-            # Preparar input features com configurações explícitas
-            inputs = self.processor(
-                audio, 
-                sampling_rate=sample_rate,
-                return_tensors="pt",
-                padding=True,
-                do_normalize=True,
-                return_attention_mask=True
-            )
-            
-            print("Features de entrada processadas")
-            
-            # Mover para GPU se disponível
-            if torch.cuda.is_available():
-                inputs = {k: v.to("cuda") for k, v in inputs.items()}
-                print("Dados movidos para GPU")
-            
-            # Configurar parâmetros de geração corrigidos
-            generate_kwargs = {
-                "temperature": 0.0,  # Determinístico
-                "no_speech_threshold": 0.6,
-                "logprob_threshold": -1.0,
-                "compression_ratio_threshold": 2.4,
-                "condition_on_previous_text": True,
-                "max_initial_timestamp": 1.0,
-                "return_timestamps": True
-            }
-            
-            if language_code:
-                generate_kwargs["language"] = language_code
-            
-            print("Iniciando geração da transcrição...")
-            
-            # Gerar transcrição com timestamps
-            with torch.no_grad():
-                outputs = self.model.generate(
-                    inputs.input_features,
-                    **generate_kwargs
-                )
-            
-            print("Transcrição gerada, decodificando...")
-            
-            # Decodificar saída com timestamp_begin=True
-            transcription = self.processor.batch_decode(
-                outputs, 
-                skip_special_tokens=True,
-                output_offsets=True
-            )[0]
-            
-            print(f"Transcrição decodificada: {len(transcription.text)} caracteres")
-            
-            if not transcription.text.strip():
-                raise Exception("Transcrição vazia retornada pelo modelo")
-            
-            # Formatar segmentos com timestamps
-            segments = []
-            for i, segment in enumerate(transcription.offsets, start=1):
-                start_time = self.format_timestamp(segment['timestamp'][0])
-                end_time = self.format_timestamp(segment['timestamp'][1])
-                text = segment['text'].strip()
-                
-                if text:  # Só adicionar se houver texto
-                    segment_str = f"{i}\n{start_time} --> {end_time}\n{text}\n\n"
-                    segments.append(segment_str)
-            
-            print(f"Segmentos formatados: {len(segments)}")
-            return segments
-                
-        except Exception as e:
-            print(f"Erro detalhado no processamento do áudio: {str(e)}")
-            raise Exception(f"Erro no processamento do áudio: {str(e)}")
-    
-    def format_timestamp(self, seconds):
-        """Converte segundos em formato de timestamp SRT (HH:MM:SS,mmm)"""
-        hours = int(seconds // 3600)
-        minutes = int((seconds % 3600) // 60)
-        seconds = seconds % 60
-        milliseconds = int((seconds % 1) * 1000)
-        seconds = int(seconds)
-        
-        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
-
-
-    def format_whisper_output(self, transcription):
-        """Formata a saída do Whisper em formato SRT"""
-        segments = []
-        pattern = r"\[(\d+:\d+\.\d+) --> (\d+:\d+\.\d+)\](.*?)(?=\[|$)"
-        
-        matches = re.finditer(pattern, transcription, re.DOTALL)
-        
-        for idx, match in enumerate(matches, 1):
-            start_time = match.group(1)
-            end_time = match.group(2)
-            text = match.group(3).strip()
-            
-            # Converter para formato SRT
-            start_time = self.convert_timestamp_to_srt(start_time)
-            end_time = self.convert_timestamp_to_srt(end_time)
-            
-            segment = f"{idx}\n{start_time} --> {end_time}\n{text}\n\n"
-            segments.append(segment)
-        
-        return segments
-
-    def convert_timestamp_to_srt(self, timestamp):
-        """Converte timestamp do Whisper para formato SRT"""
-        # Converter MM:SS.ms para HH:MM:SS,mmm
-        minutes, seconds = timestamp.split(":")
-        seconds, milliseconds = seconds.split(".")
-        
-        hours = int(minutes) // 60
-        minutes = int(minutes) % 60
-        
-        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
-
-    def process_video(self):
-        try:
-            # Extrair áudio
-            audio_path = "temp_audio.wav"
-            print("Iniciando extração de áudio...")
-            
-            if not self.extract_audio(self.video_path.get(), audio_path):
-                raise Exception("Falha na extração do áudio")
-            
-            print("Áudio extraído com sucesso")
-            
-            # Obter código do idioma
-            selected_name = self.selected_language.get()
-            language_code = self.languages.get(selected_name, 'en')
-            print(f"Idioma selecionado: {selected_name} ({language_code})")
-            
-            # Processar áudio com Whisper
-            print("Iniciando reconhecimento de fala...")
-            self.subtitles_list = self.process_audio_with_whisper(audio_path, language_code)
-            
-            if not self.subtitles_list:
-                raise Exception("Nenhum texto foi reconhecido")
-            
-            print(f"Texto reconhecido com sucesso: {len(self.subtitles_list)} segmentos")
-            
-            # Mostrar legendas na interface
-            self.root.after(0, self.update_subtitle_text, ''.join(self.subtitles_list))
-            
-        except Exception as e:
-            print(f"Erro no processamento: {str(e)}")
-            self.root.after(0, messagebox.showerror, "Erro", f"Erro ao gerar legendas: {str(e)}")
-        
-        finally:
-            # Limpar
-            self.root.after(0, self.progress.stop)
-            if self.video is not None:
-                self.video.release()
-            try:
-                if os.path.exists(audio_path):
-                    print(f"Removendo arquivo temporário: {audio_path}")
-                    os.remove(audio_path)
-            except Exception as e:
-                print(f"Erro ao remover arquivo temporário: {str(e)}")
-            
-    def update_subtitle_text(self, text):
-        self.subtitle_text.delete(1.0, tk.END)
-        self.subtitle_text.insert(tk.END, text)
-    
-    def save_subtitles(self):
-        try:
-            # Pegar texto atual
-            current_text = self.subtitle_text.get(1.0, tk.END).strip()
-            
-            # Validar formato básico das legendas
-            if not self.validate_subtitle_format(current_text):
-                raise ValueError("Formato de legendas inválido. Mantenha o formato: número + tempo + texto")
-            
-            # Salvar em arquivo
-            output_path = os.path.splitext(self.video_path.get())[0] + ".srt"
-            with open(output_path, 'w', encoding='utf-8') as f:
-                f.write(current_text)
-            
-            messagebox.showinfo("Sucesso", f"Legendas salvas com sucesso em:\n{output_path}")
-            
-        except Exception as e:
-            messagebox.showerror("Erro", f"Erro ao salvar legendas: {str(e)}")
-
-    def validate_subtitle_format(self, text):
-        """Validação melhorada do formato das legendas"""
-        if not text.strip():
-            return False
-            
-        lines = text.split('\n')
-        i = 0
-        
-        while i < len(lines):
-            if not lines[i].strip():
-                i += 1
-                continue
-            
-            # Validar número da legenda
-            if not lines[i].strip().isdigit():
-                return False
-            
-            # Validar formato do tempo
-            i += 1
-            if i >= len(lines):
-                return False
-            
-            time_line = lines[i].strip()
-            if not (' --> ' in time_line and 
-                   time_line.count(':') == 4 and 
-                   len(time_line.split(' --> ')) == 2):
-                return False
-            
-            # Validar texto da legenda
-            i += 1
-            if i >= len(lines) or not lines[i].strip():
-                return False
-            
-            i += 1
-            
-        return True
-
-if __name__ == "__main__":
-    root = tk.Tk()
-    app = VideoSubtitleApp(root)
+import tkinter as tk
+from tkinter import ttk, scrolledtext, filedialog, messagebox
+import threading
+import whisper
+import cv2
+from datetime import timedelta
+import os
+import subprocess
+import numpy as np
+
+class VideoSubtitleApp:
+    def __init__(self, root):
+        self.root = root
+        self.root.title("Extrator de Legendas")
+        self.root.geometry("900x700")
+        
+        # Variáveis
+        self.video_path = tk.StringVar()
+        self.video_info = tk.StringVar()
+        self.selected_language = tk.StringVar(value='English')
+        self.status_var = tk.StringVar(value="Pronto")
+        self.subtitles_list = []
+        
+        # Dicionário de línguas
+        self.languages = {
+            'Português (Brasil)': 'pt',
+            'Português (Portugal)': 'pt',
+            'English': 'en',
+            'Español': 'es',
+            'Français': 'fr',
+            'Deutsch': 'de',
+            'Italiano': 'it'
+        }
+        
+        # Criar interface
+        self.create_widgets()
+        
+        # Inicializar modelo em thread separada
+        self.model_ready = False
+        threading.Thread(target=self.initialize_whisper, daemon=True).start()
+
+    def initialize_whisper(self):
+        """Inicializa o modelo Whisper"""
+        try:
+            self.status_var.set("Carregando modelo...")
+            # Usar modelo base para melhor equilíbrio
+            self.model = whisper.load_model("base")
+            self.model_ready = True
+            self.status_var.set("Modelo carregado com sucesso")
+            self.generate_button.config(state='normal')
+        except Exception as e:
+            self.status_var.set("Erro ao carregar modelo")
+            messagebox.showerror("Erro", f"Erro ao carregar modelo Whisper:\n{str(e)}")
+
+    def create_widgets(self):
+        """Cria a interface gráfica"""
+        # Frame principal
+        main_frame = ttk.Frame(self.root, padding=(10, 10, 10, 10))
+        main_frame.grid(row=0, column=0, sticky="nsew")
+        
+        # Configurar grid
+        self.root.grid_rowconfigure(0, weight=1)
+        self.root.grid_columnconfigure(0, weight=1)
+        main_frame.grid_columnconfigure(0, weight=1)
+        
+        # Frame superior
+        top_frame = ttk.Frame(main_frame)
+        top_frame.grid(row=0, column=0, sticky="ew", pady=(0, 10))
+        
+        # Botões e controles
+        ttk.Button(top_frame, text="📂 Selecionar Vídeo", 
+                  command=self.select_file).pack(side=tk.LEFT, padx=5)
+        
+        ttk.Label(top_frame, text="🌐 Idioma:").pack(side=tk.LEFT, padx=5)
+        
+        language_combo = ttk.Combobox(top_frame, 
+                                    values=list(self.languages.keys()),
+                                    textvariable=self.selected_language,
+                                    state='readonly',
+                                    width=20)
+        language_combo.pack(side=tk.LEFT, padx=5)
+        
+        # Caminho do arquivo
+        path_frame = ttk.LabelFrame(main_frame, text="Arquivo Selecionado")
+        path_frame.grid(row=1, column=0, sticky="ew", pady=(0, 10))
+        ttk.Label(path_frame, textvariable=self.video_path, 
+                 wraplength=800).grid(row=0, column=0, padx=5, pady=5)
+        
+        # Informações do vídeo
+        info_frame = ttk.LabelFrame(main_frame, text="Informações do Vídeo")
+        info_frame.grid(row=2, column=0, sticky="ew", pady=(0, 10))
+        ttk.Label(info_frame, textvariable=self.video_info).grid(row=0, column=0, padx=5, pady=5)
+        
+        # Botões de ação
+        button_frame = ttk.Frame(main_frame)
+        button_frame.grid(row=3, column=0, pady=(0, 10))
+        
+        self.generate_button = ttk.Button(button_frame, text="🎬 Gerar Legendas",
+                                        command=self.generate_subtitles,
+                                        state='disabled')
+        self.generate_button.pack(side=tk.LEFT, padx=5)
+        
+        self.save_button = ttk.Button(button_frame, text="💾 Salvar",
+                                    command=self.save_subtitles,
+                                    state='disabled')
+        self.save_button.pack(side=tk.LEFT, padx=5)
+        
+        # Barra de progresso
+        self.progress = ttk.Progressbar(main_frame, mode='indeterminate')
+        self.progress.grid(row=4, column=0, sticky="ew", pady=(0, 10))
+        
+        # Editor de legendas
+        editor_frame = ttk.LabelFrame(main_frame, text="Editor de Legendas")
+        editor_frame.grid(row=5, column=0, sticky="nsew", pady=(0, 10))
+        editor_frame.grid_columnconfigure(0, weight=1)
+        editor_frame.grid_rowconfigure(0, weight=1)
+        
+        self.subtitle_text = scrolledtext.ScrolledText(
+            editor_frame,
+            wrap=tk.WORD,
+            font=('Consolas', 10)
+        )
+        self.subtitle_text.grid(row=0, column=0, sticky="nsew", padx=5, pady=5)
+        
+        # Barra de status
+        status_frame = ttk.Frame(main_frame)
+        status_frame.grid(row=6, column=0, sticky="ew")
+        
+        ttk.Label(status_frame, textvariable=self.status_var, relief=tk.SUNKEN).grid(
+            row=0, column=0, sticky="ew")
+        status_frame.grid_columnconfigure(0, weight=1)
+
+    def select_file(self):
+        """Seleciona arquivo de vídeo"""
+        filename = filedialog.askopenfilename(
+            title="Selecionar Vídeo",
+            filetypes=[
+                ("Arquivos de Vídeo", "*.mp4 *.mkv *.avi"),
+                ("Todos os Arquivos", "*.*")
+            ]
+        )
+        
+        if filename:
+            self.video_path.set(filename)
+            self.load_video_info(filename)
+
+    def load_video_info(self, filename):
+        """Carrega informações do vídeo"""
+        try:
+            cap = cv2.VideoCapture(filename)
+            fps = cap.get(cv2.CAP_PROP_FPS)
+            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+            duration = frame_count / fps
+            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+            
+            info = f"Duração: {str(timedelta(seconds=int(duration)))}\n"
+            info += f"Resolução: {width}x{height}\n"
+            info += f"FPS: {fps:.2f}\n"
+            info += f"Formato: {os.path.splitext(filename)[1]}"
+            
+            self.video_info.set(info)
+            cap.release()
+            
+        except Exception as e:
+            messagebox.showerror("Erro", f"Erro ao carregar vídeo: {str(e)}")
+
+    def extract_audio(self, video_path, audio_path):
+        """Extrai o áudio do vídeo"""
+        try:
+            command = [
+                'ffmpeg',
+                '-i', video_path,
+                '-vn',
+                '-acodec', 'pcm_s16le',
+                '-ar', '16000',
+                '-ac', '1',
+                '-y',
+                audio_path
+            ]
+            
+            process = subprocess.run(
+                command,
+                capture_output=True,
+                text=True
+            )
+            
+            return os.path.exists(audio_path) and os.path.getsize(audio_path) > 0
+            
+        except Exception as e:
+            print(f"Erro na extração de áudio: {str(e)}")
+            return False
+
+    def process_audio_with_whisper(self, audio_path, language_code):
+        """Processa o áudio usando Whisper"""
+        try:
+            # Configurar opções do Whisper
+            options = {
+                "language": language_code,
+                "task": "transcribe",
+                "verbose": False
+            }
+            
+            # Realizar transcrição
+            result = self.model.transcribe(audio_path, **options)
+            
+            # Processar segmentos
+            segments = []
+            for i, segment in enumerate(result["segments"], 1):
+                start_time = segment["start"]
+                end_time = segment["end"]
+                text = segment["text"].strip()
+                
+                if text:
+                    segment_str = f"{i}\n"
+                    segment_str += f"{self.format_timestamp(start_time)} --> {self.format_timestamp(end_time)}\n"
+                    segment_str += f"{text}\n\n"
+                    segments.append(segment_str)
+            
+            return segments
+            
+        except Exception as e:
+            print(f"Erro detalhado no processamento do áudio: {str(e)}")
+            raise
+
+    def format_timestamp(self, seconds):
+        """Converte segundos para formato SRT"""
+        hours = int(seconds // 3600)
+        minutes = int((seconds % 3600) // 60)
+        secs = int(seconds % 60)
+        millisecs = int((seconds * 1000) % 1000)
+        
+        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millisecs:03d}"
+
+    def generate_subtitles(self):
+        """Inicia processo de geração de legendas"""
+        if not self.video_path.get():
+            messagebox.showwarning("Aviso", "Selecione um vídeo primeiro.")
+            return
+            
+        if not self.model_ready:
+            messagebox.showwarning("Aviso", "Aguarde o modelo ser carregado.")
+            return
+        
+        self.progress.start()
+        self.generate_button.config(state='disabled')
+        self.save_button.config(state='disabled')
+        threading.Thread(target=self.process_video, daemon=True).start()
+
+    def process_video(self):
+        """Processa o vídeo e gera legendas"""
+        audio_path = "temp_audio.wav"
+        try:
+            self.status_var.set("Extraindo áudio...")
+            
+            if not self.extract_audio(self.video_path.get(), audio_path):
+                raise Exception("Falha na extração do áudio")
+            
+            self.status_var.set("Processando áudio...")
+            language = self.languages[self.selected_language.get()]
+            
+            self.subtitles_list = self.process_audio_with_whisper(audio_path, language)
+            
+            if not self.subtitles_list:
+                raise Exception("Nenhuma legenda gerada")
+            
+            self.status_var.set("Legendas geradas com sucesso!")
+            self.root.after(0, self.update_subtitle_text)
+            
+        except Exception as e:
+            self.status_var.set("Erro no processamento")
+            messagebox.showerror("Erro", str(e))
+            
+        finally:
+            self.progress.stop()
+            self.generate_button.config(state='normal')
+            try:
+                if os.path.exists(audio_path):
+                    os.remove(audio_path)
+            except:
+                pass
+
+    def update_subtitle_text(self):
+        """Atualiza o texto das legendas na interface"""
+        self.subtitle_text.delete('1.0', tk.END)
+        self.subtitle_text.insert('1.0', ''.join(self.subtitles_list))
+        self.save_button.config(state='normal')
+
+    def save_subtitles(self):
+        """Salva as legendas em arquivo"""
+        try:
+            output_path = os.path.splitext(self.video_path.get())[0] + ".srt"
+            with open(output_path, 'w', encoding='utf-8') as f:
+                f.write(self.subtitle_text.get('1.0', tk.END))
+            messagebox.showinfo("Sucesso", f"Legendas salvas em:\n{output_path}")
+        except Exception as e:
+            messagebox.showerror("Erro", f"Erro ao salvar legendas: {str(e)}")
+
+if __name__ == "__main__":
+    root = tk.Tk()
+    app = VideoSubtitleApp(root)
    root.mainloop()