Modificar main.py

Usar Whisper
2024-11-26 07:31:50 -08:00
parent 76669c0ed3
commit d23aadcd0e
1 changed files with 301 additions and 482 deletions
--- a/main.py
+++ b/main.py
@ -1,15 +1,11 @@
 import tkinter as tk
-from tkinter import ttk, filedialog, scrolledtext
+from tkinter import ttk, scrolledtext, filedialog, messagebox
-from tkinter import messagebox
+import threading
-import torch
+import whisper
 from transformers import AutoProcessor, WhisperForConditionalGeneration
 import cv2
 from datetime import timedelta
 import os
 import threading
 import subprocess
 import time
 import re
 import numpy as np
 class VideoSubtitleApp:
@ -21,13 +17,11 @@ class VideoSubtitleApp:
        # Variáveis
        self.video_path = tk.StringVar()
        self.video_info = tk.StringVar()
-        self.selected_language = tk.StringVar(value='pt-BR')
+        self.selected_language = tk.StringVar(value='English')
        self.status_var = tk.StringVar(value="Pronto")
        self.subtitles_list = []
-        # Inicializar modelo Whisper e processador
+        # Dicionário de línguas
        self.initialize_whisper()
        # Dicionário de línguas disponíveis
        self.languages = {
            'Português (Brasil)': 'pt',
            'Português (Portugal)': 'pt',
@ -41,86 +35,109 @@ class VideoSubtitleApp:
        # Criar interface
        self.create_widgets()
-        # Variável para armazenar o vídeo
+        # Inicializar modelo em thread separada
-        self.video = None
+        self.model_ready = False
        threading.Thread(target=self.initialize_whisper, daemon=True).start()
    def initialize_whisper(self):
        """Load the Whisper "base" model and unlock subtitle generation.

        Progress is reported through ``self.status_var``. On success the model
        is stored in ``self.model``, ``self.model_ready`` becomes True and the
        generate button is enabled. Any exception is caught: the status bar
        shows a failure message and an error dialog displays the details.
        """
        try:
            self.status_var.set("Carregando modelo...")
            # The "base" checkpoint was chosen by the author as the best
            # trade-off.
            loaded_model = whisper.load_model("base")
            self.model = loaded_model
            self.model_ready = True
            self.status_var.set("Modelo carregado com sucesso")
            self.generate_button.config(state='normal')
        except Exception as exc:
            self.status_var.set("Erro ao carregar modelo")
            details = f"Erro ao carregar modelo Whisper:\n{str(exc)}"
            messagebox.showerror("Erro", details)
    def create_widgets(self):
        """Cria a interface gráfica"""
        # Frame principal
-        main_frame = ttk.Frame(self.root, padding="10")
+        main_frame = ttk.Frame(self.root, padding=(10, 10, 10, 10))
-        main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
+        main_frame.grid(row=0, column=0, sticky="nsew")
-        # Configurar expansão da grade
+        # Configurar grid
        self.root.grid_rowconfigure(0, weight=1)
        self.root.grid_columnconfigure(0, weight=1)
-        main_frame.grid_columnconfigure(1, weight=1)
+        main_frame.grid_columnconfigure(0, weight=1)
-        # Frame para seleção de arquivo e idioma
+        # Frame superior
-        file_frame = ttk.Frame(main_frame)
+        top_frame = ttk.Frame(main_frame)
-        file_frame.grid(row=0, column=0, columnspan=2, sticky=(tk.W, tk.E), pady=5)
+        top_frame.grid(row=0, column=0, sticky="ew", pady=(0, 10))
-        # Botão para selecionar arquivo
+        # Botões e controles
-        ttk.Button(file_frame, text="Selecionar Vídeo", command=self.select_file).pack(side=tk.LEFT, padx=5)
+        ttk.Button(top_frame, text="📂 Selecionar Vídeo", 
                  command=self.select_file).pack(side=tk.LEFT, padx=5)
-        # Seleção de idioma
+        ttk.Label(top_frame, text="🌐 Idioma:").pack(side=tk.LEFT, padx=5)
-        ttk.Label(file_frame, text="Idioma:").pack(side=tk.LEFT, padx=5)
+        
-        language_combo = ttk.Combobox(file_frame, 
+        language_combo = ttk.Combobox(top_frame, 
                                    values=list(self.languages.keys()),
                                    textvariable=self.selected_language,
                                    state='readonly',
                                    width=20)
        language_combo.pack(side=tk.LEFT, padx=5)
        language_combo.set('Português (Brasil)')
-        # Label para mostrar caminho do arquivo
+        # Caminho do arquivo
-        ttk.Label(main_frame, textvariable=self.video_path, wraplength=500).grid(row=1, column=0, columnspan=2, pady=5)
+        path_frame = ttk.LabelFrame(main_frame, text="Arquivo Selecionado")
        path_frame.grid(row=1, column=0, sticky="ew", pady=(0, 10))
        ttk.Label(path_frame, textvariable=self.video_path, 
                 wraplength=800).grid(row=0, column=0, padx=5, pady=5)
-        # Frame para informações do vídeo
+        # Informações do vídeo
-        info_frame = ttk.LabelFrame(main_frame, text="Informações do Vídeo", padding="5")
+        info_frame = ttk.LabelFrame(main_frame, text="Informações do Vídeo")
-        info_frame.grid(row=2, column=0, columnspan=2, sticky=(tk.W, tk.E), pady=5)
+        info_frame.grid(row=2, column=0, sticky="ew", pady=(0, 10))
        ttk.Label(info_frame, textvariable=self.video_info).grid(row=0, column=0, padx=5, pady=5)
-        ttk.Label(info_frame, textvariable=self.video_info).grid(row=0, column=0, sticky=tk.W)
+        # Botões de ação
        # Frame para botões de ação
        button_frame = ttk.Frame(main_frame)
-        button_frame.grid(row=3, column=0, columnspan=2, pady=5)
+        button_frame.grid(row=3, column=0, pady=(0, 10))
-        ttk.Button(button_frame, text="Gerar Legendas", command=self.generate_subtitles).pack(side=tk.LEFT, padx=5)
+        self.generate_button = ttk.Button(button_frame, text="🎬 Gerar Legendas",
-        ttk.Button(button_frame, text="Salvar Alterações", command=self.save_subtitles).pack(side=tk.LEFT, padx=5)
+                                        command=self.generate_subtitles,
                                        state='disabled')
        self.generate_button.pack(side=tk.LEFT, padx=5)
-        # Progress bar
+        self.save_button = ttk.Button(button_frame, text="💾 Salvar",
                                    command=self.save_subtitles,
                                    state='disabled')
        self.save_button.pack(side=tk.LEFT, padx=5)
        # Barra de progresso
        self.progress = ttk.Progressbar(main_frame, mode='indeterminate')
-        self.progress.grid(row=4, column=0, columnspan=2, sticky=(tk.W, tk.E), pady=5)
+        self.progress.grid(row=4, column=0, sticky="ew", pady=(0, 10))
-        # Frame para edição de legendas
+        # Editor de legendas
-        subtitle_frame = ttk.LabelFrame(main_frame, text="Editor de Legendas", padding="5")
+        editor_frame = ttk.LabelFrame(main_frame, text="Editor de Legendas")
-        subtitle_frame.grid(row=5, column=0, columnspan=2, sticky=(tk.W, tk.E, tk.N, tk.S), pady=5)
+        editor_frame.grid(row=5, column=0, sticky="nsew", pady=(0, 10))
-        subtitle_frame.grid_rowconfigure(0, weight=1)
+        editor_frame.grid_columnconfigure(0, weight=1)
-        subtitle_frame.grid_columnconfigure(0, weight=1)
+        editor_frame.grid_rowconfigure(0, weight=1)
-        # Área de texto editável para legendas
+        self.subtitle_text = scrolledtext.ScrolledText(
-        self.subtitle_text = scrolledtext.ScrolledText(subtitle_frame, height=20, width=80, wrap=tk.WORD)
+            editor_frame,
-        self.subtitle_text.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S), padx=5, pady=5)
+            wrap=tk.WORD,
            font=('Consolas', 10)
        )
        self.subtitle_text.grid(row=0, column=0, sticky="nsew", padx=5, pady=5)
-        # Instruções de uso
+        # Barra de status
-        instructions = """Instruções:
+        status_frame = ttk.Frame(main_frame)
-        1. Selecione o idioma do áudio do vídeo
+        status_frame.grid(row=6, column=0, sticky="ew")
        2. Clique em 'Selecionar Vídeo' e escolha o arquivo
        3. Aguarde o processamento do modelo Whisper
        4. Edite as legendas se necessário
        5. Clique em 'Salvar Alterações' para gerar o arquivo .srt"""
-        ttk.Label(main_frame, text=instructions, justify=tk.LEFT, wraplength=600).grid(
+        ttk.Label(status_frame, textvariable=self.status_var, relief=tk.SUNKEN).grid(
-            row=6, column=0, columnspan=2, pady=5, sticky=tk.W)
+            row=0, column=0, sticky="ew")
        status_frame.grid_columnconfigure(0, weight=1)
    def select_file(self):
-        filetypes = (
+        """Seleciona arquivo de vídeo"""
            ('Arquivos de vídeo', '*.mp4 *.avi *.mkv'),
            ('Todos os arquivos', '*.*')
        )
        filename = filedialog.askopenfilename(
-            title='Selecione um vídeo',
+            title="Selecionar Vídeo",
-            filetypes=filetypes
+            filetypes=[
                ("Arquivos de Vídeo", "*.mp4 *.mkv *.avi"),
                ("Todos os Arquivos", "*.*")
            ]
        )
        if filename:
@ -128,355 +145,157 @@ class VideoSubtitleApp:
            self.load_video_info(filename)
    def load_video_info(self, filename):
        """Carrega informações do vídeo"""
        try:
-            self.video = cv2.VideoCapture(filename)
+            cap = cv2.VideoCapture(filename)
-            
+            fps = cap.get(cv2.CAP_PROP_FPS)
-            # Obter informações do vídeo
+            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = self.video.get(cv2.CAP_PROP_FPS)
            frame_count = int(self.video.get(cv2.CAP_PROP_FRAME_COUNT))
            duration = frame_count / fps
-            width = int(self.video.get(cv2.CAP_PROP_FRAME_WIDTH))
+            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-            height = int(self.video.get(cv2.CAP_PROP_FRAME_HEIGHT))
+            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            info = f"Duração: {str(timedelta(seconds=int(duration)))}\n"
            info += f"Resolução: {width}x{height}\n"
            info += f"FPS: {fps:.2f}\n"
            info += f"Formato: {os.path.splitext(filename)[1]}"
            info = f"""
            Duração: {str(timedelta(seconds=int(duration)))}
            Resolução: {width}x{height}
            FPS: {fps:.2f}
            Formato: {os.path.splitext(filename)[1]}
            """
            self.video_info.set(info)
            cap.release()
        except Exception as e:
-            messagebox.showerror("Erro", f"Erro ao carregar o vídeo: {str(e)}")
+            messagebox.showerror("Erro", f"Erro ao carregar vídeo: {str(e)}")
    def generate_subtitles(self):
        """Start subtitle generation for the selected video.

        Shows a warning dialog and returns when no video path is set.
        Otherwise starts the progress bar and runs ``self.process_video`` on
        a new thread.
        """
        selected_path = self.video_path.get()
        if not selected_path:
            messagebox.showwarning("Aviso", "Por favor, selecione um vídeo primeiro.")
            return

        # Run the processing on a separate thread.
        self.progress.start()
        worker = threading.Thread(target=self.process_video)
        worker.start()
    def initialize_whisper(self):
        """Load the Hugging Face Whisper processor and model.

        Stores the processor in ``self.processor`` and the model in
        ``self.model``, then prints whether CUDA is available. Any exception
        is caught and shown in an error dialog.
        """
        try:
            # Use the larger checkpoint for better quality.
            checkpoint = "openai/whisper-large-v3"
            self.processor = AutoProcessor.from_pretrained(checkpoint)

            # Half precision when CUDA is available, full precision otherwise.
            dtype = torch.float16 if torch.cuda.is_available() else torch.float32
            self.model = WhisperForConditionalGeneration.from_pretrained(
                checkpoint,
                device_map="auto",  # use the best available device
                torch_dtype=dtype,
                low_cpu_mem_usage=True,
            )

            device_message = (
                "Usando GPU para processamento"
                if torch.cuda.is_available()
                else "Usando CPU para processamento"
            )
            print(device_message)
        except Exception as exc:
            messagebox.showerror("Erro", f"Erro ao carregar modelo Whisper: {str(exc)}")
    def extract_audio(self, video_path, audio_path):
-        """Extrai o áudio do vídeo com configurações otimizadas"""
+        """Extrai o áudio do vídeo"""
        try:
            print(f"Extraindo áudio de {video_path}")
            # Primeiro comando - qualidade máxima
            command = [
                'ffmpeg',
                '-i', video_path,
-                '-vn',  # Não processar vídeo
+                '-vn',
-                '-acodec', 'pcm_s16le',  # Codec PCM 16-bit
+                '-acodec', 'pcm_s16le',
-                '-ac', '1',  # Mono
+                '-ar', '16000',
-                '-ar', '16000',  # Taxa de amostragem para Whisper
+                '-ac', '1',
-                '-af', 'volume=2.0,highpass=f=200,lowpass=f=3000,areverse,silenceremove=start_periods=1:start_duration=1:start_threshold=-60dB,areverse',  # Filtros de áudio
+                '-y',
                '-y',  # Sobrescrever arquivo
                audio_path
            ]
            print("Tentando primeira extração de áudio...")
            process = subprocess.run(
                command,
                capture_output=True,
-                text=True,
+                text=True
                encoding='utf-8'
            )
-            if process.returncode != 0:
+            return os.path.exists(audio_path) and os.path.getsize(audio_path) > 0
                print("Primeira tentativa falhou, tentando método alternativo...")
                # Comando alternativo - mais simples
                alt_command = [
                    'ffmpeg',
                    '-i', video_path,
                    '-vn',
                    '-acodec', 'pcm_s16le',
                    '-ac', '1',
                    '-ar', '16000',
                    '-y',
                    audio_path
                ]
                process = subprocess.run(
                    alt_command,
                    capture_output=True,
                    text=True,
                    encoding='utf-8'
                )
            if os.path.exists(audio_path) and os.path.getsize(audio_path) > 0:
                print(f"Áudio extraído com sucesso: {os.path.getsize(audio_path)} bytes")
                return True
            else:
                raise Exception("Arquivo de áudio não foi criado ou está vazio")
        except Exception as e:
-            print(f"Erro detalhado na extração de áudio: {str(e)}")
+            print(f"Erro na extração de áudio: {str(e)}")
            if process and process.stderr:
                print(f"Erro FFmpeg: {process.stderr}")
            return False
    def process_audio_with_whisper(self, audio_path, language_code):
        """Processa o áudio usando Whisper"""
        try:
-            import soundfile as sf
+            # Configurar opções do Whisper
-            print(f"Processando áudio em {language_code}...")
+            options = {
-            
+                "language": language_code,
-            # Carregar áudio
+                "task": "transcribe",
-            audio, sample_rate = sf.read(audio_path)
+                "verbose": False
            print(f"Áudio carregado: {len(audio)} amostras, taxa de amostragem: {sample_rate}Hz")
            # Normalizar áudio
            if audio.dtype == np.int16:
                audio = audio.astype(np.float32) / 32768.0
            elif audio.dtype == np.int32:
                audio = audio.astype(np.float32) / 2147483648.0
            # Garantir que o áudio esteja entre -1 e 1
            max_abs = np.max(np.abs(audio))
            if max_abs > 1.0:
                audio = audio / max_abs
            # Preparar input features com configurações explícitas
            inputs = self.processor(
                audio, 
                sampling_rate=sample_rate,
                return_tensors="pt",
                padding=True,
                do_normalize=True,
                return_attention_mask=True
            )
            print("Features de entrada processadas")
            # Mover para GPU se disponível
            if torch.cuda.is_available():
                inputs = {k: v.to("cuda") for k, v in inputs.items()}
                print("Dados movidos para GPU")
            # Configurar parâmetros de geração corrigidos
            generate_kwargs = {
                "temperature": 0.0,  # Determinístico
                "no_speech_threshold": 0.6,
                "logprob_threshold": -1.0,
                "compression_ratio_threshold": 2.4,
                "condition_on_previous_text": True,
                "max_initial_timestamp": 1.0,
                "return_timestamps": True
            }
-            if language_code:
+            # Realizar transcrição
-                generate_kwargs["language"] = language_code
+            result = self.model.transcribe(audio_path, **options)
-            print("Iniciando geração da transcrição...")
+            # Processar segmentos
            # Gerar transcrição com timestamps
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs.input_features,
                    **generate_kwargs
                )
            print("Transcrição gerada, decodificando...")
            # Decodificar saída com timestamp_begin=True
            transcription = self.processor.batch_decode(
                outputs, 
                skip_special_tokens=True,
                output_offsets=True
            )[0]
            print(f"Transcrição decodificada: {len(transcription.text)} caracteres")
            if not transcription.text.strip():
                raise Exception("Transcrição vazia retornada pelo modelo")
            # Formatar segmentos com timestamps
            segments = []
-            for i, segment in enumerate(transcription.offsets, start=1):
+            for i, segment in enumerate(result["segments"], 1):
-                start_time = self.format_timestamp(segment['timestamp'][0])
+                start_time = segment["start"]
-                end_time = self.format_timestamp(segment['timestamp'][1])
+                end_time = segment["end"]
-                text = segment['text'].strip()
+                text = segment["text"].strip()
-                if text:  # Só adicionar se houver texto
+                if text:
-                    segment_str = f"{i}\n{start_time} --> {end_time}\n{text}\n\n"
+                    segment_str = f"{i}\n"
                    segment_str += f"{self.format_timestamp(start_time)} --> {self.format_timestamp(end_time)}\n"
                    segment_str += f"{text}\n\n"
                    segments.append(segment_str)
            print(f"Segmentos formatados: {len(segments)}")
            return segments
        except Exception as e:
            print(f"Erro detalhado no processamento do áudio: {str(e)}")
-            raise Exception(f"Erro no processamento do áudio: {str(e)}")
+            raise
    def format_timestamp(self, seconds):
-        """Converte segundos em formato de timestamp SRT (HH:MM:SS,mmm)"""
+        """Converte segundos para formato SRT"""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
-        seconds = seconds % 60
+        secs = int(seconds % 60)
-        milliseconds = int((seconds % 1) * 1000)
+        millisecs = int((seconds * 1000) % 1000)
        seconds = int(seconds)
-        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
+        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millisecs:03d}"
    def generate_subtitles(self):
        """Inicia processo de geração de legendas"""
        if not self.video_path.get():
            messagebox.showwarning("Aviso", "Selecione um vídeo primeiro.")
            return
-    def format_whisper_output(self, transcription):
+        if not self.model_ready:
-        """Formata a saída do Whisper em formato SRT"""
+            messagebox.showwarning("Aviso", "Aguarde o modelo ser carregado.")
-        segments = []
+            return
        pattern = r"\[(\d+:\d+\.\d+) --> (\d+:\d+\.\d+)\](.*?)(?=\[|$)"
-        matches = re.finditer(pattern, transcription, re.DOTALL)
+        self.progress.start()
-        
+        self.generate_button.config(state='disabled')
-        for idx, match in enumerate(matches, 1):
+        self.save_button.config(state='disabled')
-            start_time = match.group(1)
+        threading.Thread(target=self.process_video, daemon=True).start()
            end_time = match.group(2)
            text = match.group(3).strip()
            # Converter para formato SRT
            start_time = self.convert_timestamp_to_srt(start_time)
            end_time = self.convert_timestamp_to_srt(end_time)
            segment = f"{idx}\n{start_time} --> {end_time}\n{text}\n\n"
            segments.append(segment)
        return segments
    def convert_timestamp_to_srt(self, timestamp):
        """Convert a Whisper ``MM:SS.fff`` timestamp to SRT ``HH:MM:SS,mmm``.

        Args:
            timestamp: String with exactly one ``:`` separating minutes from
                seconds and exactly one ``.`` separating whole seconds from
                the fractional part, e.g. ``"75:03.25"``.

        Returns:
            The SRT timestamp string, e.g. ``"01:15:03,250"``.

        Raises:
            ValueError: If ``timestamp`` does not have that shape or contains
                non-numeric fields.
        """
        minutes_part, seconds_part = timestamp.split(":")
        whole_seconds, fraction = seconds_part.split(".")

        # Minutes may exceed 59, so carry the overflow into the hours field.
        hours, minutes = divmod(int(minutes_part), 60)

        # The fraction is a decimal part of a second ("5" means 500 ms), so
        # normalise it to exactly three digits before converting. The pieces
        # must be ints: the "d" format code rejects strings.
        milliseconds = int(fraction.ljust(3, "0")[:3])

        return f"{hours:02d}:{minutes:02d}:{int(whole_seconds):02d},{milliseconds:03d}"
    def process_video(self):
        """Processa o vídeo e gera legendas"""
        audio_path = "temp_audio.wav"
        try:
-            # Extrair áudio
+            self.status_var.set("Extraindo áudio...")
            audio_path = "temp_audio.wav"
            print("Iniciando extração de áudio...")
            if not self.extract_audio(self.video_path.get(), audio_path):
                raise Exception("Falha na extração do áudio")
-            print("Áudio extraído com sucesso")
+            self.status_var.set("Processando áudio...")
            language = self.languages[self.selected_language.get()]
-            # Obter código do idioma
+            self.subtitles_list = self.process_audio_with_whisper(audio_path, language)
            selected_name = self.selected_language.get()
            language_code = self.languages.get(selected_name, 'en')
            print(f"Idioma selecionado: {selected_name} ({language_code})")
            # Processar áudio com Whisper
            print("Iniciando reconhecimento de fala...")
            self.subtitles_list = self.process_audio_with_whisper(audio_path, language_code)
            if not self.subtitles_list:
-                raise Exception("Nenhum texto foi reconhecido")
+                raise Exception("Nenhuma legenda gerada")
-            print(f"Texto reconhecido com sucesso: {len(self.subtitles_list)} segmentos")
+            self.status_var.set("Legendas geradas com sucesso!")
-            
+            self.root.after(0, self.update_subtitle_text)
            # Mostrar legendas na interface
            self.root.after(0, self.update_subtitle_text, ''.join(self.subtitles_list))
        except Exception as e:
-            print(f"Erro no processamento: {str(e)}")
+            self.status_var.set("Erro no processamento")
-            self.root.after(0, messagebox.showerror, "Erro", f"Erro ao gerar legendas: {str(e)}")
+            messagebox.showerror("Erro", str(e))
        finally:
-            # Limpar
+            self.progress.stop()
-            self.root.after(0, self.progress.stop)
+            self.generate_button.config(state='normal')
            if self.video is not None:
                self.video.release()
            try:
                if os.path.exists(audio_path):
                    print(f"Removendo arquivo temporário: {audio_path}")
                    os.remove(audio_path)
-            except Exception as e:
+            except:
-                print(f"Erro ao remover arquivo temporário: {str(e)}")
+                pass
-    def update_subtitle_text(self, text):
+    def update_subtitle_text(self):
-        self.subtitle_text.delete(1.0, tk.END)
+        """Atualiza o texto das legendas na interface"""
-        self.subtitle_text.insert(tk.END, text)
+        self.subtitle_text.delete('1.0', tk.END)
        self.subtitle_text.insert('1.0', ''.join(self.subtitles_list))
        self.save_button.config(state='normal')
    def save_subtitles(self):
        """Salva as legendas em arquivo"""
        try:
            # Pegar texto atual
            current_text = self.subtitle_text.get(1.0, tk.END).strip()
            # Validar formato básico das legendas
            if not self.validate_subtitle_format(current_text):
                raise ValueError("Formato de legendas inválido. Mantenha o formato: número + tempo + texto")
            # Salvar em arquivo
            output_path = os.path.splitext(self.video_path.get())[0] + ".srt"
            with open(output_path, 'w', encoding='utf-8') as f:
-                f.write(current_text)
+                f.write(self.subtitle_text.get('1.0', tk.END))
-            
+            messagebox.showinfo("Sucesso", f"Legendas salvas em:\n{output_path}")
            messagebox.showinfo("Sucesso", f"Legendas salvas com sucesso em:\n{output_path}")
        except Exception as e:
            messagebox.showerror("Erro", f"Erro ao salvar legendas: {str(e)}")
    def validate_subtitle_format(self, text):
        """Check that ``text`` is laid out as a sequence of SRT cues.

        Each cue is a numeric index line, a timing line containing exactly one
        ``' --> '`` separator and four colons, and one or more non-blank text
        lines. Blank lines between cues are skipped. A cue's text may span
        several lines; it ends at a blank line, at the end of the input, or
        where a new cue (index line followed by a timing line) begins.

        Args:
            text: Full subtitle text to validate.

        Returns:
            True when every cue is well formed, False otherwise (including
            for empty or whitespace-only input).
        """
        if not text.strip():
            return False

        def is_time_line(line):
            # One ' --> ' separator plus the four colons of two HH:MM:SS stamps.
            return line.count(' --> ') == 1 and line.count(':') == 4

        lines = [line.strip() for line in text.split('\n')]
        total = len(lines)
        i = 0

        while i < total:
            # Skip blank separator lines between cues.
            if not lines[i]:
                i += 1
                continue

            # Cue index.
            if not lines[i].isdigit():
                return False

            # Timing line.
            i += 1
            if i >= total or not is_time_line(lines[i]):
                return False

            # At least one line of text is required.
            i += 1
            if i >= total or not lines[i]:
                return False
            i += 1

            # Consume continuation lines of a multi-line cue. Stop where a new
            # cue starts so cues written without a blank separator are still
            # validated individually.
            while i < total and lines[i]:
                starts_new_cue = (
                    lines[i].isdigit()
                    and i + 1 < total
                    and is_time_line(lines[i + 1])
                )
                if starts_new_cue:
                    break
                i += 1

        return True
 if __name__ == "__main__":
    root = tk.Tk()
    app = VideoSubtitleApp(root)