Carregar ficheiros para "/"

2024-11-26 06:31:52 -08:00
commit 76669c0ed3
4 changed files with 2408 additions and 0 deletions
--- a/main.py
+++ b/main.py
@ -0,0 +1,483 @@
+import tkinter as tk
+from tkinter import ttk, filedialog, scrolledtext
+from tkinter import messagebox
+import torch
+from transformers import AutoProcessor, WhisperForConditionalGeneration
+import cv2
+from datetime import timedelta
+import os
+import threading
+import subprocess
+import time
+import re
+import numpy as np
+
+class VideoSubtitleApp:
+    def __init__(self, root):
+        """Set up application state, load the Whisper model and build the UI.
+
+        Args:
+            root: The Tk root window that hosts the application.
+        """
+        self.root = root
+        self.root.title("Extrator de Legendas")
+        self.root.geometry("900x700")
+
+        # Tk variables bound to the widgets created in create_widgets().
+        self.video_path = tk.StringVar()
+        self.video_info = tk.StringVar()
+        # The default has to be one of the keys of ``self.languages``: the
+        # language code is looked up by this display name, and an unknown
+        # name (such as the former 'pt-BR') falls back to English.
+        self.selected_language = tk.StringVar(value='Português (Brasil)')
+        self.subtitles_list = []
+
+        # Handle of the currently opened video capture (None until a file
+        # is selected). Defined before the widgets exist so that callbacks
+        # can always rely on the attribute being present.
+        self.video = None
+
+        # Pre-declare the model attributes: initialize_whisper() swallows
+        # loading errors, and without these defaults a failed load would
+        # leave the attributes undefined.
+        self.processor = None
+        self.model = None
+        self.initialize_whisper()
+
+        # Display name -> Whisper language code.
+        self.languages = {
+            'Português (Brasil)': 'pt',
+            'Português (Portugal)': 'pt',
+            'English': 'en',
+            'Español': 'es',
+            'Français': 'fr',
+            'Deutsch': 'de',
+            'Italiano': 'it'
+        }
+
+        # Build the interface.
+        self.create_widgets()
+        
+    def create_widgets(self):
+        """Create and lay out every widget of the main window.
+
+        Stores the progress bar in ``self.progress`` and the editable
+        subtitle area in ``self.subtitle_text``; all other widgets are only
+        referenced locally.
+        """
+        horizontal = (tk.W, tk.E)
+        all_sides = (tk.W, tk.E, tk.N, tk.S)
+        # Grid options shared by every row that spans the whole container.
+        spanning = dict(column=0, columnspan=2, pady=5)
+
+        # Root container, allowed to grow with the window.
+        container = ttk.Frame(self.root, padding="10")
+        container.grid(row=0, column=0, sticky=all_sides)
+        self.root.grid_rowconfigure(0, weight=1)
+        self.root.grid_columnconfigure(0, weight=1)
+        container.grid_columnconfigure(1, weight=1)
+
+        # Row 0: file picker button and language selector.
+        picker_row = ttk.Frame(container)
+        picker_row.grid(row=0, sticky=horizontal, **spanning)
+
+        pick_button = ttk.Button(picker_row, text="Selecionar Vídeo", command=self.select_file)
+        pick_button.pack(side=tk.LEFT, padx=5)
+
+        language_label = ttk.Label(picker_row, text="Idioma:")
+        language_label.pack(side=tk.LEFT, padx=5)
+
+        language_box = ttk.Combobox(
+            picker_row,
+            values=list(self.languages),
+            textvariable=self.selected_language,
+            state='readonly',
+            width=20,
+        )
+        language_box.pack(side=tk.LEFT, padx=5)
+        language_box.set('Português (Brasil)')
+
+        # Row 1: path of the selected file.
+        path_label = ttk.Label(container, textvariable=self.video_path, wraplength=500)
+        path_label.grid(row=1, **spanning)
+
+        # Row 2: video metadata.
+        details_box = ttk.LabelFrame(container, text="Informações do Vídeo", padding="5")
+        details_box.grid(row=2, sticky=horizontal, **spanning)
+        details_label = ttk.Label(details_box, textvariable=self.video_info)
+        details_label.grid(row=0, column=0, sticky=tk.W)
+
+        # Row 3: action buttons.
+        actions_row = ttk.Frame(container)
+        actions_row.grid(row=3, **spanning)
+        for caption, callback in (
+            ("Gerar Legendas", self.generate_subtitles),
+            ("Salvar Alterações", self.save_subtitles),
+        ):
+            ttk.Button(actions_row, text=caption, command=callback).pack(side=tk.LEFT, padx=5)
+
+        # Row 4: indeterminate progress bar shown while processing.
+        self.progress = ttk.Progressbar(container, mode='indeterminate')
+        self.progress.grid(row=4, sticky=horizontal, **spanning)
+
+        # Row 5: subtitle editor.
+        editor_box = ttk.LabelFrame(container, text="Editor de Legendas", padding="5")
+        editor_box.grid(row=5, sticky=all_sides, **spanning)
+        editor_box.grid_rowconfigure(0, weight=1)
+        editor_box.grid_columnconfigure(0, weight=1)
+
+        self.subtitle_text = scrolledtext.ScrolledText(editor_box, height=20, width=80, wrap=tk.WORD)
+        self.subtitle_text.grid(row=0, column=0, sticky=all_sides, padx=5, pady=5)
+
+        # Row 6: usage instructions.
+        usage_text = """Instruções:
+        1. Selecione o idioma do áudio do vídeo
+        2. Clique em 'Selecionar Vídeo' e escolha o arquivo
+        3. Aguarde o processamento do modelo Whisper
+        4. Edite as legendas se necessário
+        5. Clique em 'Salvar Alterações' para gerar o arquivo .srt"""
+
+        usage_label = ttk.Label(container, text=usage_text, justify=tk.LEFT, wraplength=600)
+        usage_label.grid(row=6, sticky=tk.W, **spanning)
+
+    def select_file(self):
+        """Ask the user for a video file and load its metadata.
+
+        Does nothing when the dialog is cancelled. Otherwise the chosen path
+        is stored in ``self.video_path`` and passed to load_video_info().
+        """
+        chosen = filedialog.askopenfilename(
+            title='Selecione um vídeo',
+            filetypes=(
+                ('Arquivos de vídeo', '*.mp4 *.avi *.mkv'),
+                ('Todos os arquivos', '*.*'),
+            ),
+        )
+
+        # The dialog returns an empty value when it is cancelled.
+        if not chosen:
+            return
+
+        self.video_path.set(chosen)
+        self.load_video_info(chosen)
+
+    def load_video_info(self, filename):
+        """Open *filename* with OpenCV and show its basic metadata.
+
+        The capture is kept in ``self.video`` and the formatted summary
+        (duration, resolution, FPS and file extension) is written to
+        ``self.video_info``. Any failure is reported in an error dialog
+        instead of being raised.
+
+        Args:
+            filename: Path of the video file to inspect.
+        """
+        try:
+            # Release the capture of a previously selected file so its
+            # handle is not leaked when the user picks another video.
+            previous = getattr(self, "video", None)
+            if previous is not None:
+                previous.release()
+
+            self.video = cv2.VideoCapture(filename)
+            if not self.video.isOpened():
+                raise ValueError("Não foi possível abrir o arquivo de vídeo")
+
+            # Read the stream properties.
+            fps = self.video.get(cv2.CAP_PROP_FPS)
+            frame_count = int(self.video.get(cv2.CAP_PROP_FRAME_COUNT))
+            # VideoCapture.get() yields 0 for properties it cannot provide;
+            # treat that as an unknown duration instead of dividing by zero.
+            duration = frame_count / fps if fps > 0 else 0
+            width = int(self.video.get(cv2.CAP_PROP_FRAME_WIDTH))
+            height = int(self.video.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+            info = f"""
+            Duração: {str(timedelta(seconds=int(duration)))}
+            Resolução: {width}x{height}
+            FPS: {fps:.2f}
+            Formato: {os.path.splitext(filename)[1]}
+            """
+            self.video_info.set(info)
+
+        except Exception as e:
+            messagebox.showerror("Erro", f"Erro ao carregar o vídeo: {str(e)}")
+
+    def generate_subtitles(self):
+        """Start subtitle generation for the selected video in a worker thread.
+
+        Shows a warning and returns when no video has been selected, and
+        ignores the request while a previous run is still in progress.
+        """
+        if not self.video_path.get():
+            messagebox.showwarning("Aviso", "Por favor, selecione um vídeo primeiro.")
+            return
+
+        # Two concurrent runs would share the same model and both assign
+        # self.subtitles_list, so only one worker may be active at a time.
+        worker = getattr(self, "_worker", None)
+        if worker is not None and worker.is_alive():
+            messagebox.showinfo("Aviso", "O processamento já está em andamento.")
+            return
+
+        # Run the heavy work off the UI thread so the window stays responsive.
+        self.progress.start()
+        self._worker = threading.Thread(target=self.process_video)
+        self._worker.start()
+    
+    def initialize_whisper(self):
+        """Load the Whisper processor and model into ``self.processor`` / ``self.model``.
+
+        Loading errors are reported in an error dialog and not raised; in
+        that case both attributes are left as ``None`` so callers can detect
+        that no model is available.
+        """
+        # Define the attributes up front: if loading fails below they would
+        # otherwise not exist at all.
+        self.processor = None
+        self.model = None
+
+        try:
+            # Use the large model for better transcription quality.
+            model_name = "openai/whisper-large-v3"
+            processor = AutoProcessor.from_pretrained(model_name)
+            model = WhisperForConditionalGeneration.from_pretrained(
+                model_name,
+                device_map="auto",  # Let the library pick the device placement
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                low_cpu_mem_usage=True
+            )
+
+            # Publish both objects only once both loaded successfully.
+            self.processor = processor
+            self.model = model
+
+            if torch.cuda.is_available():
+                print("Usando GPU para processamento")
+            else:
+                print("Usando CPU para processamento")
+
+        except Exception as e:
+            messagebox.showerror("Erro", f"Erro ao carregar modelo Whisper: {str(e)}")
+
+    def extract_audio(self, video_path, audio_path):
+        """Extract the audio track of a video into a 16 kHz mono PCM WAV file.
+
+        FFmpeg is first run with an audio filter chain; if that run fails, a
+        second run without filters is attempted.
+
+        Args:
+            video_path: Path of the input video.
+            audio_path: Path of the WAV file to create (overwritten).
+
+        Returns:
+            True when a non-empty audio file was produced, False otherwise
+            (including when FFmpeg is not installed). Errors are printed,
+            never raised.
+        """
+        # Options shared by both attempts: no video, PCM 16-bit, mono, 16 kHz.
+        base_command = [
+            'ffmpeg',
+            '-i', video_path,
+            '-vn',
+            '-acodec', 'pcm_s16le',
+            '-ac', '1',
+            '-ar', '16000',
+        ]
+        audio_filters = 'volume=2.0,highpass=f=200,lowpass=f=3000,areverse,silenceremove=start_periods=1:start_duration=1:start_threshold=-60dB,areverse'
+        run_options = dict(
+            capture_output=True,
+            text=True,
+            encoding='utf-8',
+            errors='replace',  # FFmpeg output is not guaranteed to be valid UTF-8
+        )
+
+        # Must exist before the try block: the handler below inspects it even
+        # when subprocess.run() itself raised (e.g. FFmpeg is not installed).
+        process = None
+        try:
+            print(f"Extraindo áudio de {video_path}")
+
+            # Drop any leftover file so a failed run cannot be mistaken for a
+            # success because of output from an earlier extraction.
+            if os.path.exists(audio_path):
+                os.remove(audio_path)
+
+            print("Tentando primeira extração de áudio...")
+            process = subprocess.run(
+                base_command + ['-af', audio_filters, '-y', audio_path],
+                **run_options
+            )
+
+            if process.returncode != 0:
+                print("Primeira tentativa falhou, tentando método alternativo...")
+                # Simpler fallback without the filter chain.
+                process = subprocess.run(
+                    base_command + ['-y', audio_path],
+                    **run_options
+                )
+
+            if os.path.exists(audio_path) and os.path.getsize(audio_path) > 0:
+                print(f"Áudio extraído com sucesso: {os.path.getsize(audio_path)} bytes")
+                return True
+
+            raise Exception("Arquivo de áudio não foi criado ou está vazio")
+
+        except Exception as e:
+            print(f"Erro detalhado na extração de áudio: {str(e)}")
+            if process is not None and process.stderr:
+                print(f"Erro FFmpeg: {process.stderr}")
+            return False
+
+    def process_audio_with_whisper(self, audio_path, language_code):
+        """Transcribe an audio file with Whisper and return SRT-formatted cues.
+
+        Args:
+            audio_path: Path of the audio file to transcribe.
+            language_code: Whisper language code (e.g. 'pt'); when falsy the
+                language is not passed to the model.
+
+        Returns:
+            A list of strings, one per cue, each in the form
+            "<index>\\n<start> --> <end>\\n<text>\\n\\n" with consecutive
+            indices starting at 1.
+
+        Raises:
+            Exception: Wraps any failure (missing model, unreadable or empty
+                audio, empty transcription, model errors).
+        """
+        try:
+            import soundfile as sf
+            print(f"Processando áudio em {language_code}...")
+
+            if getattr(self, "model", None) is None or getattr(self, "processor", None) is None:
+                raise RuntimeError("Modelo Whisper não foi carregado")
+
+            # Read directly as float32 so the samples are already scaled to
+            # the [-1, 1] range regardless of the PCM width stored on disk.
+            audio, sample_rate = sf.read(audio_path, dtype="float32")
+            print(f"Áudio carregado: {len(audio)} amostras, taxa de amostragem: {sample_rate}Hz")
+
+            if audio.size == 0:
+                raise ValueError("Arquivo de áudio sem amostras")
+
+            # Down-mix multi-channel audio; the model expects a 1-D waveform.
+            if audio.ndim > 1:
+                audio = audio.mean(axis=1)
+
+            # Make sure the waveform stays within [-1, 1].
+            max_abs = float(np.max(np.abs(audio)))
+            if max_abs > 1.0:
+                audio = audio / max_abs
+
+            # Per the Transformers Whisper documentation the default feature
+            # extraction pads/truncates to a single 30 s window; long-form
+            # transcription needs truncation disabled and "longest" padding.
+            long_form = len(audio) > 30 * sample_rate
+            processor_kwargs = {
+                "sampling_rate": sample_rate,
+                "return_tensors": "pt",
+                "do_normalize": True,
+                "return_attention_mask": True,
+            }
+            if long_form:
+                processor_kwargs.update(truncation=False, padding="longest")
+
+            inputs = self.processor(audio, **processor_kwargs)
+            print("Features de entrada processadas")
+
+            # Keep the BatchFeature object (a plain dict would lose attribute
+            # access) and match the model's device and floating-point dtype;
+            # the model is loaded in float16 when CUDA is available.
+            inputs = inputs.to(self.model.device, dtype=self.model.dtype)
+            if torch.cuda.is_available():
+                print("Dados movidos para GPU")
+
+            generate_kwargs = {"return_timestamps": True}
+            if long_form:
+                # Long-form decoding heuristics, using the argument names of
+                # the Transformers Whisper generate() API.
+                generate_kwargs.update(
+                    temperature=0.0,  # Deterministic decoding
+                    no_speech_threshold=0.6,
+                    logprob_threshold=-1.0,
+                    compression_ratio_threshold=2.4,
+                    condition_on_prev_tokens=True,
+                )
+
+            if language_code:
+                generate_kwargs["language"] = language_code
+
+            print("Iniciando geração da transcrição...")
+
+            # Generate token ids (including timestamp tokens).
+            with torch.no_grad():
+                outputs = self.model.generate(**inputs, **generate_kwargs)
+
+            print("Transcrição gerada, decodificando...")
+
+            # With output_offsets=True each decoded item is a dict holding
+            # the full "text" and the per-segment "offsets".
+            decoded = self.processor.batch_decode(
+                outputs,
+                skip_special_tokens=True,
+                output_offsets=True
+            )[0]
+            full_text = decoded["text"]
+            offsets = decoded["offsets"]
+
+            print(f"Transcrição decodificada: {len(full_text)} caracteres")
+
+            if not full_text.strip():
+                raise Exception("Transcrição vazia retornada pelo modelo")
+
+            # Build the SRT cues, skipping segments without text.
+            segments = []
+            for segment in offsets:
+                text = segment['text'].strip()
+                if not text:
+                    continue
+
+                start, end = segment['timestamp']
+                # NOTE(review): defensive fallback in case a segment has no
+                # closing timestamp - confirm whether this can occur.
+                if end is None:
+                    end = start
+
+                # Number by emitted cues so the SRT indices have no gaps.
+                index = len(segments) + 1
+                start_time = self.format_timestamp(start)
+                end_time = self.format_timestamp(end)
+                segments.append(f"{index}\n{start_time} --> {end_time}\n{text}\n\n")
+
+            print(f"Segmentos formatados: {len(segments)}")
+            return segments
+
+        except Exception as e:
+            print(f"Erro detalhado no processamento do áudio: {str(e)}")
+            raise Exception(f"Erro no processamento do áudio: {str(e)}") from e
+    
+    def format_timestamp(self, seconds):
+        """Convert a time in seconds to an SRT timestamp (HH:MM:SS,mmm).
+
+        Args:
+            seconds: Time offset in seconds (int or float). Negative values
+                are clamped to zero.
+
+        Returns:
+            The timestamp string, rounded to the nearest millisecond.
+        """
+        # Work in integer milliseconds: truncating the fractional part of a
+        # float drops a millisecond for values such as 1.001 (stored as
+        # 1.000999...), and rounding first also carries correctly into the
+        # seconds/minutes/hours fields.
+        total_ms = max(0, int(round(float(seconds) * 1000)))
+        hours, remainder = divmod(total_ms, 3_600_000)
+        minutes, remainder = divmod(remainder, 60_000)
+        secs, millis = divmod(remainder, 1000)
+
+        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
+
+
+    def format_whisper_output(self, transcription):
+        """Turn "[MM:SS.mmm --> MM:SS.mmm] text" transcripts into SRT cues.
+
+        Args:
+            transcription: Text containing bracketed time ranges, each
+                followed by the text of that cue.
+
+        Returns:
+            A list with one SRT-formatted string per matched cue, numbered
+            from 1; empty when nothing matches.
+        """
+        cue_pattern = r"\[(\d+:\d+\.\d+) --> (\d+:\d+\.\d+)\](.*?)(?=\[|$)"
+        srt_cues = []
+
+        found = re.finditer(cue_pattern, transcription, re.DOTALL)
+        for number, cue in enumerate(found, 1):
+            raw_start, raw_end, raw_text = cue.groups()
+            body = raw_text.strip()
+
+            # Convert both range ends to the SRT timestamp format.
+            begin = self.convert_timestamp_to_srt(raw_start)
+            finish = self.convert_timestamp_to_srt(raw_end)
+
+            srt_cues.append(f"{number}\n{begin} --> {finish}\n{body}\n\n")
+
+        return srt_cues
+
+    def convert_timestamp_to_srt(self, timestamp):
+        """Convert a "MM:SS.mmm" (or "HH:MM:SS.mmm") timestamp to SRT format.
+
+        Args:
+            timestamp: Timestamp string; minutes may exceed 59 and the
+                fractional part may have fewer than three digits.
+
+        Returns:
+            The timestamp as "HH:MM:SS,mmm".
+
+        Raises:
+            ValueError: If a field is not a valid integer.
+        """
+        fields = timestamp.strip().split(":")
+        seconds_text, _, fraction = fields[-1].partition(".")
+
+        # Fold every field before the seconds into a total minute count so
+        # both "MM:SS" and "HH:MM:SS" inputs are accepted.
+        total_minutes = 0
+        for field in fields[:-1]:
+            total_minutes = total_minutes * 60 + int(field)
+        hours, minutes = divmod(total_minutes, 60)
+
+        # The pieces are strings and must become ints before being formatted
+        # with ``:02d``. The fraction is right-padded because ".5" means
+        # 500 ms, not 5 ms.
+        seconds = int(seconds_text)
+        milliseconds = int(fraction.ljust(3, "0")[:3])
+
+        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
+
+    def process_video(self):
+        """Extract the audio of the selected video, transcribe it and show the result.
+
+        Intended to run in a worker thread: UI updates (subtitle text, error
+        dialog, stopping the progress bar) are scheduled through
+        ``self.root.after``. The temporary audio file is always removed and
+        the video capture released when the run ends.
+        """
+        import tempfile
+
+        audio_path = None
+        try:
+            if getattr(self, "model", None) is None:
+                raise Exception("Modelo Whisper não foi carregado")
+
+            # Use a unique file in the system temp directory instead of a
+            # fixed name in the working directory, which may be read-only
+            # and would be shared by every run.
+            handle, audio_path = tempfile.mkstemp(prefix="temp_audio_", suffix=".wav")
+            os.close(handle)
+            print("Iniciando extração de áudio...")
+
+            if not self.extract_audio(self.video_path.get(), audio_path):
+                raise Exception("Falha na extração do áudio")
+
+            print("Áudio extraído com sucesso")
+
+            # Map the selected display name to a language code.
+            selected_name = self.selected_language.get()
+            language_code = self.languages.get(selected_name, 'en')
+            print(f"Idioma selecionado: {selected_name} ({language_code})")
+
+            # Transcribe with Whisper.
+            print("Iniciando reconhecimento de fala...")
+            self.subtitles_list = self.process_audio_with_whisper(audio_path, language_code)
+
+            if not self.subtitles_list:
+                raise Exception("Nenhum texto foi reconhecido")
+
+            print(f"Texto reconhecido com sucesso: {len(self.subtitles_list)} segmentos")
+
+            # Show the subtitles in the editor (scheduled on the UI loop).
+            self.root.after(0, self.update_subtitle_text, ''.join(self.subtitles_list))
+
+        except Exception as e:
+            print(f"Erro no processamento: {str(e)}")
+            self.root.after(0, messagebox.showerror, "Erro", f"Erro ao gerar legendas: {str(e)}")
+
+        finally:
+            # Clean up.
+            self.root.after(0, self.progress.stop)
+            if self.video is not None:
+                self.video.release()
+                # Forget the released capture so it is not released twice.
+                self.video = None
+            try:
+                if audio_path and os.path.exists(audio_path):
+                    print(f"Removendo arquivo temporário: {audio_path}")
+                    os.remove(audio_path)
+            except Exception as e:
+                print(f"Erro ao remover arquivo temporário: {str(e)}")
+            
+    def update_subtitle_text(self, text):
+        """Replace the entire content of the subtitle editor with *text*."""
+        editor = self.subtitle_text
+        # Clear everything from the first character to the end, then insert.
+        editor.delete(1.0, tk.END)
+        editor.insert(tk.END, text)
+    
+    def save_subtitles(self):
+        """Validate the edited subtitles and write them next to the video as .srt.
+
+        The output path is the selected video path with its extension
+        replaced by ".srt". A warning is shown when no video is selected;
+        validation and I/O errors are reported in an error dialog.
+        """
+        # Without a selected video the output path would collapse to a bare
+        # ".srt" file in the current working directory.
+        video_file = self.video_path.get()
+        if not video_file:
+            messagebox.showwarning("Aviso", "Por favor, selecione um vídeo primeiro.")
+            return
+
+        try:
+            # Current editor content.
+            current_text = self.subtitle_text.get("1.0", tk.END).strip()
+
+            # Basic format validation.
+            if not self.validate_subtitle_format(current_text):
+                raise ValueError("Formato de legendas inválido. Mantenha o formato: número + tempo + texto")
+
+            # Write the file, terminating the last cue with a newline.
+            output_path = os.path.splitext(video_file)[0] + ".srt"
+            with open(output_path, 'w', encoding='utf-8') as f:
+                f.write(current_text + "\n")
+
+            messagebox.showinfo("Sucesso", f"Legendas salvas com sucesso em:\n{output_path}")
+
+        except Exception as e:
+            messagebox.showerror("Erro", f"Erro ao salvar legendas: {str(e)}")
+
+    def validate_subtitle_format(self, text):
+        """Check that *text* is a sequence of well-formed SRT cues.
+
+        Cues are separated by blank lines. Each cue must consist of a
+        numeric index line, a "HH:MM:SS,mmm --> HH:MM:SS,mmm" line and at
+        least one line of text; the text may span several lines.
+
+        Args:
+            text: Subtitle text to validate.
+
+        Returns:
+            True when every cue is well formed, False otherwise (including
+            for empty or whitespace-only input).
+        """
+        if not text.strip():
+            return False
+
+        stamp = r"\d+:\d{2}:\d{2}[,.]\d{1,3}"
+        time_line = re.compile(rf"^{stamp} --> {stamp}$")
+
+        # Split on blank (or whitespace-only) lines so that multi-line cue
+        # text stays attached to its cue instead of being read as the next
+        # index line.
+        for cue in re.split(r"\n\s*\n", text.strip()):
+            lines = [line.strip() for line in cue.splitlines()]
+
+            # Index + time range + at least one text line.
+            if len(lines) < 3:
+                return False
+            if not lines[0].isdigit():
+                return False
+            if not time_line.match(lines[1]):
+                return False
+
+        return True
+
+if __name__ == "__main__":
+    # Script entry point: create the Tk root window, attach the application
+    # to it and hand control to the Tk event loop.
+    window = tk.Tk()
+    application = VideoSubtitleApp(window)
+    window.mainloop()