कस्टम ट्रांसक्रिप्शन और क्लिपिंग पाइपलाइन

मुखपृष्ठ > प्रोग्रामिंग > कस्टम ट्रांसक्रिप्शन और क्लिपिंग पाइपलाइन

कस्टम ट्रांसक्रिप्शन और क्लिपिंग पाइपलाइन

2024-08-01 को प्रकाशित

ब्राउज़ करें:481

Custom Transcription and Clipping Pipeline

मैंने ऐसा क्यों किया:

मैं इस परियोजना पर काम कर रहा था और हेवी-ड्यूटी डेटा इंजीनियरिंग घटकों को प्रकाशित करने के काम से पार पाने के लिए मैंने उपकरणों का एक समूह विकसित किया — कुछ हद तक इसलिए कि इनमें से कुछ सरल हैं, लेकिन ज़्यादातर इसलिए कि अगला जेमिनी मॉडल इन्हें अपना ले और ये मूर्खतापूर्ण Google Colab जेमिनी सुझाव इंजन में शामिल हो जाएं। - टिम

निर्देश और स्पष्टीकरण

निर्देश:

सुनिश्चित करें कि आपके पास आवश्यक निर्भरताएं स्थापित हैं (उदाहरण के लिए, FFmpeg, WhisperX)।
रूट निर्देशिका को वीडियो फ़ाइलों वाली अपनी कार्यशील निर्देशिका में सेट करें।
उन चरणों को परिभाषित करें जिन्हें आप प्रतिलेखों में पहचानना चाहते हैं।
प्रतिलेख उत्पन्न करने और पता लगाए गए चरणों के आधार पर वीडियो क्लिप निकालने के लिए स्क्रिप्ट चलाएं।

स्पष्टीकरण:

यह टूल वीडियो फ़ाइलों को रूट डायरेक्टरी में प्रोसेस करता है।
यह व्हिस्परएक्स मॉडल का उपयोग करके प्रत्येक वीडियो को ट्रांसक्रिप्ट करता है।
स्क्रिप्ट फिर ट्रांस्क्रिप्ट में पाए गए चरणों के आधार पर वीडियो से क्लिप निकालती है।
प्रतिलेख और क्लिप निर्दिष्ट आउटपुट निर्देशिकाओं में सहेजे जाते हैं।

कोड:

import os
import shutil
import cv2
import numpy as np
import json
from PIL import Image
import random
import string
from rembg import remove
import ffmpeg
from datetime import timedelta
from ultralytics import YOLO
import whisperx
import gc
gc.collect()

# Root working directory: input videos are read from here and all outputs
# are written beneath it.
root = '/workspace/'
# Keywords looked up (case-insensitively) in the transcript text; a segment
# containing one of them marks the start of that stage's clip.
stages = ['apple', 'banana', 'car', 'dog']

# Output locations for the full transcripts and the per-stage clips.
transcript_dir = os.path.join(root, 'transcripts')
clip_output_dir = os.path.join(root, 'stage1')
stage1_clips_dir = clip_output_dir

# Ensure the output directories exist
os.makedirs(transcript_dir, exist_ok=True)
os.makedirs(clip_output_dir, exist_ok=True)

def log_and_print(message):
    """Print *message* to standard output."""
    print(message)

def convert_time_to_seconds(time_str):
    """Convert an ``HH:MM:SS,mmm`` timestamp into seconds.

    Args:
        time_str: Timestamp with colon-separated hours, minutes and seconds,
            optionally followed by a comma and a millisecond count
            (e.g. ``"00:01:02,500"``).

    Returns:
        The total number of seconds as a float.

    Raises:
        ValueError: If the string does not have exactly three colon-separated
            fields or a field is not an integer.
    """
    hours, minutes, seconds_milliseconds = time_str.split(':')
    # partition() tolerates a timestamp without the ",mmm" suffix.
    seconds, _, milliseconds = seconds_milliseconds.partition(',')
    total_seconds = (
        int(hours) * 3600
        + int(minutes) * 60
        + int(seconds)
        + int(milliseconds or 0) / 1000
    )
    return total_seconds

def transcribe_video(video_path):
    """Transcribe a video with WhisperX and return its aligned segments.

    The "large-v2" model is loaded on the CPU with float32 compute, the audio
    is transcribed as English, and the result is passed through the WhisperX
    alignment model.

    Args:
        video_path: Path of the video file to transcribe.

    Returns:
        A list of dicts, one per aligned segment, with the keys ``index``
        (1-based), ``start_time``, ``end_time`` and ``text`` (stripped).
        Times are strings of the form ``0H:MM:SS,000``; segment boundaries
        are truncated to whole seconds, so the millisecond part is always
        ``000``.
    """
    device = 'cpu'
    compute_type = "float32"
    model = whisperx.load_model("large-v2", device=device, compute_type=compute_type)
    audio = whisperx.load_audio(video_path)
    result = model.transcribe(audio, batch_size=4, language="en")
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    aligned_result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
    segments = aligned_result["segments"]
    transcript = []
    for index, segment in enumerate(segments):
        # str(timedelta) renders as "H:MM:SS"; the leading "0" pads the hour
        # field and ",000" supplies the (always zero) millisecond field.
        start_time = str(0) + str(timedelta(seconds=int(segment['start']))) + ',000'
        end_time = str(0) + str(timedelta(seconds=int(segment['end']))) + ',000'
        text = segment['text']
        segment_text = {
            "index": index + 1,
            "start_time": start_time,
            "end_time": end_time,
            "text": text.strip(),
        }
        transcript.append(segment_text)
    return transcript

def extract_clips(video_path, transcript, stages):
    """Extract one clip per detected stage, plus the matching partial transcript.

    The transcript is scanned in order. A segment whose lower-cased text
    contains a stage keyword closes the clip of the stage currently in
    progress (if any) at that segment's start time and opens a new one. The
    last open stage is closed at the end time of the final segment.

    Args:
        video_path: Path of the source video.
        transcript: List of segment dicts with ``start_time``, ``end_time``
            and ``text`` keys, as produced by ``transcribe_video``.
        stages: Iterable of lower-case keywords that mark stage boundaries.

    Returns:
        None. Clips and partial transcripts are written to ``clip_output_dir``.
    """
    base_filename = os.path.splitext(os.path.basename(video_path))[0]
    current_stage = None
    start_time = None
    partial_transcript = []

    for segment in transcript:
        segment_text = segment["text"].lower()
        for stage in stages:
            if stage not in segment_text:
                continue
            boundary = convert_time_to_seconds(segment["start_time"])
            if current_stage is not None:
                _save_clip_and_transcript(
                    video_path, base_filename, current_stage,
                    start_time, boundary, partial_transcript,
                )
            current_stage = stage
            start_time = boundary
            # Always start the new stage with an empty transcript so that it
            # covers exactly the time span of its clip.
            partial_transcript = []
        # Append once per segment (not once per stage keyword), after any
        # stage switch, so the triggering segment belongs to the new stage.
        partial_transcript.append(segment)

    if current_stage is not None:
        end_time = convert_time_to_seconds(transcript[-1]["end_time"])
        _save_clip_and_transcript(
            video_path, base_filename, current_stage,
            start_time, end_time, partial_transcript,
        )


def _save_clip_and_transcript(video_path, base_filename, stage, start_time, end_time, segments):
    """Cut the [start_time, end_time] clip for *stage* and save its transcript.

    Nothing is written when the clip file already exists. The partial
    transcript is saved even if ffmpeg reports an error for the clip.
    """
    output_clip = os.path.join(clip_output_dir, f"{base_filename}.{stage}.mp4")
    if os.path.exists(output_clip):
        return

    try:
        ffmpeg.input(video_path, ss=start_time, to=end_time).output(output_clip, loglevel='error', q='100', s='1920x1080', vcodec='libx264',  pix_fmt='yuv420p').run(overwrite_output=True)
        log_and_print(f"Extracted clip for {stage} from {start_time} to {end_time}. Saved: {output_clip}")
    except ffmpeg.Error as e:
        log_and_print(f"Error extracting clip: {e}")

    # The transcript is stored as a single JSON string of
    # "start --> end" / text line pairs.
    transcript_text = "\n".join(f"{seg['start_time']} --> {seg['end_time']}\n{seg['text']}" for seg in segments)
    transcript_path = os.path.join(clip_output_dir, f"{base_filename}.{stage}.json")
    with open(transcript_path, 'w', encoding='utf-8') as f:
        json.dump(transcript_text, f, ensure_ascii=False, indent=4)
    log_and_print(f"Saved partial transcript to {transcript_path}")

def process_transcripts(input_dir, transcript_dir, stages):
    """Transcribe every video in *input_dir* and extract its stage clips.

    Files ending in ``.mp4``, ``.MOV`` or ``.mov`` are processed. A video's
    transcript is cached as ``<name>.json`` in *transcript_dir*; when that
    file already exists it is loaded instead of transcribing again.

    Args:
        input_dir: Directory containing the video files.
        transcript_dir: Directory where full transcripts are stored.
        stages: Stage keywords forwarded to ``extract_clips``.

    Returns:
        None.
    """
    video_extensions = ('.mp4', '.MOV', '.mov')
    video_files = [f for f in os.listdir(input_dir) if f.endswith(video_extensions)]

    for video_file in video_files:
        video_path = os.path.join(input_dir, video_file)
        transcript_path = os.path.join(transcript_dir, os.path.splitext(video_file)[0] + ".json")

        if not os.path.exists(transcript_path):
            transcript = transcribe_video(video_path)
            with open(transcript_path, 'w', encoding='utf-8') as f:
                json.dump(transcript, f, ensure_ascii=False, indent=4)
            log_and_print(f"Created transcript for {video_path}")
        else:
            # Reuse the cached transcript rather than re-running the model.
            with open(transcript_path, 'r', encoding='utf-8') as f:
                transcript = json.load(f)

        extract_clips(video_path, transcript, stages)

# Script entry point: process every video found directly in `root`.
process_transcripts(root, transcript_dir, stages)

कीवर्ड और हैशटैग

कीवर्ड: ट्रांसक्रिप्शन, वीडियो प्रोसेसिंग, क्लिपिंग, व्हिस्परएक्स, ऑटोमेशन, चरण, वीडियो क्लिप
हैशटैग: #ट्रांसक्रिप्शनटूल #वीडियोप्रोसेसिंग #क्लिपिंगटूल #व्हिस्परएक्स #वीडियोऑटोमेशन #स्टेजडिटेक्शन #वीडियोक्लिप्स

---ईओएफ----

कनाडा के मिडवेस्ट से टिम द्वारा बनाया गया।
2024.
यह दस्तावेज़ जीपीएल लाइसेंस प्राप्त है।

विज्ञप्ति वक्तव्य यह आलेख यहां पुन: प्रस्तुत किया गया है: https://dev.to/fosteman/custom-transscription-and-clipping-pipeline-2814?1 यदि कोई उल्लंघन है, तो कृपया इसे हटाने के लिए [email protected] से संपर्क करें।

नवीनतम ट्यूटोरियल अधिक>