Speech Transcription

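The script below sends a local video file to the Google Cloud Video Intelligence API for speech transcription, saves the returned transcript (with confidence scores) to a text file, and then uses OpenCV to overlay each recognized word on the frames where it is spoken, writing the annotated result to a new MP4 while previewing it in a window.
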
```python
import io

import cv2
import numpy as np
from dotenv import load_dotenv
from google.cloud import videointelligence_v1 as videointelligence


def transcribe_local_video(path="./resource/Trump.mp4",
                           output_path="transcribed_output.mp4",
                           transcript_txt="transcript.txt"):
    """Transcribe speech from a local video and overlay it frame-wise with word-level timing."""

    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.Feature.SPEECH_TRANSCRIPTION]

    # Read video content
    with io.open(path, "rb") as file:
        input_content = file.read()

    # Setup transcription config
    config = videointelligence.SpeechTranscriptionConfig(
        language_code="en-US",
        enable_automatic_punctuation=True,
    )
    video_context = videointelligence.VideoContext(speech_transcription_config=config)

    print("\nProcessing video for speech transcription...")
    operation = video_client.annotate_video(
        request={
            "features": features,
            "input_content": input_content,
            "video_context": video_context,
        }
    )

    # Block until the long-running operation completes (up to 10 minutes)
    result = operation.result(timeout=600)
    print("Finished processing.\n")

    annotation_results = result.annotation_results[0]

    # Extract word-level timing
    word_timings = []
    all_transcripts = []

    for transcription in annotation_results.speech_transcriptions:
        for alternative in transcription.alternatives:
            all_transcripts.append(f"{alternative.transcript} (Confidence: {alternative.confidence:.2f})\n")
            for word_info in alternative.words:
                word = word_info.word
                start = word_info.start_time.total_seconds()
                end = word_info.end_time.total_seconds()
                word_timings.append({
                    "word": word,
                    "start": start,
                    "end": end
                })

    # Save transcript to text file
    with open(transcript_txt, "w", encoding="utf-8") as f:
        for line in all_transcripts:
            f.write(line + "\n")

    print(f"Transcript saved to: {transcript_txt}")

    # Load video for annotation
    cap = cv2.VideoCapture(path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps or fps <= 0:
        fps = 30  # fall back to a default if FPS metadata is missing, avoiding division by zero below
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Output video setup
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_id = 0
    cv2.namedWindow("Speech Transcription", cv2.WINDOW_NORMAL)

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Collect the words whose timing window covers the current frame's timestamp
        time_seconds = frame_id / fps
        displayed_words = [word['word'] for word in word_timings if word['start'] <= time_seconds <= word['end']]
        text = " ".join(displayed_words)

        if text:
            # Draw semi-transparent background box
            overlay = frame.copy()
            cv2.rectangle(overlay, (50, height - 80), (width - 50, height - 30), (0, 0, 0), -1)
            alpha = 0.5
            frame = cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0)

            # Draw transcription text
            cv2.putText(frame, text, (60, height - 45),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)

        # Resize for display
        max_display_width = 800
        if frame.shape[1] > max_display_width:
            scale_ratio = max_display_width / frame.shape[1]
            display_frame = cv2.resize(frame, (0, 0), fx=scale_ratio, fy=scale_ratio)
        else:
            display_frame = frame

        cv2.imshow("Speech Transcription", display_frame)
        out_writer.write(frame)  # write the full-resolution annotated frame

        if cv2.waitKey(int(1000 / fps)) & 0xFF == ord('q'):
            break

        frame_id += 1

    cap.release()
    out_writer.release()
    cv2.destroyAllWindows()
    print(f"Transcribed video saved to: {output_path}")

if __name__ == '__main__':
    load_dotenv()  # load environment variables (e.g. Google Cloud credentials) from a .env file
    transcribe_local_video()
```
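The Video Intelligence client picks up credentials from the environment, so the `load_dotenv()` call is typically paired with a `.env` file that points `GOOGLE_APPLICATION_CREDENTIALS` at a service-account key. The packages used above are `google-cloud-videointelligence`, `opencv-python`, `python-dotenv`, and `numpy`. A minimal usage sketch, assuming those packages are installed, the function above is defined in the same module, and the file paths below are placeholders:

```python
from dotenv import load_dotenv

# Load GOOGLE_APPLICATION_CREDENTIALS (and anything else) from a local .env file
load_dotenv()

# Paths are placeholders; point them at your own video and output files
transcribe_local_video(
    path="./resource/my_clip.mp4",
    output_path="my_clip_transcribed.mp4",
    transcript_txt="my_clip_transcript.txt",
)
```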