# Speech Transcription
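The script below uses the Google Cloud Video Intelligence API to transcribe speech from a local video, saves the transcript (with per-alternative confidence scores) to a text file, and then uses OpenCV to overlay each word on the frames where it is spoken, writing the annotated result to a new video file.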
```python
import io

import cv2
from dotenv import load_dotenv
from google.cloud import videointelligence_v1 as videointelligence


def transcribe_local_video(path="./resource/Trump.mp4",
                           output_path="transcribed_output.mp4",
                           transcript_txt="transcript.txt"):
    """Transcribe speech from a local video and overlay it frame-wise
    with word-level timing."""
    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.Feature.SPEECH_TRANSCRIPTION]
    # Read video content
    with io.open(path, "rb") as file:
        input_content = file.read()
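    # The bytes are sent inline via "input_content"; for larger files the
    # API also accepts a Cloud Storage URI through "input_uri" instead
    # (check the google-cloud-videointelligence docs for size limits).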
    # Set up the transcription config
    config = videointelligence.SpeechTranscriptionConfig(
        language_code="en-US",
        enable_automatic_punctuation=True,
    )
    video_context = videointelligence.VideoContext(speech_transcription_config=config)
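    # SpeechTranscriptionConfig also exposes optional fields such as
    # enable_word_confidence and max_alternatives; they are not set here,
    # so verify availability in your client-library version before use.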
print("\nProcessing video for speech transcription...")
operation = video_client.annotate_video(
request={
"features": features,
"input_content": input_content,
"video_context": video_context,
}
)
result = operation.result(timeout=600)
print("Finished processing.\n")
annotation_results = result.annotation_results[0]
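    # annotate_video returns one annotation_results entry per input video;
    # a single file is processed here, so index 0 holds all results.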
    # Extract word-level timing
    word_timings = []
    all_transcripts = []
    for transcription in annotation_results.speech_transcriptions:
        for alternative in transcription.alternatives:
            all_transcripts.append(
                f"{alternative.transcript} (Confidence: {alternative.confidence:.2f})"
            )
            for word_info in alternative.words:
                word_timings.append({
                    "word": word_info.word,
                    "start": word_info.start_time.total_seconds(),
                    "end": word_info.end_time.total_seconds(),
                })
    # Save transcript to text file, one alternative per line
    with open(transcript_txt, "w", encoding="utf-8") as f:
        for line in all_transcripts:
            f.write(line + "\n")
    print(f"Transcript saved to: {transcript_txt}")
# Load video for annotation
cap = cv2.VideoCapture(path)
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Output video setup
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
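    # 'mp4v' is widely supported by OpenCV builds; if the output file will
    # not play, another FourCC such as 'avc1' may work, depending on the
    # codecs available in the local OpenCV/FFmpeg build.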
    frame_id = 0
    cv2.namedWindow("Speech Transcription", cv2.WINDOW_NORMAL)
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Collect the words whose timing window spans the current frame
        time_seconds = frame_id / fps
        displayed_words = [w["word"] for w in word_timings
                           if w["start"] <= time_seconds <= w["end"]]
        text = " ".join(displayed_words)

        if text:
            # Draw semi-transparent background box
            overlay = frame.copy()
            cv2.rectangle(overlay, (50, height - 80), (width - 50, height - 30),
                          (0, 0, 0), -1)
            alpha = 0.5
            frame = cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0)

            # Draw transcription text
            cv2.putText(frame, text, (60, height - 45),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)

        # Resize wide frames for on-screen display only
        max_display_width = 800
        if frame.shape[1] > max_display_width:
            scale_ratio = max_display_width / frame.shape[1]
            display_frame = cv2.resize(frame, (0, 0), fx=scale_ratio, fy=scale_ratio)
        else:
            display_frame = frame

        cv2.imshow("Speech Transcription", display_frame)
        out_writer.write(frame)  # The full-resolution frame goes to the file

        if cv2.waitKey(int(1000 / fps)) & 0xFF == ord('q'):
            break
        frame_id += 1
    cap.release()
    out_writer.release()
    cv2.destroyAllWindows()
    print(f"Transcribed video saved to: {output_path}")

if __name__ == '__main__':
    load_dotenv()
    transcribe_local_video()
```
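The `load_dotenv()` call means credentials can be kept in a local `.env` file; the Google Cloud client picks them up through the standard `GOOGLE_APPLICATION_CREDENTIALS` variable pointing at a service-account key. A minimal usage sketch, assuming such a setup (the module name and all file paths below are placeholders, not files shipped with this repo):

```python
# Assumes the script above is saved as transcribe.py (name is a placeholder).
from dotenv import load_dotenv

from transcribe import transcribe_local_video

load_dotenv()  # picks up GOOGLE_APPLICATION_CREDENTIALS from a local .env
transcribe_local_video(
    path="./resource/my_clip.mp4",             # placeholder input video
    output_path="my_clip_transcribed.mp4",     # annotated output video
    transcript_txt="my_clip_transcript.txt",   # plain-text transcript
)
```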