# Source: openclaw-phone-assistant/audio_debug.py (langwatch/openclaw-phone-assistant, main branch)

"""Audio debug monitor — prints live volume bars and diagnostics."""

import struct
import time

from loguru import logger

from pipecat.frames.frames import (
    InputAudioRawFrame,
    OutputAudioRawFrame,
    StartFrame,
    EndFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


def rms_from_bytes(audio_bytes: bytes, sample_width: int = 2) -> float:
    """Compute the RMS volume of raw little-endian signed PCM bytes.

    Args:
        audio_bytes: Raw PCM payload. Trailing bytes that do not form a
            complete sample are ignored.
        sample_width: Bytes per sample. Widths 1, 2 (the default, 16-bit)
            and 4 are supported; every width is decoded as a signed integer.

    Returns:
        The root-mean-square of the decoded samples, or ``0.0`` when the
        buffer is empty, holds no complete sample, or the sample width is
        not supported.
    """
    # struct type codes for the signed integer of each supported width.
    # Previously the format was always "h", so any width other than 2
    # silently produced 0.0 (and a width of 0 divided by zero).
    type_codes = {1: "b", 2: "h", 4: "i"}
    code = type_codes.get(sample_width)
    if code is None or not audio_bytes:
        return 0.0

    n_samples = len(audio_bytes) // sample_width
    if n_samples == 0:
        return 0.0

    try:
        # unpack_from reads from the start of the buffer and tolerates extra
        # trailing bytes, so no truncated copy of the payload is needed.
        samples = struct.unpack_from(f"<{n_samples}{code}", audio_bytes)
    except struct.error:
        return 0.0

    return (sum(s * s for s in samples) / n_samples) ** 0.5


def volume_bar(rms: float, max_rms: float = 8000.0, width: int = 40) -> str:
    """Render a fixed-width text volume meter followed by the RMS value.

    Args:
        rms: Current RMS level to display.
        max_rms: RMS level that corresponds to a completely filled bar.
        width: Total number of bar cells.

    Returns:
        A string of the form ``|████░░░░|   1234 rms`` whose bar is always
        exactly ``width`` cells wide.
    """
    if max_rms > 0:
        # Clamp on both sides: a negative level would otherwise produce a
        # negative fill count and a bar longer than ``width``.
        level = min(max(rms / max_rms, 0.0), 1.0)
    else:
        # A non-positive full-scale value cannot be divided by; treat any
        # positive signal as saturating the meter.
        level = 1.0 if rms > 0 else 0.0

    filled = int(level * width)
    bar = "█" * filled + "░" * (width - filled)
    return f"|{bar}| {rms:6.0f} rms"


class AudioInputMonitor(FrameProcessor):
    """Monitors audio input frames and prints live volume.

    Every frame is forwarded unchanged via ``push_frame``; the monitor only
    keeps running statistics (frame count, byte count, peak RMS, number of
    silent frames) and emits debug log lines about them.
    """

    def __init__(
        self,
        interval: float = 0.5,
        *,
        silence_threshold: float = 50.0,
        **kwargs,
    ):
        """Create the monitor.

        Args:
            interval: Minimum number of seconds between two live volume-bar
                log lines.
            silence_threshold: Frames whose RMS is strictly below this value
                are counted as silent.
            **kwargs: Forwarded to ``FrameProcessor.__init__``.
        """
        super().__init__(**kwargs)
        self._interval = interval
        self._silence_threshold = silence_threshold
        self._last_print = 0.0
        self._frame_count = 0
        self._total_bytes = 0
        self._peak_rms = 0.0
        self._silent_count = 0
        # Set when a StartFrame is seen; not read anywhere in this class.
        self._started = False

    async def process_frame(self, frame, direction: FrameDirection):
        """Inspect ``frame`` for logging purposes, then forward it.

        Args:
            frame: The frame travelling through the pipeline.
            direction: Direction the frame is travelling in; passed through
                to ``push_frame`` untouched.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, StartFrame):
            self._started = True
            logger.debug(
                "🎤 Audio input started — sample_rate={} channels={}",
                getattr(frame, "audio_in_sample_rate", "?"),
                "1",
            )
        elif isinstance(frame, InputAudioRawFrame):
            self._record_audio(frame.audio)

        await self.push_frame(frame, direction)

    def _silence_pct(self) -> float:
        """Return the percentage of frames seen so far that were silent."""
        if not self._frame_count:
            return 0.0
        return self._silent_count / self._frame_count * 100

    def _record_audio(self, audio: bytes) -> None:
        """Update the running statistics for one audio payload and log them."""
        self._frame_count += 1
        self._total_bytes += len(audio)
        rms = rms_from_bytes(audio)

        if rms > self._peak_rms:
            self._peak_rms = rms

        if rms < self._silence_threshold:
            self._silent_count += 1

        # Throttled live meter: at most one line per ``interval`` seconds.
        now = time.monotonic()
        if now - self._last_print >= self._interval:
            logger.debug(
                "🎤 IN  {}  peak={:6.0f}  frames={}  silence={:.0f}%",
                volume_bar(rms),
                self._peak_rms,
                self._frame_count,
                self._silence_pct(),
            )
            self._last_print = now

        # Log detailed every 200 frames
        if self._frame_count % 200 == 0:
            logger.debug(
                "Audio IN: frame={} rms={:.0f} peak={:.0f} bytes={} silence={:.0f}%",
                self._frame_count,
                rms,
                self._peak_rms,
                self._total_bytes,
                self._silence_pct(),
            )


class AudioOutputMonitor(FrameProcessor):
    """Pass-through processor that logs a live volume meter for output audio.

    Frames are always forwarded unchanged; only ``OutputAudioRawFrame``
    instances contribute to the frame and byte counters.
    """

    def __init__(self, interval: float = 0.5, **kwargs):
        """Create the monitor.

        Args:
            interval: Minimum number of seconds between two log lines.
            **kwargs: Forwarded to ``FrameProcessor.__init__``.
        """
        super().__init__(**kwargs)
        self._frame_count = 0
        self._total_bytes = 0
        self._last_print = 0.0
        self._interval = interval

    async def process_frame(self, frame, direction: FrameDirection):
        """Account for output audio frames, then forward ``frame`` as-is."""
        await super().process_frame(frame, direction)

        if isinstance(frame, OutputAudioRawFrame):
            self._account(frame.audio)

        await self.push_frame(frame, direction)

    def _account(self, pcm):
        """Update the counters for one payload and log if the interval elapsed."""
        self._frame_count += 1
        self._total_bytes += len(pcm)
        level = rms_from_bytes(pcm)

        stamp = time.monotonic()
        if stamp - self._last_print < self._interval:
            # Too soon since the previous line; keep counting silently.
            return

        logger.debug(
            "🔊 OUT {}  frames={}  bytes={}",
            volume_bar(level),
            self._frame_count,
            self._total_bytes,
        )
        self._last_print = stamp


def list_audio_devices():
    """Log that device selection is left to the browser (WebRTC mode).

    Nothing is enumerated and nothing is returned; the function only emits
    a single debug log line.
    """
    notice = "Audio devices managed by browser (WebRTC mode)"
    logger.debug(notice)