语音活动检测器py-webrtcvad的安装与使用

谷歌为WebRTC项目开发的VAD是目前最优秀、最先进的免费语音活动检测器之一。webrtcvad是WebRTC语音活动检测器（VAD）的Python接口，兼容Python 2和Python 3。它的功能是将一段音频数据判定为语音或非语音（静音、噪声等），对电话和语音识别等场景很有用。

1、安装pip

yum -y install epel-release

yum -y install python-pip

2、安装webrtcvad

yum -y install python-devel

pip install webrtcvad

3、webrtcvad测试脚本（test_webrtcvad.py）

import collections

import contextlib

import sys

import wave

import webrtcvad

def read_wave(path):
    """Load a WAV file and return its raw PCM payload and sample rate.

    The file must be mono, 16-bit (2 bytes per sample) and sampled at
    8000, 16000 or 32000 Hz; any other format trips an assertion.

    Args:
        path: location of the WAV file to read.

    Returns:
        A ``(pcm_data, sample_rate)`` tuple, where ``pcm_data`` holds every
        frame of the file as bytes.

    Raises:
        AssertionError: if the channel count, sample width or sample rate
            is not one of the accepted values.
    """
    reader = wave.open(path, 'rb')
    # wave objects are not context managers on Python 2, hence closing().
    with contextlib.closing(reader):
        assert reader.getnchannels() == 1
        assert reader.getsampwidth() == 2
        rate = reader.getframerate()
        assert rate in (8000, 16000, 32000)
        total_frames = reader.getnframes()
        return reader.readframes(total_frames), rate

def write_wave(path, audio, sample_rate):
    """Store raw PCM bytes as a mono, 16-bit WAV file.

    Args:
        path: destination file name; an existing file is overwritten.
        audio: PCM data to write, as bytes.
        sample_rate: sampling frequency recorded in the WAV header, in Hz.
    """
    writer = wave.open(path, 'wb')
    # wave objects are not context managers on Python 2, hence closing().
    with contextlib.closing(writer):
        writer.setframerate(sample_rate)
        writer.setsampwidth(2)   # 2 bytes per sample = 16-bit
        writer.setnchannels(1)   # mono
        writer.writeframes(audio)

class Frame(object):
    """One chunk of audio together with its position in the stream.

    Attributes:
        bytes: the PCM data of this chunk.
        timestamp: where the chunk starts (frame_generator supplies seconds
            from the beginning of the audio).
        duration: how long the chunk lasts (frame_generator supplies
            seconds).
    """

    def __init__(self, bytes, timestamp, duration):
        # The parameter is named ``bytes`` (shadowing the builtin inside
        # this method only) because it is part of the public signature.
        self.bytes, self.timestamp, self.duration = (
            bytes, timestamp, duration)

def frame_generator(frame_duration_ms, audio, sample_rate):
    """Split PCM audio into consecutive fixed-duration frames.

    The frame size in bytes is ``sample_rate * frame_duration_ms / 1000 * 2``;
    the factor of 2 is the byte width of one sample, so the audio is
    expected to be 16-bit mono PCM. A trailing remainder shorter than one
    full frame is discarded.

    Args:
        frame_duration_ms: length of each frame in milliseconds.
        audio: PCM data as bytes.
        sample_rate: sampling frequency of ``audio`` in Hz.

    Yields:
        Frame objects carrying the frame's bytes, its start time in seconds
        and its duration in seconds.

    Raises:
        ValueError: if the computed frame size is not positive (raised when
            the generator is first advanced).
    """
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    if n <= 0:
        # A zero or negative step would never advance past the end of the
        # audio, so the loop below would never terminate.
        raise ValueError(
            'frame size must be positive, got %d bytes' % n)
    duration = (float(n) / sample_rate) / 2.0
    timestamp = 0.0
    offset = 0
    # "<=" so that the final frame is kept when the audio length is an
    # exact multiple of the frame size.
    while offset + n <= len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n

def vad_collector(sample_rate, frame_duration_ms,
                  padding_duration_ms, vad, frames):
    """Group frames into voiced segments and yield each segment's PCM bytes.

    A sliding window of ``int(padding_duration_ms / frame_duration_ms)``
    frames drives a two-state machine:

    * While not triggered, frames accumulate in the window. Once more than
      90% of the window is voiced, a segment starts; the frames already in
      the window become the beginning of that segment.
    * While triggered, every frame is added to the segment. Once more than
      90% of the window is unvoiced, the segment ends and is yielded.

    Segment boundaries are also reported on stdout as ``+(start)`` and
    ``-(end)`` markers, followed by a newline after the last frame.

    Each frame is passed to ``vad.is_speech`` exactly once and the decision
    is cached alongside the frame in the window, so the detector sees the
    audio once and in order.

    Args:
        sample_rate: sampling frequency of the audio in Hz.
        frame_duration_ms: duration of one frame in milliseconds.
        padding_duration_ms: duration of the sliding window in milliseconds.
        vad: object exposing ``is_speech(frame_bytes, sample_rate)``.
        frames: iterable of objects with ``bytes``, ``timestamp`` and
            ``duration`` attributes.

    Yields:
        The concatenated bytes of every frame in one voiced segment.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # Entries are (frame, is_speech) pairs so a frame is classified once.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    threshold = 0.9 * num_padding_frames
    triggered = False
    voiced_frames = []

    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)
        ring_buffer.append((frame, is_speech))

        if not triggered:
            num_voiced = sum(1 for _, speech in ring_buffer if speech)
            if num_voiced > threshold:
                sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,))
                triggered = True
                # The window holds the onset of the speech; keep it.
                voiced_frames.extend(f for f, _ in ring_buffer)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            num_unvoiced = sum(1 for _, speech in ring_buffer if not speech)
            if num_unvoiced > threshold:
                sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
                triggered = False
                yield b''.join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []

    if triggered:
        # Audio ended in the middle of a segment: close it at the last frame.
        sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
    sys.stdout.write('\n')
    if voiced_frames:
        yield b''.join([f.bytes for f in voiced_frames])

def main(args):
    """Run voice activity detection on a WAV file and report its segments.

    Args:
        args: two command-line arguments: the VAD aggressiveness (passed
            through int() to webrtcvad.Vad) and the path of the WAV file.

    Prints a usage message to stderr and exits with status 1 when the
    argument count is wrong. Otherwise the audio is cut into 30 ms frames,
    collected into voiced segments with a 300 ms padding window, and
    ``--end`` is printed once per segment (after the boundary markers that
    vad_collector writes itself).
    """
    if len(args) != 2:
        sys.stderr.write(
            'Usage: example.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)

    aggressiveness, wav_path = args
    frame_ms = 30
    padding_ms = 300

    audio, sample_rate = read_wave(wav_path)
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = list(frame_generator(frame_ms, audio, sample_rate))

    # Segments are only counted off here. To save each one to disk, number
    # them with enumerate() and call
    # write_wave('chunk-%002d.wav' % (i,), segment, sample_rate).
    for _segment in vad_collector(sample_rate, frame_ms, padding_ms,
                                  vad, frames):
        print('--end')

# Script entry point: when the file is executed directly, hand the
# command-line arguments (without the program name) to main().
if __name__ == '__main__':

    main(sys.argv[1:])

4、运行命令（其中，第一个参数为激进程度（aggressiveness），取值为0-3的整数，数值越大，过滤非语音就越激进，即越倾向于把音频判定为非语音；第二个参数为wav文件存放路径，脚本要求wav文件为单声道、16位采样，且目前仅支持8K，16K，32K的采样率，示例wav文件下载：73.wav 链接：https://pan.baidu.com/s/19YJB9u0zvCFGBLDRisK1KQ 密码：fgkf）

[root@host---- ~]# python test_webrtcvad.py <激进程度0-3> /home/73.wav

+(2.1)-(3.36)--end

+(3.57)-(14.43)--end

+(15.3)-(16.14)--end

+(21.21)-(22.47)--end

+(22.68)-(24.6)--end

+(24.66)-(26.76)--end

+(26.76)-(27.81)--end

+(27.87)-(31.38)--end

+(31.38)-(32.91)--end

+(33.21)-(35.04)--end

+(35.73)-(41.43)--end

+(42.66)-(43.8)--end

+(43.95)-(51.03)--end

+(51.15)-(53.82)--end

+(53.82)-(59.85)--end

+(60.51)-(64.74)--end

+(65.46)-(67.26)--end

+(67.74)-(69.39)--end

+(69.42)-(74.55)--end

+(74.55)-(81.24)--end

+(81.51)-(87.66)--end

+(87.9)-(89.76)--end

+(91.08)-(92.04)--end

+(92.31)-(96.9)--end

+(97.23)-(102.27)--end

+(102.51)-(104.43)--end

+(104.43)-(105.9)--end

+(106.38)-(108.12)--end

+(108.69)-(110.16)--end

+(111.12)-(113.13)--end

+(113.13)-(114.87)--end

+(114.87)-(118.08)--end