Verified Commit 240f93b8 authored by Rafael Carício's avatar Rafael Carício 🏠
Browse files

vosktranscriber: Add new transcriber plugin using Vosk API

parent 5c00db62
Pipeline #625514 passed with stages
in 29 minutes and 10 seconds
......@@ -16,6 +16,7 @@ members = [
"net/onvif",
"net/reqwest",
"net/aws",
"net/vosk",
"utils/fallbackswitch",
"utils/togglerecord",
"utils/tracers",
......@@ -51,6 +52,7 @@ default-members = [
"net/onvif",
"net/reqwest",
"net/aws",
"net/vosk",
"utils/fallbackswitch",
"utils/togglerecord",
"utils/tracers",
......
......@@ -47,6 +47,7 @@ plugins = {
'gst-plugin-hlssink3': 'libgsthlssink3',
'gst-plugin-rspng': 'libgstrspng',
'gst-plugin-aws': 'libgstaws',
'gst-plugin-vosk': 'libgstvosk',
'gst-plugin-textwrap': 'libgstrstextwrap',
'gst-plugin-fmp4': 'libgstfmp4',
'gst-plugin-threadshare': 'libgstthreadshare',
......
[package]
name = "gst-plugin-vosk"
version = "0.9.0"
edition = "2021"
authors = ["Rafael Caricio <rafael@caricio.com>"]
repository = "https://gitlab.freedesktop.org/gstreamer/gst-plugins-rs"
license = "MPL-2.0"
description = "GStreamer plugin for speech to text using the Vosk Toolkit."
build = "build.rs"
[dependencies]
gst = { package = "gstreamer", git = "https://gitlab.freedesktop.org/gstreamer/gstreamer-rs" }
gst-base = { package = "gstreamer-base", git = "https://gitlab.freedesktop.org/gstreamer/gstreamer-rs" }
once_cell = "1"
atomic_refcell = "0.1"
serde = "1"
serde_derive = "1"
serde_json = "1"
futures = "0.3"
tokio = { version = "1.0", features = [ "rt-multi-thread", "time" ] }
async-tungstenite = { version = "0.17", features = ["tokio", "tokio-runtime", "tokio-native-tls"] }
[build-dependencies]
gst-plugin-version-helper = { path="../../version-helper" }
[lib]
name = "gstvosk"
crate-type = ["cdylib", "rlib"]
path = "src/lib.rs"
[features]
static = []
capi = []
[package.metadata.capi]
min_version = "0.8.0"
[package.metadata.capi.header]
enabled = false
[package.metadata.capi.library]
install_subdir = "gstreamer-1.0"
versioning = false
[package.metadata.capi.pkg_config]
requires_private = "gstreamer-1.0, gstreamer-base-1.0, gobject-2.0, glib-2.0, gmodule-2.0"
# Vosk Speech Recognition GStreamer Plugin
Transcription of speech using [Vosk Toolkit](https://alphacephei.com/vosk/). Can be used to generate subtitles for
movies, live streams, lectures and interviews.
> Vosk is an offline open source speech recognition toolkit. It enables speech recognition for 20+ languages and
> dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, Vietnamese,
> Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, Ukrainian, Kazakh, Swedish, Japanese, Esperanto, Hindi.
> More to come.
>
> https://github.com/alphacep/vosk-api
This GStreamer plugin was inspired by the work of [@MathieuDuponchelle](https://github.com/mathieuduponchelle) in the
[AwsTranscriber](https://gitlab.freedesktop.org/gstreamer/gst-plugins-rs/-/tree/main/net/aws#awstranscriber) element.
## Build
Compiling this project will provide a shared library that can be used by your local GStreamer installation.
```bash
cargo build --release
```
The compiled shared library (for example `./target/release/libgstvosk.dylib` on macOS or
`./target/release/libgstvosk.so` on Linux) must be made loadable by GStreamer. One option is to pass the argument
`--gst-plugin-path=` with the directory that contains the library file every time you run the `gst-launch-1.0`
command line tool.
## Example Usage
This plugin connects via the WebSocket protocol to a [Vosk Server](https://alphacephei.com/vosk/server). The easiest
way to run the Vosk server is using [Docker](https://docs.docker.com/). You can run the server locally using
this command:
```bash
docker run --rm --name vosk-server -d -p 2700:2700 alphacep/kaldi-en:latest
```
Running the recognition server as a separate process comes with the additional benefit that you don't need to
install any special software. It also keeps the speech recognition workload out of your GStreamer pipeline process.
This example will just print out the raw text buffers that are published out by the Vosk transcriber:
```bash
gst-launch-1.0 \
vosktranscriber name=tc ! fakesink sync=true dump=true \
uridecodebin uri=https://studio.blender.org/download-source/d1/d1f3b354a8f741c6afabf305489fa510/d1f3b354a8f741c6afabf305489fa510-1080p.mp4 ! audioconvert ! tc.
```
/// Build script entry point.
///
/// All work is delegated to `gst_plugin_version_helper::info()`.
// NOTE(review): presumably this is what provides the `COMMIT_ID` and `BUILD_REL_DATE`
// values that the plugin definition reads with `env!` — confirm against the helper crate.
fn main() {
    gst_plugin_version_helper::info();
}
// Copyright (C) 2022 Rafael Caricio <rafael@caricio.com>
//
// This Source Code Form is subject to the terms of the Mozilla Public License, v2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at
// <https://mozilla.org/MPL/2.0/>.
//
// SPDX-License-Identifier: MPL-2.0
extern crate core;
use gst::glib;
mod transcriber;
mod vosk_client;
/// Plugin initialisation hook referenced by `gst::plugin_define!`.
///
/// Registers the elements of this plugin (currently only the transcriber) and
/// forwards the registration result unchanged.
fn plugin_init(plugin: &gst::Plugin) -> Result<(), glib::BoolError> {
    transcriber::register(plugin)
}
// Declares the GStreamer plugin entry point for this crate.
//
// The values passed are, in order: the identifier `vosk`, the crate description, the
// `plugin_init` function defined above, a version string built from the crate version and
// `COMMIT_ID`, the license string "MPL", the crate name (passed twice), the crate repository
// URL and `BUILD_REL_DATE`.
//
// NOTE(review): `COMMIT_ID` and `BUILD_REL_DATE` are read at compile time via `env!`, so the
// build fails if they are unset; presumably the build script provides them — confirm.
gst::plugin_define!(
vosk,
env!("CARGO_PKG_DESCRIPTION"),
plugin_init,
concat!(env!("CARGO_PKG_VERSION"), "-", env!("COMMIT_ID")),
"MPL",
env!("CARGO_PKG_NAME"),
env!("CARGO_PKG_NAME"),
env!("CARGO_PKG_REPOSITORY"),
env!("BUILD_REL_DATE")
);
This diff is collapsed.
// Copyright (C) 2020 Mathieu Duponchelle <mathieu@centricular.com>
// Copyright (C) 2022 Rafael Caricio <rafael@caricio.com>
//
// This Source Code Form is subject to the terms of the Mozilla Public License, v2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at
// <https://mozilla.org/MPL/2.0/>.
//
// SPDX-License-Identifier: MPL-2.0
use glib::prelude::*;
use gst::glib;
mod imp;
// Public wrapper type for the element whose implementation lives in `imp::Transcriber`.
// It is declared as extending `gst::Element` and `gst::Object`.
glib::wrapper! {
pub struct Transcriber(ObjectSubclass<imp::Transcriber>) @extends gst::Element, gst::Object;
}
// SAFETY: NOTE(review): these impls assert that a `Transcriber` may be moved to and shared
// between threads. The justification is not visible here; confirm that `imp::Transcriber`
// synchronises all of its mutable state.
unsafe impl Send for Transcriber {}
unsafe impl Sync for Transcriber {}
/// Registers the `vosktranscriber` element factory on behalf of `plugin`.
///
/// The element is registered with `gst::Rank::None`. The result of
/// `gst::Element::register` is returned unchanged.
pub fn register(plugin: &gst::Plugin) -> Result<(), glib::BoolError> {
    let element_type = Transcriber::static_type();
    let element_name = "vosktranscriber";

    gst::Element::register(Some(plugin), element_name, gst::Rank::None, element_type)
}
// Copyright (C) 2022 Rafael Caricio <rafael@caricio.com>
//
// This Source Code Form is subject to the terms of the Mozilla Public License, v2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at
// <https://mozilla.org/MPL/2.0/>.
//
// SPDX-License-Identifier: MPL-2.0
use serde_derive::{Deserialize, Serialize};
/// Configuration message wrapping the actual settings under a single `config` key.
///
/// With the derived `Serialize` implementation this serialises as an object whose only
/// member is `config`.
// NOTE(review): presumably this is the message sent to the Vosk server when a session
// starts; the sending code is not part of this file — confirm.
#[derive(Deserialize, Serialize, Debug)]
pub struct Configuration {
// Private on purpose: values are built through `Configuration::new`.
config: ConfigInner,
}
/// Settings carried inside [`Configuration`].
///
/// Field names are used verbatim as the serialised keys (`sample_rate`, `words`).
#[derive(Deserialize, Serialize, Debug)]
struct ConfigInner {
/// Sample rate the audio will be provided at.
// NOTE(review): presumably expressed in Hz — confirm against the caller.
sample_rate: i32,
/// Show time ranges of each word in the transcription.
words: bool,
}
impl Configuration {
pub fn new(sample_rate: i32) -> Self {
Self {
config: ConfigInner {
sample_rate,
// We always want to receive the words with their time ranges.
words: true,
},
}
}
}
/// A transcription message: the recognised text plus per-word details.
// NOTE(review): neither field has a serde default, so the derived `Deserialize` rejects any
// payload that lacks `result` or `text`. Confirm that the server always sends both for the
// messages parsed into this type, or that callers handle the parse error deliberately.
#[derive(Deserialize, Serialize, Debug)]
pub struct Transcript {
/// Details for each recognised word.
pub result: Vec<WordInfo>,
/// The transcribed text as a single string.
pub text: String,
}
/// Details about a single word of a [`Transcript`].
#[derive(Deserialize, Serialize, Debug)]
pub struct WordInfo {
/// Recognition confidence; read from and written to the JSON key `conf`.
#[serde(rename = "conf")]
pub confidence: f64,
/// The recognised word.
pub word: String,
/// Start of the word's time range.
// NOTE(review): presumably seconds from the start of the audio — confirm.
pub start: f64,
/// End of the word's time range (presumably the same unit as `start` — confirm).
pub end: f64,
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment