summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJean-Marc Valin <jmvalin@jmvalin.ca>2016-11-22 13:57:11 -0500
committerJean-Marc Valin <jmvalin@jmvalin.ca>2016-11-22 13:57:11 -0500
commit6488f1a695187efac0a51565b40fffea58217b94 (patch)
tree7045b30b7a638a74122ce40175056a05d967a58a
parentaaafab328d3bf94dcfb901be6a35143475699fd5 (diff)
downloadopus-6488f1a695187efac0a51565b40fffea58217b94.tar.gz
Add VAD HMM, retrained MLP
-rw-r--r--celt/celt.h1
-rw-r--r--src/analysis.c34
-rw-r--r--src/analysis.h1
-rw-r--r--src/mlp_data.c186
4 files changed, 120 insertions, 102 deletions
diff --git a/celt/celt.h b/celt/celt.h
index 863a0644..d69cd44c 100644
--- a/celt/celt.h
+++ b/celt/celt.h
@@ -57,6 +57,7 @@ typedef struct {
float noisiness;
float activity;
float music_prob;
+ float vad_prob;
int bandwidth;
float activity_probability;
} AnalysisInfo;
diff --git a/src/analysis.c b/src/analysis.c
index e4714817..3a9cc3ad 100644
--- a/src/analysis.c
+++ b/src/analysis.c
@@ -170,9 +170,8 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
if (tonal->read_pos>=DETECT_SIZE)
tonal->read_pos-=DETECT_SIZE;
- /* Compensate for the delay in the features themselves.
- FIXME: Need a better estimate the 10 I just made up */
- curr_lookahead = IMAX(curr_lookahead-10, 0);
+ /* The -1 is to compensate for the delay in the features themselves. */
+ curr_lookahead = IMAX(curr_lookahead-1, 0);
psum=0;
/* Summing the probability of transition patterns that involve music at
@@ -182,7 +181,7 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
for (;i<DETECT_SIZE;i++)
psum += tonal->pspeech[i];
psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
- /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
+ /*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob, info_out->activity_probability, info_out->tonality);*/
info_out->music_prob = psum;
}
@@ -575,15 +574,11 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
#ifndef DISABLE_FLOAT_API
mlp_process(&net, features, frame_probs);
frame_probs[0] = .5f*(frame_probs[0]+1);
- //frame_probs[0] = MIN32(.98, frame_probs[0]+.1);
- //frame_probs[0] = .1 + .9*frame_probs[0];
/* Curve fitting between the MLP probability and the actual probability */
/*frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);*/
/* Probability of active audio (as opposed to silence) */
frame_probs[1] = .5f*frame_probs[1]+.5f;
frame_probs[1] *= frame_probs[1];
- /* Consider that silence has a 50-50 probability. */
- frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
/* Probability of speech or music vs noise */
info->activity_probability = frame_probs[1];
@@ -606,8 +601,28 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
float music0;
float p, q;
+ /* More silence transitions for speech than for music. */
+ tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob);
+ p = MAX16(.05f,MIN16(.95f,frame_probs[1]));
+ q = MAX16(.05f,MIN16(.95f,tonal->vad_prob));
+ beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
+ /* p0 and p1 are the probabilities of silence and active audio at this
+ frame using only information from the previous frame and applying the
+ state transition model */
+ p0 = (1-tonal->vad_prob)*(1-tau) + tonal->vad_prob *tau;
+ p1 = tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau;
+ /* We apply the current probability with exponent beta to work around
+ the fact that the probability estimates aren't independent. */
+ p0 *= (float)pow(1-frame_probs[1], beta);
+ p1 *= (float)pow(frame_probs[1], beta);
+ /* Normalise the probabilities to get the Markov probability of active audio. */
+ tonal->vad_prob = p1/(p0+p1);
+ info->vad_prob = tonal->vad_prob;
+ /* Consider that silence has a 50-50 probability of being speech or music. */
+ frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f;
+
/* One transition every 3 minutes of active audio */
- tau = .0001f*frame_probs[1];
+ tau = .0001f;
/* Adapt beta based on how "unexpected" the new prob is */
p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
@@ -625,6 +640,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
tonal->music_prob = p1/(p0+p1);
info->music_prob = tonal->music_prob;
+ /*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_prob, tonal->vad_prob);*/
/* This chunk of code deals with delayed decision. */
psum=1e-20f;
/* Instantaneous probability of speech and music, with beta pre-applied. */
diff --git a/src/analysis.h b/src/analysis.h
index 971c8e0c..92e613b9 100644
--- a/src/analysis.h
+++ b/src/analysis.h
@@ -57,6 +57,7 @@ typedef struct {
float cmean[8];
float std[9];
float music_prob;
+ float vad_prob;
float Etracker;
float lowECount;
int E_count;
diff --git a/src/mlp_data.c b/src/mlp_data.c
index b63d583c..ac18ab1b 100644
--- a/src/mlp_data.c
+++ b/src/mlp_data.c
@@ -4,104 +4,104 @@
#include "mlp.h"
-/* RMS error was 0.315104, seed was 1479763182 */
-/* 0.006838 0.032708 (0.315104 0.315104) 7.21128e-09 8044 */
+/* RMS error was 0.307058, seed was 1479787111 */
+/* 0.006588 0.033188 (0.307072 0.307058) 3.61895e-07 6271 10 */
static const float weights[450] = {
/* hidden layer */
-1.16791f, 0.0117457f, -0.173725f, 0.00088526f, -0.182404f,
--0.0160565f, -0.0683622f, -0.101919f, -0.0460863f, 0.00236859f,
-0.0214204f, -0.0522124f, -0.00439659f, 0.126548f, -0.0747379f,
-0.00759737f, -0.0465382f, 0.0282859f, -0.00823783f, 0.49046f,
--0.0038811f, 4.67584f, -1.75408f, 0.189231f, 6.88353f,
--0.169735f, -0.302665f, 0.0344662f, 0.0376251f, 0.0824736f,
--0.00808218f, -0.00840575f, -0.0134915f, -0.0852928f, -0.0557674f,
--0.0524697f, -0.0267727f, -0.0711738f, 0.168746f, 0.345237f,
-0.114282f, 0.0130365f, 0.0224121f, -0.124843f, -0.119323f,
--0.147682f, -0.149418f, 1.44711f, -0.0385809f, 1.64565f,
-1.702f, 4.94977f, 1.1675f, -0.00709793f, -0.028046f,
-0.0130692f, 0.0110228f, 0.00442845f, 0.0270771f, -0.012915f,
--0.0205496f, 0.00731574f, 0.0290182f, -0.0137988f, -0.0496688f,
-0.0497328f, -0.0500723f, 0.0693743f, 0.125171f, 0.002266f,
-0.0820088f, 0.0801653f, -0.0815278f, -0.312179f, 0.499351f,
-0.0146673f, -0.0729864f, -3.07368f, -1.12587f, 0.0807415f,
-0.0317455f, 0.0629169f, 0.0489931f, -0.0143552f, -0.0121456f,
--0.000922163f, -0.0195092f, -0.0354053f, -0.0316398f, -0.0409961f,
--0.0762715f, -0.119086f, -0.0515177f, -0.286433f, -0.0256642f,
-0.00490787f, 0.089922f, 0.272454f, -0.00799747f, -3.92076f,
--0.923539f, -0.344524f, -1.1584f, -0.232077f, -2.54335f,
--0.0289305f, 0.180725f, -0.0541124f, 0.113015f, 0.0614053f,
--0.194218f, 0.126639f, -0.0941479f, -0.0399991f, 0.0308558f,
--0.097045f, -0.00974284f, -0.234078f, -0.117714f, -0.742824f,
--0.0562217f, 0.225729f, -0.0762788f, -1.06489f, 0.124149f,
-0.361664f, -0.174717f, -0.413253f, 1.55378f, 0.635495f,
-0.831817f, 0.00214014f, 0.0301272f, 0.0165128f, -0.00468816f,
--0.00518814f, -0.00861749f, -0.00566215f, -0.00880481f, -0.00152252f,
--0.0150631f, -0.0182496f, -0.0161498f, -0.0868953f, 0.0632226f,
--0.0385318f, 0.0151289f, 0.0268427f, -0.00495099f, 0.16565f,
-0.0273735f, 0.716388f, 1.38317f, -0.0482318f, -0.817091f,
--0.823276f, 0.126554f, 0.008395f, -0.0788533f, -0.0305483f,
--0.0141837f, 0.0287401f, 0.0220461f, 0.0798372f, -0.0162728f,
--0.034366f, 0.0578138f, -0.027475f, -0.369605f, -0.690334f,
--0.287187f, -0.174793f, 0.719982f, 0.792722f, 1.08376f,
-3.42282f, -0.0205459f, -1.11499f, -0.23692f, 5.31937f,
-0.338354f, 2.88862f, -0.0488146f, -0.0693933f, -0.0525298f,
--0.0691915f, -0.0748021f, -0.0479683f, 0.0557816f, -0.0234204f,
-0.0711225f, -0.0284554f, 0.0748894f, 0.0312238f, 0.0430777f,
--0.149758f, -0.0643999f, -0.328943f, 0.0220431f, -0.00670375f,
--0.150891f, 0.0826483f, -0.0416984f, 0.91942f, 0.288807f,
--0.784006f, -0.316274f, -0.914043f, 0.847168f, -0.0511541f,
-0.0591144f, 0.0444577f, -0.0523042f, 0.0435139f, -0.0368748f,
--0.0238474f, 0.0578224f, 0.033423f, -0.00959278f, 0.000368111f,
--0.033063f, 0.0498318f, -0.193449f, 0.342364f, 0.0248039f,
-0.106036f, -0.105387f, 0.660923f, -0.0940084f, 0.649895f,
--0.879327f, 0.567143f, 1.64079f, 4.26012f, -0.891701f,
-0.0566126f, 0.0565989f, 0.126332f, 0.0622828f, 0.00303243f,
--0.0209919f, -0.0316721f, -0.00332618f, -0.0104709f, -0.0439127f,
--0.052425f, -0.0328074f, -0.000365187f, -0.151337f, -0.0136578f,
-0.00736587f, -0.0245835f, 0.104102f, 0.246421f, -0.0320614f,
--2.13688f, -0.644779f, -0.587536f, -0.0876224f, -0.845826f,
--0.675112f, 0.00497933f, -0.0138469f, 0.0478949f, -0.0193546f,
-0.0743611f, 0.0919097f, -0.021396f, -0.00987072f, 0.0906004f,
-0.15022f, -0.0719682f, -0.123023f, -0.032162f, -0.226081f,
-0.0112323f, 0.143681f, 0.0963748f, 0.388256f, 0.499536f,
--0.0717579f, 0.245617f, -0.541487f, 0.495578f, 2.30802f,
-0.995929f, -0.829615f, -0.00364842f, 0.0295008f, -0.00799017f,
-0.0072746f, -0.063231f, -0.129824f, -0.020175f, -0.0221331f,
--0.121943f, -0.138491f, -0.0277175f, 0.0259129f, -0.113353f,
-0.0901211f, -0.0515522f, 0.00156965f, 0.0223951f, -0.0542028f,
--0.198752f, -0.00318196f, -0.711666f, 1.09381f, 0.000768032f,
--2.31925f, -0.019048f, -1.17297f, -0.0540454f, -0.050889f,
--0.00426479f, 0.185899f, 0.0558439f, 0.123861f, -0.00353509f,
--0.178127f, 0.0683067f, -0.00891811f, 0.0577757f, 0.12994f,
-0.298556f, -0.0244329f, -0.455871f, 0.106323f, -0.268203f,
-0.034243f, -0.42948f, -0.228146f, -0.421731f, -1.2974f,
-2.90981f, 1.83682f, 1.22827f, 0.646216f, 0.00618927f,
--0.163023f, -0.0588448f, -0.0680873f, 0.00265286f, 0.15079f,
-0.0180199f, 0.0521558f, -0.0251215f, 0.211758f, 0.000908394f,
--0.0665474f, -0.179624f, 0.0223256f, -0.258492f, 0.0329357f,
-0.0776958f, -0.0431089f, 0.627821f, 0.0300259f, -1.08477f,
-1.59281f, 0.523867f, -3.2745f, -1.30564f, -0.228395f,
--0.0872362f, 0.0218742f, -0.0650792f, -0.0307158f, 0.0112539f,
--0.0289815f, 0.00459511f, 0.00851279f, -0.0055035f, 0.032939f,
-0.00853459f, -0.0193472f, 0.0590096f, 0.323122f, 0.0439625f,
-0.0102308f, 0.103701f, -0.389139f, 0.629254f, 0.0838598f,
-1.43368f, -0.658163f, 0.0295287f, -0.46143f, 3.01521f,
--0.749628f, -0.0302256f, -0.106793f, 0.0680327f, -0.0972569f,
-0.0024169f, 0.0499889f, -0.0418845f, 0.0831352f, -0.0491967f,
-0.0701002f, -0.0329629f, -0.0539425f, 0.151929f, 0.187274f,
-0.29397f, 0.132495f, -0.0677414f, -0.0609771f, -0.475259f,
-0.0238476f, -1.7368f, -0.280829f, -0.0644994f, 0.342657f,
-2.05706f,
+0.346452f, -0.000986403f, 0.0238469f, 0.00787237f, -0.00327841f,
+-0.00144448f, -0.00834152f, 0.00662626f, 0.00827114f, -0.00625709f,
+-0.0111221f, -0.0101708f, -0.0220237f, -0.00579342f, 0.0148173f,
+-0.0502682f, 0.0129419f, -0.0202994f, -0.039075f, 0.0529568f,
+0.00938264f, -0.667579f, 0.817718f, 0.212861f, -0.592366f,
+0.182686f, 0.104437f, 0.0418449f, 0.0561753f, 0.186168f,
+0.0837228f, -0.0311366f, 0.00600564f, -0.0454366f, -0.0530324f,
+-0.0216303f, -0.0526483f, -0.0203131f, 0.0659832f, 0.0568102f,
+0.00508217f, -0.0504551f, -0.109038f, -0.0575989f, -0.0545253f,
+-0.499059f, -0.00813707f, 4.57328f, 0.565364f, -2.05258f,
+7.44677f, -1.00112f, -0.438463f, -0.0253055f, -0.0719549f,
+-0.0384969f, -0.0320396f, 0.0327494f, 0.0315827f, 0.0106198f,
+0.00800117f, 0.0197368f, 0.042733f, 0.0292928f, 0.00690012f,
+-0.0710759f, -0.00229299f, 0.123645f, -0.0151665f, 0.113013f,
+0.109826f, 0.20492f, 0.0322878f, 0.50752f, -0.767495f,
+-0.182978f, -0.288763f, -0.340734f, 0.0473468f, 0.0397243f,
+0.0974594f, 0.0526624f, 0.0725072f, -0.0974313f, -0.21646f,
+-0.134403f, -0.0713687f, 0.0559797f, 0.0945215f, -0.037486f,
+0.170421f, -0.299144f, 0.119146f, -0.0586984f, -0.0649505f,
+0.147221f, 0.213973f, -0.370208f, 0.0286924f, 2.32318f,
+-1.11646f, -1.33634f, -1.3209f, -1.33462f, 0.851591f,
+-0.0387395f, 0.00394112f, 0.0538956f, 0.0142521f, 0.0826941f,
+-0.00891185f, 0.031608f, -0.0519062f, 0.0840755f, 0.00711923f,
+0.0553999f, -0.0140473f, 0.289961f, 0.0159519f, 0.408393f,
+0.0706347f, -0.0821191f, 0.094304f, 0.258852f, -0.0872749f,
+-1.31161f, 1.76215f, 1.1734f, -0.218017f, 0.200685f,
+0.113221f, 0.0438638f, -0.11757f, -0.00730474f, -0.034559f,
+-0.0527583f, 0.124873f, -0.0630957f, 0.0199118f, 0.0204139f,
+-0.072775f, 0.0698309f, 0.109033f, 0.418322f, 0.0581625f,
+0.445942f, -0.0967659f, -0.475703f, -0.0364438f, -0.25813f,
+-0.0526156f, 1.95193f, -1.89237f, -0.2298f, 0.484577f,
+-0.70406f, -3.37848f, 0.037487f, 0.0842667f, 0.0862521f,
+0.0203154f, -0.0523732f, -0.120697f, -0.0424331f, -0.0358556f,
+-0.0237288f, -0.0563953f, 0.0413413f, 0.0725612f, 0.484948f,
+-0.0337245f, 0.0919432f, -0.0418379f, -0.289835f, 0.108197f,
+-0.731154f, 0.046635f, -0.0115669f, 1.96781f, 0.22253f,
+-1.44544f, -70.1674f, 1.56784f, -0.0231552f, 0.0226579f,
+0.0368208f, -0.182944f, -0.0664526f, -0.0419537f, -0.0599649f,
+0.177215f, -0.032529f, 0.0417212f, -0.020503f, -0.169923f,
+-0.00776405f, -0.0282886f, 0.582991f, 0.0255885f, -0.0161892f,
+0.0409108f, 1.13725f, 0.149326f, -3.03502f, 4.07061f,
+-1.22442f, -0.955553f, -0.652578f, -0.442688f, -0.0290697f,
+-0.246261f, -0.0545186f, -0.211004f, 0.0218397f, 0.0645374f,
+-0.0239089f, 0.0607779f, -0.00397812f, 0.103708f, 0.0654522f,
+0.0402445f, -0.161771f, -0.119996f, 0.0662066f, -0.0221619f,
+0.156356f, 0.158121f, -0.453376f, 0.100038f, -0.0380598f,
+-0.655184f, -0.874463f, 0.441085f, -0.0189581f, -1.64911f,
+0.116019f, 0.0778868f, 0.022377f, 0.0567152f, -0.0536254f,
+-0.0525457f, -0.0163194f, -0.023485f, -0.0900654f, -0.042401f,
+-0.0452115f, -0.0980355f, -0.30054f, -0.0340618f, -0.315325f,
+-0.0454255f, 0.0550435f, -0.117775f, 0.745518f, -0.111094f,
+-4.59615f, -2.11679f, 0.377472f, -0.398044f, 0.100191f,
+0.987546f, -0.105993f, 0.211353f, 0.0199354f, -0.110528f,
+0.0587012f, -0.146575f, 0.0240706f, 0.0808704f, 0.0662973f,
+-0.133194f, 0.063903f, -0.0663394f, 0.0840598f, -0.157854f,
+-0.0208042f, -0.0415707f, 0.0616584f, 0.134096f, 0.0971693f,
+0.0607324f, 2.18513f, 0.424779f, 1.18391f, 0.800226f,
+4.50274f, -1.97694f, -0.0996252f, 0.00959504f, -0.0687779f,
+0.0108385f, 0.0209571f, -0.0232143f, 0.0324491f, -0.0284241f,
+0.0349509f, 0.00658307f, 0.0670332f, 0.0135798f, -0.0242892f,
+0.114423f, 0.0906438f, -0.00625305f, 0.0991216f, 0.0018775f,
+-0.0631878f, 0.00596324f, 1.98581f, -0.35355f, 0.000361734f,
+1.20065f, 0.845175f, -0.812598f, -0.0566753f, 0.2034f,
+0.108179f, -0.0565119f, 0.0033704f, -0.138371f, -0.0822029f,
+-0.0330501f, -0.0454151f, -0.0938773f, -0.0865522f, 0.296783f,
+0.51366f, -0.138525f, 0.138197f, -0.280676f, 0.00719433f,
+0.368148f, 0.78515f, -0.0206407f, 0.501422f, 4.04115f,
+3.22981f, 1.79516f, 18.177f, -0.764188f, -0.0305839f,
+-0.0102731f, -0.0188846f, -0.0703807f, 0.0311089f, 0.000158915f,
+-0.0778763f, 0.0275067f, -0.00954754f, 0.0199833f, 0.0500232f,
+0.0661436f, 0.469303f, 0.50695f, -0.218825f, 0.00533274f,
+-0.512075f, -1.9125f, 0.639009f, -0.148785f, 1.4833f,
+0.852433f, 1.34851f, -0.734623f, 0.210932f, -0.789634f,
+-0.0114672f, 0.150499f, 0.0381371f, 0.0691792f, -8.753e-06f,
+0.0263655f, 0.0247899f, 0.0172167f, 0.00014426f, -0.00153884f,
+0.0203275f, -0.0194041f, -0.0110974f, 0.108975f, -0.0697057f,
+0.0964956f, -0.10174f, -0.0626501f, 0.0680276f, 0.0215191f,
+-1.26794f, 3.33681f, 0.366054f, -3.0353f, -1.45105f,
+-1.05225f, 0.0302715f, 0.0330566f, 0.0275157f, 0.0161609f,
+-0.10335f, -0.144689f, -0.052541f, -0.024565f, -0.169548f,
+-0.215189f, -0.0575187f, 0.00427752f, -0.0549234f, 0.103446f,
+-0.0833558f, 0.0133471f, -0.0620358f, -0.134324f, -0.291327f,
+-0.043899f, 0.135107f, -0.0220095f, 0.302584f, -1.94763f,
+-0.32957f,
/* output layer */
-1.33932, 1.70279, -1.27695, -4.52246, 0.0740156,
-1.71598, 8.63902, -1.26394, 1.60628, -1.2561,
-1.62678, -1.8133, -3.03604, 1.71492, 0.531547,
--1.01656, 1.71594, 6.91163, -0.566851, 0.438947,
-0.0745278, -7.43604, -0.0317548, 0.585059, -0.646684,
--0.632673, -5.15384, 8.68134, -0.0264247, 1.16378,
--0.232851, 1.29058, 2.1686, -0.757127, };
+-0.138449, 23.1555, 2.02078, 4.63684, 1.63151,
+-2.36045, 1.80583, 0.923012, -1.55375, 1.01092,
+-0.215321, -0.642902, 2.25817, -0.491944, 2.25117,
+-2.19328, -1.98994, 5.62795, 0.957799, 0.472934,
+3.45822, 0.248562, -0.117404, -0.274879, 0.683224,
+0.199083, -1.49602, -2.40808, -2.27289, -1.6856,
+0.325958, 1.21798, 3.14007, 1.50236, };
static const int topo[3] = {25, 16, 2};