//============================================================================= // // Copyright (c) 2006-2007, Carnegie Mellon University. // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in // the documentation and/or other materials provided with the // distribution. // // This work was supported in part by funding from the Defense Advanced // Research Projects Agency and the National Science Foundation of the // United States of America, and the CMU Sphinx Speech Consortium. // // THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND // ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, // THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY // NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // //============================================================================= //----------------------------------------------------------------------------- // // LIVEMODE.CPP - Functions maintaining the state of the server when // running in live mode and getting partial and final // recognition results from the engines. // // ---------------------------------------------------------------------------- // // BEFORE MAKING CHANGES TO THIS CODE, please read the appropriate // documentation, available in the Documentation folder. // // ANY SIGNIFICANT CHANGES made should be reflected back in the documentation // file(s) // // ANY CHANGES made (even small bug fixes, should be reflected in the history // below, in reverse chronological order // // HISTORY -------------------------------------------------------------------- // // [2005-08-05] (antoine) : started this // //----------------------------------------------------------------------------- #include using namespace Olympus; #include "GalaxyInterface.h" #include "EngineInterface.h" #include "LiveMode.h" #include "ad.h" /* CMU A2D routines/data structures */ #include "BufferedAD.h" #include "VAD.h" #include "PowerVAD.h" #include "GMMVAD.h" // Number of bytes of audio read at each iteration from the BufferedAD #define AUDIO_BUFFER_SIZE 16384 int iAudioState = ST_IDLE; TSpeechState ssState = ssSilence; char *sUttID = NULL; FILE *fpUttRawFile = NULL; char **asEngineLastHyp = NULL; int *aiEngineLastHypTimestamp = NULL; // Provides an id for each engine_proc_partial request // (mostly for debug purposes, to synchronize AudioServer // and engine logs) int iPartialRequestCounter = 0; long iLastPartialHypTimestamp = 0; long iPartialHypMinDelay = 10; extern Olympus::Hubs hubs; // Performs initializations specific to Live mode void InitLiveMode() { // Allocate the array storing the last partial hyp for each engine asEngineLastHyp = (char**)malloc(iNumEngines*sizeof(char*)); aiEngineLastHypTimestamp = (int*)malloc(iNumEngines*sizeof(int)); for (int i = 0; i < iNumEngines; i++) { asEngineLastHyp[i] = (char*)malloc(MAX_STRING_SIZE*sizeof(char)); aiEngineLastHypTimestamp[i] = -1; } } // Sends messages to the hub indicating a change in speech state (speech vs non-speech) void notifyStateChange(TSpeechState ssNewState) { // Log(STD_STREAM, "State=%d", ssNewState); bool bInSpeech = (ssState == ssMainSpeech) || (ssState == ssDTMF); bool bNewInSpeech = (ssNewState == ssMainSpeech) || (ssNewState == ssDTMF); // The state did not change since last time, do nothing if (bNewInSpeech == bInSpeech) return; // String representation of the state string sState; if (ssNewState==ssMainSpeech) sState = "speech"; else if (ssNewState==ssDTMF) sState = "dtmf"; else sState = "pause"; Log(STD_STREAM, "Start of %s segment detected [User:%s]", sState.c_str(), sUttID); // Fills in the properties frame Gal_Frame gfProperties = Gal_ReadFrameFromString(FormatString( "{c properties \ :state \"%s\"\ }", sState.c_str()).c_str()); // Computes the timestamp of the event int iTimestamp = (iABStartPointer + iABReadPointer)/iSampleRateMS + iTimestampOffset; // Start of speech detection typically introduces a delay if (ssNewState == ssMainSpeech) iTimestamp -= pvVAD->GetSilenceToSpeechDelay(); // Creates the result frame Gal_Frame gfStateTransition = Gal_MakeFrame("main", GAL_CLAUSE); Gal_SetProp(gfStateTransition, ":event_type", Gal_StringObject("vad")); Gal_SetProp(gfStateTransition, ":event_timestamp", Gal_IntObject(iTimestamp)); Gal_SetProp(gfStateTransition, ":properties", Gal_FrameObject(gfProperties)); // Sends state change notification to the hub int iLength; char *sFrame = Gal_PPFrameToString(gfStateTransition, NULL, &iLength); Log( STD_STREAM, "Sending VAD state change notification to the hub:\n%s", sFrame); // sends the frame to the hub GalIO_CommWriteFrame(pCommStruct, gfStateTransition, GAL_FALSE); Gal_FreeFrame(gfStateTransition); ssState = ssNewState; } // A: Main loop void ProcessAudio() { char sToEngine[MAX_MESSAGE_SIZE]; int16 pBuffer[AUDIO_BUFFER_SIZE]; if (iAudioState == ST_IDLE) { FlushAllEngineSockets(); return; } else if (iAudioState == ST_INSIDE_UTT) { // Gets the next block of audio data int iNumRead = BufferedADRead(pBuffer, AUDIO_BUFFER_SIZE); if (iNumRead > 0) { // Log the audio to the utterance raw file if (fpUttRawFile) { fwrite(pBuffer, iNumRead, sizeof(short), fpUttRawFile); } // Check whether we're in a speech or silence/noise segment TSpeechState ssNewState = pvVAD->GetCurrentSpeechState(pBuffer, iNumRead); notifyStateChange(ssNewState); // Announces the upcoming data to all engines sprintf_s( sToEngine, MAX_MESSAGE_SIZE, "engine_proc_raw %d \n", iNumRead * (int)sizeof(short)); SendMessageToAllEngines( sToEngine); // Sends the actual data SendDataToAllEngines((char*)pBuffer, iNumRead * (int)sizeof(short)); } long iTimestamp = GetSessionTimestamp(); if (iTimestamp - iLastPartialHypTimestamp > iPartialHypMinDelay) { iPartialRequestCounter++; Gal_Frame gfPartial = CollectPartialHypotheses(); // if we got some results, sends the frame to the hub if (gfPartial) { //if we're connected to multiple hubs, share this parse with all of them hubs.send(gfPartial); Gal_FreeFrame(gfPartial); } iLastPartialHypTimestamp = iTimestamp; } } else if (iAudioState == ST_OUTSIDE_UTT) { FlushAllEngineSockets(); // Gets the next block of audio data int iNumRead = BufferedADRead(pBuffer, AUDIO_BUFFER_SIZE); if (iNumRead > 0) { // Check whether we're in a speech or silence/noise segment TSpeechState ssNewState = pvVAD->GetCurrentSpeechState(pBuffer, iNumRead); notifyStateChange(ssNewState); } } } // A: Signals the beginning of an utterance void StartUtterance(int iTimestamp, char *sID) { SetBufferedADPointer(iTimestamp); if (sID) sUttID = _strdup(sID); else sUttID = _strdup("DUMMY_ID"); // Opens the utterance raw audio log file char sRawUttFileName[MAX_FILENAME_SIZE]; sprintf_s(sRawUttFileName, MAX_FILENAME_SIZE, "%s%s.raw", hub_log_dir, sUttID);; if (fopen_s(&fpUttRawFile, sRawUttFileName, "wb") != 0) { Log(ERR_STREAM, "Failed to open utterance log file %s", sRawUttFileName); } else { Log(STD_STREAM, "Opened utterance log file %s", sRawUttFileName); } char sToEngine[MAX_MESSAGE_SIZE]; sprintf_s(sToEngine, MAX_MESSAGE_SIZE, "engine_begin_utt %s %d \n", sUttID, iTimestamp); SendMessageToAllEngines( sToEngine); for (int i = 0; i < iNumEngines; i++) { strncpy_s(asEngineLastHyp[i], MAX_STRING_SIZE, "", 1); } iLastPartialHypTimestamp = 0; // (antoine) the following is just for debugging purposes // SendMessageToAllEngines("engine_end_utt \nengine_proc_result \n"); iAudioState = ST_INSIDE_UTT; } // Requests and gathers the current partial hypothesis for each engine Gal_Frame CollectPartialHypotheses() { int iTimestamp = 0; // Gets the current partial hypothesis from each engine Gal_Frame *gfResults = (Gal_Frame*)GetPartialResultFromAllEngines(); // Collect results from all engines bool bGotResult = false; Gal_Object *pgoUtterances = (Gal_Object *)malloc(sizeof(Gal_Object)*iNumEngines); for (int i = 0; i < iNumEngines; i++) { if (gfResults[i]) { // Only send if there is a partial hypothesis and it is different from // the last one we sent // extracts the list of hypothesis returned by this engine int iNumHyps = 0; Gal_Object *pgoHyps = Gal_GetList(gfResults[i], ":partial_results", &iNumHyps); // for now we only use the first hypothesis // THIS SHOULD BE CHANGED TO ALLOW N-BEST LISTS pgoUtterances[i] = pgoHyps[0]; for (int j = 1; j < iNumHyps; j++) { Gal_FreeObject(pgoHyps[j]); } // this tests whether at least one hypothesis is non-null Gal_Frame gfHyp = Gal_FrameValue(pgoUtterances[i]); // check whether this is an emptyhyp or not char *lpszConfHyp = Gal_GetString(gfHyp, ":hyp"); if (lpszConfHyp && (strlen(lpszConfHyp)>0)) { if (Gal_GetObject(gfHyp, ":end_timestamp")) { iTimestamp = Gal_GetInt(gfHyp, ":end_timestamp"); } if (iTimestamp > aiEngineLastHypTimestamp[i]) { Log( STD_STREAM, "%s: %s", asEngineName[i], lpszConfHyp); bGotResult = true; } else { // we didn't get any new result from this engine Gal_FreeObject(pgoUtterances[i]); Gal_Frame gfEmptyHypFrame = Gal_MakeFrame("hypothesis", GAL_CLAUSE); Gal_SetProp(gfEmptyHypFrame, ":engine_name", Gal_StringObject(asEngineName[i])); Gal_SetProp(gfEmptyHypFrame, ":uttid", Gal_StringObject(sUttID)); Gal_SetProp(gfEmptyHypFrame, ":emptyhyp", Gal_StringObject("This space intentionally left blank")); pgoUtterances[i] = Gal_FrameObject(gfEmptyHypFrame); } } else { // we didn't get any new result from this engine Gal_FreeObject(pgoUtterances[i]); Gal_Frame gfEmptyHypFrame = Gal_MakeFrame("hypothesis", GAL_CLAUSE); Gal_SetProp(gfEmptyHypFrame, ":engine_name", Gal_StringObject(asEngineName[i])); Gal_SetProp(gfEmptyHypFrame, ":uttid", Gal_StringObject(sUttID)); Gal_SetProp(gfEmptyHypFrame, ":emptyhyp", Gal_StringObject("This space intentionally left blank")); pgoUtterances[i] = Gal_FrameObject(gfEmptyHypFrame); } } else { // we didn't get any result from this engine Gal_Frame gfEmptyHypFrame = Gal_MakeFrame("hypothesis", GAL_CLAUSE); Gal_SetProp(gfEmptyHypFrame, ":engine_name", Gal_StringObject(asEngineName[i])); Gal_SetProp(gfEmptyHypFrame, ":uttid", Gal_StringObject(sUttID)); Gal_SetProp(gfEmptyHypFrame, ":emptyhyp", Gal_StringObject("This space intentionally left blank")); pgoUtterances[i] = Gal_FrameObject(gfEmptyHypFrame); } } if (bGotResult) { Gal_Frame gfRepl = Gal_MakeFrame("main", GAL_CLAUSE); Gal_SetProp(gfRepl, ":timestamp", Gal_IntObject(iTimestamp)); Gal_SetProp(gfRepl, ":partialhyps", Gal_ListObject(pgoUtterances, iNumEngines)); int iLength; char *sFrame = Gal_PPFrameToString(gfRepl, NULL, &iLength); Log( STD_STREAM, "Sending partial results [User:%s]\n%s", sUttID, sFrame); free(sFrame); return gfRepl; } else { Log( STD_STREAM, "No partial hypothesis yet [User:%s]", sUttID); return NULL; } } // A: Signals the end of an utterance and gets // the final result from the engines Gal_Frame FinalizeUtterance() { int iTimestamp = 0; iAudioState = ST_OUTSIDE_UTT; // Close the utterance file if (fpUttRawFile) { fclose(fpUttRawFile); } // Asks and wait for the final hypothesis from each engine Gal_Frame *gfResults = (Gal_Frame*)GetFinalResultFromAllEngines(); // Extracts the hypothesis from each engine bool bGotResult = false; Gal_Object *pgoUtterances = (Gal_Object *)malloc(sizeof(Gal_Object)*asTotalNBestSize); // (bthomson) Changed for NBest support. Previously * iNumEngines int curOutputN = 0; for (int i = 0; i < iNumEngines; i++) { if (gfResults[i]) { // extracts the list of hypothesis returned by this engine int iNumHyps = 0; Gal_Object *pgoHyps = Gal_GetList(gfResults[i], ":results", &iNumHyps); // (bthomson) CHANGED TO ALLOW N-BEST LISTS for (int j=0; j < asEngineNBestSize[i]; j++) { pgoUtterances[curOutputN] = pgoHyps[j]; // this tests whether at least one engine returned a non-null // hypothesis Gal_Frame gfHyp = Gal_FrameValue(pgoUtterances[curOutputN]); // check whether this is an emptyhyp or not char *lpszConfHyp = Gal_GetString(gfHyp, ":hyp"); if (lpszConfHyp && (strlen(lpszConfHyp)>0)) { if (Gal_GetObject(gfHyp, ":end_timestamp")) { iTimestamp = Gal_GetInt(gfHyp, ":end_timestamp"); } // this is a real hyp (non-empty) bGotResult = true; } Log( STD_STREAM, "%s: %s", asEngineName[i], lpszConfHyp); curOutputN ++; } // Free any hypotheses not used for (int j = asEngineNBestSize[i]; j < iNumHyps; j++) { Gal_FreeObject(pgoHyps[j]); } } else { for (int j=0; j < asEngineNBestSize[i]; j++) { // we didn't get any result from this engine Gal_Frame gfEmptyHypFrame = Gal_MakeFrame("hypothesis", GAL_CLAUSE); Gal_SetProp(gfEmptyHypFrame, ":engine_name", Gal_StringObject(asEngineName[i])); Gal_SetProp(gfEmptyHypFrame, ":uttid", Gal_StringObject(sUttID)); Gal_SetProp(gfEmptyHypFrame, ":emptyhyp", Gal_StringObject("This space intentionally left blank")); pgoUtterances[j] = Gal_FrameObject(gfEmptyHypFrame); Log( STD_STREAM, "%s: **NO RESULTS**", asEngineName[i]); } } } Gal_Frame gfRepl = Gal_MakeFrame("main", GAL_CLAUSE); if (bGotResult) { Gal_SetProp(gfRepl, ":timestamp", Gal_IntObject(iTimestamp)); Gal_SetProp(gfRepl, ":confhyps", Gal_ListObject(pgoUtterances, asTotalNBestSize )); // (bthomson) Change for NBest handling } else { // We didn't get any results from the engines, use current timestamp iTimestamp = GetSessionTimestamp(GetCurrentAbsoluteTimestamp()); Gal_SetProp(gfRepl, ":timestamp", Gal_IntObject(iTimestamp)); Gal_SetProp(gfRepl, ":emptyhyp", Gal_StringObject("This space intentionally left blank")); } int iLength; char *sFrame = Gal_PPFrameToString(gfRepl, NULL, &iLength); Log( STD_STREAM, "Sending final results [User:%s]\n%s", sUttID, sFrame); free(sFrame); return gfRepl; } // A: Cancels the current utterance void CancelUtterance() { int iTimestamp = 0; iAudioState = ST_OUTSIDE_UTT; // Close the utterance file if (fpUttRawFile) { fclose(fpUttRawFile); } // Notifies all engines to cancel the utterance SendMessageToAllEngines("engine_cancel_utt \n"); FlushAllEngineSockets(); }