/* PocketSphinx recognition engine: accepts one client over a socket, reads text commands and raw audio from it, decodes with the global decoder psd and sends hypothesis frames back. NOTE(review): this copy of the file is damaged -- the header names after each #include are missing and the original line breaks have been collapsed, so line comments now run into the code that followed them. Restore both from the original file before building. */ #include using namespace Olympus; #include #ifdef __cplusplus extern "C" { #endif #include #include #ifdef __cplusplus } #endif #define DEFAULT_ARGFN "decoder-config/comm.arg" static char *argfn; #define DEFAULT_LISTEN_PORT 9990 #define RAW_MODE 0 #define CMD_MODE 1 #define SOCK_BUF_SIZE 131072 #define HYPSTR_SIZE 4096 // Time between two frames, in ms #define SPHINX_FRAME_SIZE 10 #define RAW_BUF_SIZE 131072 #define PARTIAL_RESULT_INTERVAL 25 #define UTTID_SIZE 256 #define UTT_IDLE 0 #define UTT_STARTED 1 #define UTT_ENDED 2 #define UTT_CANCELED 3 int Listen_Port; int Listen_Mode; char *client_name; /* the human-readable name of this engine */ SOCKET Client; /* always maintain one client */ char Socket_Buffer[SOCK_BUF_SIZE]; int Buffer_P; short Raw_Buf[RAW_BUF_SIZE]; float Conf_Thresh = 0.50; int nBytes; int iUttSize; int iMaxUttSize = 960000; int utt_state = UTT_IDLE; char utt_id[UTTID_SIZE]; int iUttStartTimestamp; ps_decoder_t *psd; static int pr_rpt_frame; /* last partial result reported at this frame */ static int slm; static int annotate_word_conf; static int iNBest = 0; /* the number of n-best hypotheses to generate */ // [2005-10-19] (dbohus): define the THypStruct type which encodes // internally information about a hypothesis /* One record per hypothesis: hypothesis text (hypstr, confhypstr), utterance id, decoder, LM and AM scores, frame count, min, max and average word-confidence statistics, space-separated word start and end time lists, n-best index and speech start and end times. */ typedef struct { char hypstr[HYPSTR_SIZE]; char confhypstr[HYPSTR_SIZE]; char utterance_id[UTTID_SIZE]; float decoder_score; float lm_score; float am_score; int frame_num; float avg_sphinxwordconf; float min_sphinxwordconf; float max_sphinxwordconf; float avg_wordconf; float min_wordconf; float max_wordconf; float avg_validwordconf; float min_validwordconf; float max_validwordconf; char starttimes_list[4096]; char endtimes_list[4096]; int nbest_index; int speech_start_time; int speech_end_time; } THypStruct; /* read next word from the string and change the pointer of string */ /* Skips leading newline, tab, space and carriage-return characters in *string, copies the following run of other characters into word, NUL-terminates it and leaves *string just past the copied text. Returns strlen(word), which is 0 when no word remains. NOTE(review): the capacity of word is not checked, so the caller must pass a buffer large enough for the longest token. */ int get_word(char **string, char *word) { char *p = word; while ((**string != '\0') && ((**string == '\n') || (**string == '\t') || (**string == ' ') || 
(**string == '\r'))) (*string)++; while ((**string != '\0') && (**string != '\n') && (**string != '\t') && (**string != ' ') && (**string != '\r')) *p++ = *(*string)++; *p = '\0'; return (int)strlen(word); } /* Parses the argument file argfn with cmd_ln_parse_file_r, then either creates the global decoder psd with ps_init (reinit is false) or reconfigures the existing one with ps_reinit. Returns 0 on success and -1 when parsing or (re)initialisation fails. */ static int decoder_init(char *argfn, bool reinit = false) { Log(STD_STREAM, "Initialize fbs ..."); static const arg_t ps_engine_args[] = { POCKETSPHINX_OPTIONS, CMDLN_EMPTY_OPTION }; cmd_ln_t *config; config = cmd_ln_parse_file_r(NULL, ps_engine_args, argfn, TRUE); if (config == NULL) return -1; if (!reinit) { psd = ps_init(config); if (psd == NULL) { return -1; } } else { int result = ps_reinit(psd, config); if (result < 0) { return -1; } } return 0; } /* Builds an array of per-word confidence values from the current segmentation of psd. The array is allocated with malloc and handed to the caller. With useFixedScore every filled entry is 0.7f; otherwise each entry is (t-6)/12.0, where t combines the type values that ps_seg_prob wrote for the neighbouring segments. Returns NULL when the type table fills up. NOTE(review): the loop calls ps_seg_next before reading, so the segment returned by ps_seg_iter is never scored; the k == MAX_TYPE_SIZE check leaves no room for the two trailing dummy entries; the malloc result is not checked. */ float* hyp_conf_slm (bool useFixedScore = false) { const int MAX_TYPE_SIZE = 4096; int32 score, type[MAX_TYPE_SIZE]; int32 k = 0; ps_seg_t *seg_iter = ps_seg_iter(psd, &score); type[k++] = 3; //but use the trigram dummy for it if (seg_iter != NULL) { while (seg_iter = ps_seg_next(seg_iter)) { if (k == MAX_TYPE_SIZE) return NULL; int32 lscr, ascr; ps_seg_prob(seg_iter, &ascr, &lscr, &type[k++]); } } type[k++] = 3; // (tk) dummy trigram after utterance type[k++] = 3; // (tk) sometimes there's no end token, in which case // the list one was for the end token and this one is the dummy // (antoine) allocate the array of confidence scores float* conf = (float*)malloc(k*sizeof(float)); for (int32 i = 1; i < k-2; i++) { if(!useFixedScore) { int32 t = type[i-1] + type[i] + ((type[i+1] + type[i+2])<<1); // (tk) wtf? 
conf[i-1] = (float)((double)(t-6)/12.0); } else { conf[i-1] = 0.7f; } } return conf; } /* Cleans a hypothesis string in place: every underscore becomes a space, and each colon together with the text up to (but not including) the next space or the end of the string is dropped. A NULL argument is ignored. NOTE(review): removed_oov, start and has_word are written but never read. */ void strip_class(char *inout) { char *out; char *in; char *e; char *start; int removed_oov = 0; int special_char, has_word = 0; if (inout == NULL) { return; } start = in = out = inout; while (*in) { special_char = 0; if (*in == '_') { *in = ' '; special_char = 1; } // Removes class name (after a ':') else if (*in == ':') { /* find the first space after the colon */ e = strchr(in, ' '); if (e == NULL) { /* No space after colon. Position e to terminal nul char */ e = in + strlen(in); } in = e; special_char = 1; } *out = *in; if (!special_char) has_word = 1; if (*in) { /* if not at terminal nul, advance one char */ out++; in++; } } *out = *in; } /** * [2010-04-06] (bthomson) : Adding a simpler function for use in N-Best lists * Simply stores the hyp and score **/ /* Fills *phs from a ready-made hypothesis string: copies hyp into hypstr and the utterance id reported by ps_get_hyp into utterance_id, stores nBest as nbest_index, sets the three sphinxwordconf fields to score, resets the other confidence statistics and the time lists, and records ps_get_n_frames. When hypstr ends up empty the result-dependent fields are zeroed. NOTE(review): the array returned by hyp_conf_slm is neither used nor freed here; both strcpy calls are unbounded and uttId is not checked for NULL; decoder_score, lm_score and am_score are only assigned on the empty-hypothesis path. */ void fillPartialHypStruct(THypStruct *phs, const char* hyp, int32 score, int nBest) { Log(STD_STREAM, "Filling partial hyp struct\n"); size_t h_len, ch_len; int n_words = 0, n_validwords, has_oov; char tmp[16384]; float *lm_conf = NULL; // Fill in confidence values for words in result and build filtered hypothesis if (slm) lm_conf = hyp_conf_slm(); else lm_conf = hyp_conf_slm(true); // do some initialization h_len = 0; ch_len = 0; n_words = 0; n_validwords = 0; phs->nbest_index = nBest; int32 best_score; const char* uttId; ps_get_hyp(psd, &best_score, &uttId); strcpy(phs->utterance_id, uttId); phs->avg_sphinxwordconf = score; phs->min_sphinxwordconf = score; phs->max_sphinxwordconf = score; phs->avg_wordconf = 0; phs->min_wordconf = 1; phs->max_wordconf = 0; phs->avg_validwordconf = 0; phs->min_validwordconf = 1; phs->max_validwordconf = 0; phs->confhypstr[0] = '\0'; strcpy(phs->hypstr, hyp); phs->starttimes_list[0] = 0; phs->endtimes_list[0] = 0; phs->speech_start_time = -1; phs->speech_end_time = 0; phs->frame_num = ps_get_n_frames(psd); if (n_words > 0) phs->avg_sphinxwordconf /= n_words; if (n_words > 0) 
phs->avg_wordconf /= n_words; if (n_validwords > 0) phs->avg_validwordconf /= n_validwords; Log(STD_STREAM, "hyp: %s\n", phs->hypstr); fflush(stdout); // if we couldn't get any results from the decoder if ((phs->hypstr == NULL) || (strlen(phs->hypstr) <= 0)) { // set all the result-dependent values to null phs->hypstr[0] = '\0'; phs->confhypstr[0] = '\0'; phs->decoder_score = 0; phs->lm_score = 0; phs->am_score = 0; phs->avg_wordconf = 0; phs->min_wordconf = 0; phs->max_wordconf = 0; phs->avg_validwordconf = 0; phs->min_validwordconf = 0; phs->max_validwordconf = 0; phs->starttimes_list[0] = '\0'; phs->endtimes_list[0] = '\0'; } Log(STD_STREAM, "Finished filling partial hyp struct\n"); } // [2008-02-19] (antoine): this function takes a partial hypothesis and a reference to a // THypStruct and fills in the hyp struct /* Fills *phs by walking the segment iterator. For each word that passes the angle-bracket filter it cuts any parenthesised variant suffix, updates the LM confidence statistics and the acoustic and LM score totals, appends the word to hypstr and confhypstr (wrapped in .? and ?. inside confhypstr when annotate_word_conf is set and the confidence is below Conf_Thresh) and appends its start and end times. One ++OOV++ marker is appended when any OOV word was seen, then class tags are stripped and the frame count recorded. NOTE(review): iteration begins with ps_seg_next, so the segment passed in is skipped; psWord is indexed at strlen-2 even for one-character words; decoder_score, lm_score and am_score are accumulated with += without being initialised here; lm_conf is indexed by n_words without a NULL or bounds check; avg_sphinxwordconf is divided but never accumulated. */ void fillPartialHypStruct(ps_seg_t* curr_seg_iter, THypStruct* phs, int fromNBest) { Log(STD_STREAM, "Filling partial hyp struct\n"); size_t h_len, ch_len; int n_words = 0, n_validwords, has_oov; char tmp[16384]; float *lm_conf = NULL; // Fill in confidence values for words in result and build filtered hypothesis if (slm) lm_conf = hyp_conf_slm(); else lm_conf = hyp_conf_slm(true); // do some initialization h_len = 0; ch_len = 0; n_words = 0; n_validwords = 0; phs->nbest_index = fromNBest; int32 best_score; const char* uttId; ps_get_hyp(psd, &best_score, &uttId); strcpy(phs->utterance_id, uttId); phs->avg_sphinxwordconf = 0; phs->min_sphinxwordconf = 1; phs->max_sphinxwordconf = 0; phs->avg_wordconf = 0; phs->min_wordconf = 1; phs->max_wordconf = 0; phs->avg_validwordconf = 0; phs->min_validwordconf = 1; phs->max_validwordconf = 0; phs->confhypstr[0] = '\0'; phs->hypstr[0] = '\0'; phs->starttimes_list[0] = 0; phs->endtimes_list[0] = 0; phs->speech_start_time = -1; phs->speech_end_time = 0; has_oov = 0; ps_seg_t *seg_iter = curr_seg_iter; if (seg_iter != NULL) { while (seg_iter = ps_seg_next(seg_iter)) { char const *psWord = ps_seg_word(seg_iter); 
if(psWord[0] != '<' && psWord[(int) strlen(psWord) - 2] != '>') { // (tk) remove variants char* currWord = new char[strlen(psWord)+1]; strcpy(currWord, psWord); char* end = strchr(currWord, '('); if (end) *end = '\0'; // (antoine) Update LM word confidence statistics phs->avg_wordconf += lm_conf[n_words]; if (phs->min_wordconf > lm_conf[n_words]) phs->min_wordconf = lm_conf[n_words]; if (phs->max_wordconf < lm_conf[n_words]) phs->max_wordconf = lm_conf[n_words]; int32 am_scr, lm_scr, lback, post; // (dhdfu) Get Sphinx probabilities and scores post = ps_seg_prob(seg_iter, &am_scr, &lm_scr, &lback); // Update acoustic and language model scores phs->decoder_score += lm_scr + am_scr; phs->lm_score += lm_scr; phs->am_score += am_scr; // (antoine): check if the word is an oov if (!strcmp("++OOV++", currWord)) { has_oov = 1; } if (!annotate_word_conf || (lm_conf[n_words] >= Conf_Thresh)) { // TODO: 1) make sure hypstr and confhypstr do not overflow!!! // 2) avoid outputing ++OOV++ // (dbohus): build the confhypstr if ((ch_len + 1 + strlen(currWord) < HYPSTR_SIZE) && (currWord[0] != '+')) { strcpy (phs->confhypstr + ch_len, currWord); ch_len += strlen(currWord); phs->confhypstr[ch_len++] = ' '; } // (dbohus): build the hypstr if ((h_len + 1 + strlen(currWord) < HYPSTR_SIZE) && (currWord[0] != '+')) { strcpy (phs->hypstr + h_len, currWord); h_len += strlen(currWord); phs->hypstr[h_len++] = ' '; } // (antoine) Update word confidence statistics for // confident ("valid") words phs->avg_validwordconf += lm_conf[n_words]; n_validwords++; if (phs->min_validwordconf > lm_conf[n_words]) phs->min_validwordconf = lm_conf[n_words]; if (phs->max_validwordconf < lm_conf[n_words]) phs->max_validwordconf = lm_conf[n_words]; } else { // (dbohus): build the confhypstr if ((ch_len + 5 + strlen(currWord) < HYPSTR_SIZE) && (currWord[0] != '+')) { strcpy (phs->confhypstr + ch_len, ".?"); ch_len += 2; strcpy (phs->confhypstr + ch_len, currWord); ch_len += strlen(currWord); strcpy 
(phs->confhypstr + ch_len, "?."); ch_len += 2; phs->confhypstr[ch_len++] = ' '; } // (dbohus): build the hypstr if ((h_len + 1 + strlen(currWord) < HYPSTR_SIZE) && (currWord[0] != '+')) { strcpy (phs->hypstr + h_len, currWord); h_len += strlen(currWord); phs->hypstr[h_len++] = ' '; } } // (dbohus): build the confhypstr phs->confhypstr[ch_len] = '\0'; // (dbohus): build the hypstr phs->hypstr[h_len] = '\0'; // added by J 01/25/2005 // Calculate startframes and endframes for all words in hyp if (currWord[0] != '+') { if (phs->starttimes_list[0]) { strcat(phs->starttimes_list, " "); strcat(phs->endtimes_list, " "); } int start_frame, end_frame; ps_seg_frames(seg_iter, &start_frame, &end_frame); itoa(iUttStartTimestamp+start_frame*SPHINX_FRAME_SIZE, tmp, 10); strcat(phs->starttimes_list, tmp); itoa(iUttStartTimestamp+end_frame*SPHINX_FRAME_SIZE, tmp, 10); strcat(phs->endtimes_list, tmp); if (phs->speech_start_time < 0) { phs->speech_start_time = iUttStartTimestamp+ start_frame*SPHINX_FRAME_SIZE; } phs->speech_end_time = iUttStartTimestamp+ end_frame*SPHINX_FRAME_SIZE; } delete[] currWord; } n_words++; } } // (antoine): free the LM confidence score array free(lm_conf); // (antoine): if there were some oovs, add a single "++OOV++" symbol // (this is to avoid getting very long strings of oov symbols) if (has_oov) { // for the hypstr if (h_len + 1 + strlen("++OOV++ ") < HYPSTR_SIZE) { strcpy (phs->hypstr + h_len, "++OOV++ "); h_len += strlen("++OOV++ "); strcpy(phs->starttimes_list, "N/A"); strcpy(phs->endtimes_list, "N/A"); } // for the confhypstr if (ch_len + 1 + strlen("++OOV++ ") < HYPSTR_SIZE) { strcpy (phs->confhypstr + ch_len, "++OOV++ "); ch_len += strlen("++OOV++ "); } } strip_class(phs->confhypstr); strip_class(phs->hypstr); phs->frame_num = ps_get_n_frames(psd); Log(STD_STREAM, "%d frames\n", phs->frame_num); if (n_words > 0) phs->avg_sphinxwordconf /= n_words; if (n_words > 0) phs->avg_wordconf /= n_words; if (n_validwords > 0) phs->avg_validwordconf /= 
n_validwords; Log(STD_STREAM, "hyp: %s\n", phs->hypstr); fflush(stdout); // if we couldn't get any results from the decoder if ((phs->hypstr == NULL)) Log(STD_STREAM, "hyp is NULL!\n"); if ((phs->hypstr == NULL) || (strlen(phs->hypstr) <= 0)) { // set all the result-dependent values to null phs->hypstr[0] = '\0'; phs->confhypstr[0] = '\0'; phs->decoder_score = 0; phs->lm_score = 0; phs->am_score = 0; phs->avg_wordconf = 0; phs->min_wordconf = 0; phs->max_wordconf = 0; phs->avg_validwordconf = 0; phs->min_validwordconf = 0; phs->max_validwordconf = 0; phs->starttimes_list[0] = '\0'; phs->endtimes_list[0] = '\0'; } Log(STD_STREAM, "Finished filling partial hyp struct\n"); } // [2005-10-19] (dbohus): this function takes a hypothesis and a reference to a THypStruct and // fills in the hyp struct /* Thin wrapper: forwards its three arguments unchanged to the segment-iterator overload of fillPartialHypStruct. */ void fillHypStruct(ps_seg_t* curr_seg_iter, THypStruct* phs, int fromNBest) { fillPartialHypStruct(curr_seg_iter, phs, fromNBest); } /* Fills phsHyps and stores the number of entries written in iHypsGenerated. When iNBest is positive it calls ps_end_utt, iterates ps_nbest and keeps up to iNBest entries whose hypstr is non-empty; otherwise it fills a single entry from ps_seg_iter. NOTE(review): int32 score=0.5 truncates to 0, and score, name_sf and name_ef are only referenced by the commented-out loop. */ void getHypStructs(THypStruct *phsHyps, int& iHypsGenerated, int iNBest) { iHypsGenerated = 0; if(iNBest > 0) { Log(STD_STREAM, "Getting hypotheses for N-Best"); ps_nbest_t* nbest_iter; int32 score=0.5; ps_end_utt(psd); nbest_iter = ps_nbest(psd, 0, -1, NULL, NULL); int name_sf = 0; int name_ef = -1; if (nbest_iter == NULL) { Log(STD_STREAM, "unable to get hypothesis"); } else { /* while (nbest_iter && (iHypsGenerated < iNBest)) { for (ps_seg_t *seg = ps_nbest_seg(nbest_iter, &score); seg; seg = ps_seg_next(seg)) { char const* w = ps_seg_word(seg); int l=strlen(w); if (((l > 3) && (w[l-3] == ':') && (w[l-2] == 'f') && (w[l-1] == 'l')) || ((l > 2) && (w[l-2] == ':') && (w[l-1] == 'f')) || ((l > 2) && (w[l-2] == ':') && (w[l-1] == 'l')) ) { int sf, ef; ps_seg_frames(seg, &sf, &ef); if (name_sf == 0) { name_sf = sf; } if (name_ef < ef) { name_ef = ef; } } // if (l >3) ... } // for (ps_seg ...) 
const char *hyp = ps_nbest_hyp(nbest_iter, &score); if (!hyp) { nbest_iter = ps_nbest_next(nbest_iter); continue; } fillPartialHypStruct(phsHyps + iHypsGenerated, hyp, score, iHypsGenerated); //nbest_iter = ps_nbest_next(nbest_iter); iHypsGenerated++; } // while (nbest_it ..) */ //ps_nbest_t* nbest_iter; int32 out_score; Log(STD_STREAM, "Iterating..."); for (; nbest_iter; nbest_iter = ps_nbest_next(nbest_iter)) { if (iHypsGenerated < iNBest) { fillPartialHypStruct(ps_nbest_seg(nbest_iter, &out_score), phsHyps + iHypsGenerated, iHypsGenerated); if (phsHyps[iHypsGenerated].hypstr[0] != '\0') iHypsGenerated++; } else { if (nbest_iter != NULL) { ps_nbest_free(nbest_iter); } break; } } } // if (nbest_iter == NULL) } else // (iNBest == 0) { int32 best_score; ps_seg_t* seg_iter = ps_seg_iter(psd, &best_score); if (seg_iter == NULL) { Log(STD_STREAM, "unable to get hypothesis"); } else { // build a hyp structure fillPartialHypStruct(seg_iter, phsHyps, 0); iHypsGenerated++; } } } /* Collects the final hypotheses through getHypStructs (n-best when iNBest is positive), formats each one as a {c hypothesis ...} entry inside a {c result :results ( ... ) } frame and sends the frame to Client with sock_send_block. When nothing was generated an empty hypothesis is filled in and sent instead. NOTE(review): phsHyps is allocated with malloc and never freed, and the malloc result is not checked; tmp holds HYPSTR_SIZE bytes but sprintf writes hypstr, confhypstr and both time lists into it, so it can overflow; best_score is unused. */ void engine_proc_result() { char to_audio[SOCK_BUF_SIZE]; char tmp[HYPSTR_SIZE]; int iUttEndTimestamp; int iHypsGenerated = 0; THypStruct *phsHyps; // allocate the phsHyps; phsHyps = (THypStruct *)malloc(sizeof(THypStruct) * (iNBest + 1)); int32 best_score; Log(STD_STREAM, "Getting final hypotheses frame"); getHypStructs(phsHyps, iHypsGenerated, iNBest); if (iHypsGenerated == 0) fillPartialHypStruct(phsHyps, "", 0, 0); // Always have at least the zero struct empty //if (iHypsGenerated > 0) { Log(STD_STREAM, "Got final hypotheses frame"); iUttEndTimestamp = iUttStartTimestamp + phsHyps[0].frame_num * SPHINX_FRAME_SIZE; // create a string-representation of the Galaxy frame for this result strcpy(to_audio, "{c result :results ( "); // send out the number of hyps that were actually generated for(int i = 0; i < iHypsGenerated || i == 0; i++) { sprintf(tmp, "{c hypothesis :hyp \"%s\" :confhyp \"%s\" :uttid \"%s\" :engine_name \"%s\" \ :nbest_index %d :decoder_score %f :lm_score %f :am_score %f :frame_num 
%d \ :avg_sphinxwordconf %f :min_sphinxwordconf %f :max_sphinxwordconf %f \ :avg_wordconf %f :min_wordconf %f :max_wordconf %f \ :avg_validwordconf %f :min_validwordconf %f :max_validwordconf %f \ :word_start_times \"%s\" :word_end_times \"%s\" :init_timestamp %d :end_timestamp %d \ :speech_start_timestamp %d :speech_end_timestamp %d } ", phsHyps[i].hypstr, phsHyps[i].confhypstr, phsHyps[i].utterance_id, client_name, phsHyps[i].nbest_index, phsHyps[i].decoder_score, phsHyps[i].lm_score, phsHyps[i].am_score, phsHyps[i].frame_num, phsHyps[i].avg_sphinxwordconf, phsHyps[i].min_sphinxwordconf, phsHyps[i].max_sphinxwordconf, phsHyps[i].avg_wordconf, phsHyps[i].min_wordconf, phsHyps[i].max_wordconf, phsHyps[i].avg_validwordconf, phsHyps[i].min_validwordconf, phsHyps[i].max_validwordconf, phsHyps[i].starttimes_list, phsHyps[i].endtimes_list, iUttStartTimestamp, iUttEndTimestamp, phsHyps[i].speech_start_time, phsHyps[i].speech_end_time); strcat(to_audio, tmp); } strcat(to_audio, " ) }"); Log(STD_STREAM, "Sending final hypotheses frame:\n%s\n", to_audio); sock_send_block(Client, to_audio, (int32) strlen(to_audio)); } } /* Partial-result counterpart of engine_proc_result: always asks getHypStructs for the 1-best hypothesis and sends the entries inside a {c partial_result :partial_results ( ... ) } frame. NOTE(review): same unfreed phsHyps allocation and undersized tmp buffer as engine_proc_result; the == NULL tests on the hypstr, confhypstr and utterance_id arrays can never be true because those members are arrays, not pointers. */ void engine_proc_result_partial() { char to_audio[SOCK_BUF_SIZE]; char tmp[HYPSTR_SIZE]; int i; int iUttEndTimestamp; int iHypsGenerated = 0; THypStruct *phsHyps; // allocate the phsHyps; phsHyps = (THypStruct *)malloc(sizeof(THypStruct) * (iNBest + 1)); Log(STD_STREAM, "Getting partial hypotheses frame"); getHypStructs(phsHyps, iHypsGenerated, 0); // For partial always do 1-best if (iHypsGenerated == 0) fillPartialHypStruct(phsHyps, "", 0, 0); // Always have at least the zero struct empty { Log(STD_STREAM, "Got partial hypotheses"); iUttEndTimestamp = iUttStartTimestamp + phsHyps[0].frame_num * SPHINX_FRAME_SIZE; // create a string-representation of the Galaxy frame for this result strcpy(to_audio, "{c partial_result :partial_results ( "); // send out the number of hyps that were actually generated for(i = 0; i < iHypsGenerated || i == 0; i++) { 
//Log(STD_STREAM, "Sending hypothesis frame %d:", i); char * hypstr = phsHyps[i].hypstr == NULL ? "" : phsHyps[i].hypstr; char * confhypstr = phsHyps[i].confhypstr == NULL ? "" : phsHyps[i].confhypstr; char * utterance_id = phsHyps[i].utterance_id == NULL ? "" : phsHyps[i].utterance_id; sprintf(tmp, "{c hypothesis :hyp \"%s\" :confhyp \"%s\" :uttid \"%s\" :engine_name \"%s\" \ :nbest_index %d :decoder_score %f :lm_score %f :am_score %f :frame_num %d \ :avg_sphinxwordconf %f :min_sphinxwordconf %f :max_sphinxwordconf %f \ :avg_wordconf %f :min_wordconf %f :max_wordconf %f \ :avg_validwordconf %f :min_validwordconf %f :max_validwordconf %f \ :word_start_times \"%s\" :word_end_times \"%s\" :init_timestamp %d :end_timestamp %d \ :speech_start_timestamp %d :speech_end_timestamp %d } ", hypstr, confhypstr, utterance_id, client_name, phsHyps[i].nbest_index, phsHyps[i].decoder_score, phsHyps[i].lm_score, phsHyps[i].am_score, phsHyps[i].frame_num, phsHyps[i].avg_sphinxwordconf, phsHyps[i].min_sphinxwordconf, phsHyps[i].max_sphinxwordconf, phsHyps[i].avg_wordconf, phsHyps[i].min_wordconf, phsHyps[i].max_wordconf, phsHyps[i].avg_validwordconf, phsHyps[i].min_validwordconf, phsHyps[i].max_validwordconf, phsHyps[i].starttimes_list, phsHyps[i].endtimes_list, iUttStartTimestamp, iUttEndTimestamp, phsHyps[i].speech_start_time, phsHyps[i].speech_end_time); strcat(to_audio, tmp); //Log(STD_STREAM, " DONE"); } strcat(to_audio, " ) }"); Log(STD_STREAM, "Sending partial hypotheses frame:\n%s\n", to_audio); sock_send_block(Client, to_audio, (int32) strlen(to_audio)); } } /* Reads whitespace-separated words from command with get_word and acts on the first one it recognises. Visible commands: engine_new_session (resets utt_state, selects the general LM, optionally starts a log file and sets the session timestamp), engine_begin_utt (starts an utterance with an optional id and start timestamp), engine_cancel_utt, engine_end_utt, engine_set_lm (selects an FSG for names starting fsg_, otherwise an n-gram LM) and get_acoustic_model. NOTE(review): the strcat calls that build the log file name in word are unbounded; the rest of this function is missing from this copy of the file. */ void process_client_command(char *command) { char word[4096], *p; Log(STD_STREAM, "Processing command: %s", command); p = command; while (get_word(&p, word)) { if (!strcmp(word, "engine_new_session")) { Log(STD_STREAM, "new session"); if (utt_state == UTT_ENDED) { Log(STD_STREAM, "Canceled utterance in UTT_ENDED state."); } utt_state = UTT_IDLE; ngram_model_t *lmset = ps_get_lmset(psd); ngram_model_set_select(lmset, 
"general"); ps_update_lmset(psd, lmset); slm = TRUE; if (get_word(&p, word)) { strcat(word, "-sphinx_"); strcat(word, client_name); strcat(word, ".log"); InitializeLogging(word, All); err_set_logfp(GetLogFp()); } if (get_word(&p, word)) { SetSessionStartTimestamp(_atoi64(word)); } break; } else if (!strcmp(word, "engine_begin_utt")) { if (!get_word(&p, word)) { // No utt id was specified, have sphinx generate one automatically iUttStartTimestamp = 0; Log(STD_STREAM, "begin utt: %s (%d)", utt_id, iUttStartTimestamp); ps_start_utt(psd, NULL); } else { sprintf( utt_id, "%s", word); // now look for the timestamp marking the start of the utt if (!get_word(&p, word)) { iUttStartTimestamp = 0; } else { iUttStartTimestamp = atoi(word); } Log(STD_STREAM, "begin utt: %s (%d)", utt_id, iUttStartTimestamp); ps_start_utt(psd, utt_id); } utt_state = UTT_STARTED; iUttSize = 0; pr_rpt_frame = 0; break; } else if (!strcmp(word, "engine_cancel_utt")) { if (utt_state == UTT_ENDED) { Log(STD_STREAM, "Canceled utterance in UTT_ENDED state."); } utt_state = UTT_IDLE; break; } else if (!strcmp(word, "engine_end_utt")) { if (utt_state == UTT_STARTED) { Log(STD_STREAM, "end utt: %s", utt_id); if(ps_end_utt(psd) < 0) { Log(ERR_STREAM, "Problem with end Utt"); } utt_state = UTT_ENDED; } break; } else if (!strcmp(word, "engine_set_lm")) { if (!get_word(&p, word)) { Log(STD_STREAM, "ERROR: set lm but not found lm"); break; } // If there is currently an utterance being processed, // cancel it before setting the LM. if (utt_state != UTT_IDLE) { Log(STD_STREAM, "Warning: set lm called during an utterance. 
Ignored."); continue; } Log(STD_STREAM, "set lm: %s", word); if(word[0]=='f'&&word[1]=='s'&&word[2]=='g'&&word[3]=='_') { fsg_set_t *fsgset = ps_get_fsgset(psd); fsg_set_select(fsgset, word + 4); ps_update_fsgset(psd); slm = FALSE; } else if (word[0]=='s'&&word[1]=='l'&&word[2]=='m'&&word[3]=='_') { ngram_model_t *lmset = ps_get_lmset(psd); ngram_model_set_select(lmset, word + 4); ps_update_lmset(psd, lmset); slm = TRUE; } else { ngram_model_t *lmset = ps_get_lmset(psd); ngram_model_set_select(lmset, word); ps_update_lmset(psd, lmset); slm = TRUE; } break; } else if (!strcmp(word, "get_acoustic_model")) { //return the current acoustic model Log(STD_STREAM, "looking for the acoustic model"); cmd_ln_t* config = ps_get_config(psd); const char* hmm = cmd_ln_str_r(config, "-hmm"); if(hmm != NULL) { ostringstream result; string hmm_s(hmm); /* NOTE(review): source text is missing from here. The loop header below is cut off after its loop variable and the text resumes at a SOCK_BUF_SIZE comparison inside what is presumably process_client_input, which main calls but whose beginning is not visible. What remains appears to split newline-terminated commands out of Socket_Buffer in CMD_MODE and to feed nBytes of raw audio to ps_process_raw otherwise -- confirm against the original file and restore the lost code. */ for(string::size_type i = 0; i= SOCK_BUF_SIZE) Buffer_P = 0; return; } memcpy(Command, Socket_Buffer, i + 1); Command[i + 1] = '\0'; Buffer_P -= (i + 1); memmove(Socket_Buffer, Socket_Buffer + i + 1, Buffer_P); process_client_command(Command); continue; } else { if ((nBytes == 0)||(utt_state != UTT_STARTED)) { Log(STD_STREAM, "no data to read"); Listen_Mode = CMD_MODE; return; } if (Buffer_P < nBytes) return; Log(STD_STREAM, "Processing raw data ..."); p = (char *)Raw_Buf; for (i = 0; i < nBytes; i++) { *p = Socket_Buffer[i]; p++; } Log(STD_STREAM, "%d bytes raw data received", i); Buffer_P -= i; memmove(Socket_Buffer, Socket_Buffer + i, Buffer_P); if (iUttSize + nBytes <= iMaxUttSize) { ps_process_raw (psd, Raw_Buf, nBytes / sizeof(short), FALSE, FALSE /* block if TRUE */); } else { Log(ERR_STREAM, "Utterance is longer than maximum authorized buffer size (%d > %d), "\ "ignoring subsequent data.", iUttSize + nBytes, iMaxUttSize); } Log(STD_STREAM, "ps_process_raw finished"); iUttSize += nBytes; nBytes = 0; Listen_Mode = CMD_MODE; continue; } } } /* Usage: -name client_name -argfn argument_file -port port_number */ /* Prints the one-line command-line usage summary to stdout. */ void print_usage() { 
printf("Usage: sphinx_client -name client_name -argfn argument_file -port port_number\n"); } /* Entry point. Parses -argfn, -port, -name, -no_word_conf and -n_best, exits when -name is missing or an option is unknown, initialises the decoder (from DEFAULT_ARGFN when -argfn is absent), starts listening on Listen_Port and then loops forever: select on the listening socket and the client socket, then either process client input or wait for a new connection. NOTE(review): argv[++i] is read without checking that a value follows the option; after a select error the loop carries on as if input were ready; the result of sock_listen is not checked; print_usage does not mention -no_word_conf or -n_best. */ int main(int argc, char *argv[]) { SOCKET listen_socket, max; fd_set fds; int i; client_name = NULL; argfn = NULL; Listen_Port = DEFAULT_LISTEN_PORT; // starts the high resolution timer for logging purposes InitializeHighResolutionTimer(); annotate_word_conf = 1; for (i = 1; i < argc; i++) { if (!strcmp(argv[i], "-argfn")) argfn = argv[++i]; else if (!strcmp(argv[i], "-port")) Listen_Port = atoi(argv[++i]); else if (!strcmp(argv[i], "-name")) client_name = argv[++i]; else if (!strcmp(argv[i], "-no_word_conf")) annotate_word_conf = 0; else if (!strcmp(argv[i], "-n_best")) iNBest = atoi(argv[++i]); else { Log(STD_STREAM, "ERROR: Unknown parameter: %s", argv[i]); print_usage(); exit(1); } } if (client_name == NULL) { Log(STD_STREAM, "ERROR: You must specify a client name with the -name option."); exit(1); } if (argfn == NULL) { if (decoder_init(DEFAULT_ARGFN) < 0) { Log(STD_STREAM, "ERROR: decoder initialization failed"); exit(1); } } else { if (decoder_init(argfn) < 0) { Log(STD_STREAM, "ERROR: decoder initialization failed"); exit(1); } } Client = INVALID_SOCKET; Buffer_P = 0; Listen_Mode = CMD_MODE; nBytes = 0; listen_socket = sock_listen(Listen_Port, 1); Log(STD_STREAM, "Sphinx client (%s): start listening and decoding ...", client_name); while (1) { FD_ZERO(&fds); FD_SET(listen_socket, &fds); max = listen_socket; if (Client != INVALID_SOCKET) { FD_SET(Client, &fds); if ((int)Client > max) max = Client; } max++; if (select((int) max, &fds, NULL, NULL, NULL) == SOCKET_ERROR) { Log(STD_STREAM, "ERROR: select error"); } if (Client != INVALID_SOCKET){ Log(STD_STREAM, "Processing input..."); process_client_input(); } else { Log(STD_STREAM, "Reconnecting socket..."); Client = sock_await_connection(listen_socket); } } }