sumom****@users*****
sumom****@users*****
2009年 2月 10日 (火) 02:27:47 JST
Index: julius4/libsent/src/ngram/init_ngram.c diff -u julius4/libsent/src/ngram/init_ngram.c:1.4 julius4/libsent/src/ngram/init_ngram.c:1.5 --- julius4/libsent/src/ngram/init_ngram.c:1.4 Sat Jan 31 18:11:22 2009 +++ julius4/libsent/src/ngram/init_ngram.c Tue Feb 10 02:27:46 2009 @@ -12,7 +12,7 @@ * @author Akinobu LEE * @date Wed Feb 16 07:40:53 2005 * - * $Revision: 1.4 $ + * $Revision: 1.5 $ * */ /* @@ -123,52 +123,62 @@ * @param ndata [i/o] word/class N-gram, the unknown word information will be set. * @param winfo [i/o] word dictionary, the word-to-ngram-entry mapping will be done here. */ -void +boolean make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo) { int i; + boolean ok_flag = TRUE; + int count = 0; jlog("Stat: init_ngram: mapping dictonary words to n-gram entries\n"); ndata->unk_num = 0; for (i = 0; i < winfo->num; i++) { winfo->wton[i] = make_ngram_ref(ndata, winfo->wname[i]); + if (winfo->wton[i] == WORD_INVALID) { + ok_flag = FALSE; + count++; + continue; + } if (winfo->wton[i] == ndata->unk_id) { (ndata->unk_num)++; } } + if (ok_flag == FALSE) { + jlog("Error: --- Failed to map %d words in dictionary to N-gram\n", count); + jlog("Error: --- Please fix the dict, or use open vocabulary N-gram that has either \"%s\" or \"%s\"\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2); + return FALSE; + } + if (ndata->unk_num == 0) { ndata->unk_num_log = 0.0; /* for safe */ } else { ndata->unk_num_log = (float)log10(ndata->unk_num); } jlog("Stat: init_ngram: finished word-to-ngram mapping\n"); + return TRUE; } /** * @brief Set unknown word ID to the N-gram data. * - * In CMU-Cam SLM toolkit, OOV words are always mapped to UNK, which - * always appear at the very beginning of N-gram entry, so we fix the - * unknown word ID at "0". + * Unknown word string should be UNK_WORD_DEFAULT or UNK_WORD_DEFAULT2, + * whose default is "<unk>" and "<UNK>". If any of these is not found + * in vocabulary, treat the LM as closed vocabulary. * * @param ndata [out] N-gram data to set unknown word ID. */ void set_unknown_id(NGRAM_INFO *ndata) { -#if 0 - ndata->unk_id = ngram_lookup_word(ndata, unkword); + ndata->isopen = TRUE; + ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT); + if (ndata->unk_id == WORD_INVALID) { + ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT2); + } if (ndata->unk_id == WORD_INVALID) { - jlog("word %s not found, so assume this is a closed vocabulary model\n", - unkword); + jlog("Stat: \"%s\" or \"%s\" not found, assuming close vocabulary LM\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2); ndata->isopen = FALSE; - } else { - ndata->isopen = TRUE; } -#endif - ndata->isopen = TRUE; - ndata->unk_id = 0; /* unknown (OOV) words are always mapped to - the number 0 (by CMU-TK)*/ } /** Index: julius4/libsent/src/ngram/ngram_lookup.c diff -u julius4/libsent/src/ngram/ngram_lookup.c:1.2 julius4/libsent/src/ngram/ngram_lookup.c:1.3 --- julius4/libsent/src/ngram/ngram_lookup.c:1.2 Tue Dec 18 17:45:54 2007 +++ julius4/libsent/src/ngram/ngram_lookup.c Tue Feb 10 02:27:46 2009 @@ -12,7 +12,7 @@ * @author Akinobu LEE * @date Wed Feb 16 16:42:38 2005 * - * $Revision: 1.2 $ + * $Revision: 1.3 $ * */ /* @@ -88,8 +88,13 @@ nw = ngram_lookup_word(ndata, wstr); if (nw == WORD_INVALID) { /* not found */ - jlog("Warning: ngram_lookup: word %s not exist in N-gram vocabulary, treat as <UNK>\n", wstr); - return(ndata->unk_id); + if (ndata->isopen) { + jlog("Warning: ngram_lookup: \"%s\" not exist in N-gram, treat as unknown\n", wstr); + return(ndata->unk_id); + } else { + jlog("Error: ngram_lookup: \"%s\" not exist in N-gram\n", wstr); + return WORD_INVALID; + } } else { return(nw); } Index: julius4/libsent/src/ngram/ngram_malloc.c diff -u julius4/libsent/src/ngram/ngram_malloc.c:1.2 julius4/libsent/src/ngram/ngram_malloc.c:1.3 --- julius4/libsent/src/ngram/ngram_malloc.c:1.2 Tue Dec 18 17:45:54 2007 +++ julius4/libsent/src/ngram/ngram_malloc.c Tue Feb 10 02:27:46 2009 @@ -12,7 +12,7 @@ * @author Akinobu LEE * @date Wed Feb 16 16:48:56 2005 * - * $Revision: 1.2 $ + * $Revision: 1.3 $ * */ /* @@ -38,6 +38,7 @@ new = (NGRAM_INFO *)mymalloc(sizeof(NGRAM_INFO)); new->bo_wt_1 = NULL; new->p_2 = NULL; + new->bos_eos_swap = FALSE; return(new); } Index: julius4/libsent/src/ngram/ngram_read_arpa.c diff -u julius4/libsent/src/ngram/ngram_read_arpa.c:1.13 julius4/libsent/src/ngram/ngram_read_arpa.c:1.14 --- julius4/libsent/src/ngram/ngram_read_arpa.c:1.13 Sat Jan 31 00:04:18 2009 +++ julius4/libsent/src/ngram/ngram_read_arpa.c Tue Feb 10 02:27:46 2009 @@ -20,7 +20,7 @@ * @author Akinobu LEE * @date Wed Feb 16 16:52:24 2005 * - * $Revision: 1.13 $ + * $Revision: 1.14 $ * */ /* @@ -30,7 +30,7 @@ * All rights reserved */ -/* $Id: ngram_read_arpa.c,v 1.13 2009/01/30 15:04:18 sumomo Exp $ */ +/* $Id: ngram_read_arpa.c,v 1.14 2009/02/09 17:27:46 sumomo Exp $ */ /* words should be alphabetically sorted */ @@ -129,12 +129,12 @@ while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') { if ((p = strtok(buf, DELM)) == NULL) { - jlog("Error: ngram_read_arpa: LR 1-gram: failed to parse, corrupted or invalid data?\n"); + jlog("Error: ngram_read_arpa: 1-gram: failed to parse, corrupted or invalid data?\n"); return FALSE; } prob = (LOGPROB)atof(p); if ((p = strtok(NULL, DELM)) == NULL) { - jlog("Error: ngram_read_arpa: LR 1-gram: failed to parse, corrupted or invalid data?\n"); + jlog("Error: ngram_read_arpa: 1-gram: failed to parse, corrupted or invalid data?\n"); return FALSE; } name = strcpy((char *)mymalloc(strlen(p)+1), p); @@ -222,7 +222,7 @@ } else { bo_wt = (LOGPROB)atof(p); } - + /* add bo_wt_rl to existing 1-gram entry */ nid = ngram_lookup_word(ndata, name); if (nid == WORD_INVALID) { @@ -251,9 +251,8 @@ } /** - * Read reverse 2-gram data from RL 3-gram file, and set RL 2-gram - * probabilities and back-off values for RL 3-gram to the corresponding - * LR 2-gram data. + * Read forward 2-gram data and set the LR 2-gram probabilities to the + * already loaded RL N-gram. * * @param fp [in] file pointer * @param ndata [i/o] N-gram to set the read data. @@ -381,7 +380,7 @@ cid = cid_last = NNID_INVALID; for(i=0;i<n;i++) w_last[i] = WORD_INVALID; - /* read in 2-gram */ + /* read in N-gram */ for (;;) { if (getl(buf, sizeof(buf), fp) == NULL || buf[0] == '\\') break; strcpy(pbuf, buf); @@ -389,7 +388,7 @@ jlog("Stat: ngram_read_arpa: %d-gram read %d (%d%%)\n", n, nnid, nnid * 100 / t->totalnum); } - /* 2-gram probability */ + /* N-gram probability */ if ((s = strtok(buf, DELM)) == NULL) { jlog("Error: ngram_read_arpa: %d-gram: failed to parse, corrupted or invalid data?\n", n); return FALSE; @@ -656,7 +655,43 @@ /* set unknown (=OOV) word id */ set_unknown_id(ndata); - + + /* swap <s> and </s> for backward SRILM N-gram */ + if (ndata->dir == DIR_RL) { + WORD_ID bos, eos; + char *p; + bos = ngram_lookup_word(ndata, BEGIN_WORD_DEFAULT); + eos = ngram_lookup_word(ndata, END_WORD_DEFAULT); + if (!ndata->bos_eos_swap) { + /* check */ + if (bos != WORD_INVALID && eos != WORD_INVALID && ndata->d[0].prob[bos] == -99) { + jlog("Stat: \"P(%s) = -99\" in reverse N-gram, may be trained by SRILM\n", BEGIN_WORD_DEFAULT); + jlog("Stat: going to swap \"%s\" and \"%s\"\n", BEGIN_WORD_DEFAULT, END_WORD_DEFAULT); + ndata->bos_eos_swap = TRUE; + } + } + if (ndata->bos_eos_swap) { + if (bos == WORD_INVALID) { + jlog("Error: ngram_read_arpa: try to swap bos/eos but \"%s\" not found in N-gram\n", BEGIN_WORD_DEFAULT); + } + if (eos == WORD_INVALID) { + jlog("Error: ngram_read_arpa: try to swap bos/eos but \"%s\" not found in N-gram\n", END_WORD_DEFAULT); + } + if (bos == WORD_INVALID || eos == WORD_INVALID) { + return FALSE; + } + /* do swap */ + jlog("Stat: ngram_read_arpa: swap \"%s\" and \"%s\" at backward N-gram\n", BEGIN_WORD_DEFAULT, END_WORD_DEFAULT); + /* swap name buffer */ + p = ndata->wname[bos]; + ndata->wname[bos] = ndata->wname[eos]; + ndata->wname[eos] = p; + /* replace index */ + ptree_replace_data(BEGIN_WORD_DEFAULT, eos, ndata->root); + ptree_replace_data(END_WORD_DEFAULT, bos, ndata->root); + } + } + } #ifdef CLASS_NGRAM Index: julius4/libsent/src/ngram/ngram_util.c diff -u julius4/libsent/src/ngram/ngram_util.c:1.4 julius4/libsent/src/ngram/ngram_util.c:1.5 --- julius4/libsent/src/ngram/ngram_util.c:1.4 Wed Jan 21 00:48:04 2009 +++ julius4/libsent/src/ngram/ngram_util.c Tue Feb 10 02:27:46 2009 @@ -12,7 +12,7 @@ * @author Akinobu LEE * @date Wed Feb 16 17:18:55 2005 * - * $Revision: 1.4 $ + * $Revision: 1.5 $ * */ /* @@ -92,7 +92,7 @@ fprintf(fp, "\t OOV word = %s(id=%d)\n", ndata->wname[ndata->unk_id],ndata->unk_id); fprintf(fp, "\t OOV size = %d words in dict\n", ndata->unk_num); } else { - fprintf(fp, "\t OOV word = none\n"); + fprintf(fp, "\t OOV word = none (assume close vocabulary)\n"); } fprintf(fp, "\t wordset size = %d\n", ndata->max_word_num); for(i=0;i<ndata->n;i++) {