CVS update: julius4/libsent/src/ngram (Julius-cvs 382) - Julius

Index: julius4/libsent/src/ngram/init_ngram.c
diff -u julius4/libsent/src/ngram/init_ngram.c:1.4 julius4/libsent/src/ngram/init_ngram.c:1.5

--- julius4/libsent/src/ngram/init_ngram.c:1.4	Sat Jan 31 18:11:22 2009
+++ julius4/libsent/src/ngram/init_ngram.c	Tue Feb 10 02:27:46 2009
@@ -12,7 +12,7 @@
  * @author Akinobu LEE
  * @date   Wed Feb 16 07:40:53 2005
  *
- * $Revision: 1.4 $
+ * $Revision: 1.5 $
  * 
  */
 /*
@@ -123,52 +123,62 @@
  * @param ndata [i/o] word/class N-gram, the unknown word information will be set.
  * @param winfo [i/o] word dictionary, the word-to-ngram-entry mapping will be done here.
  */
-void
+boolean
 make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo)
 {
   int i;
+  boolean ok_flag = TRUE;
+  int count = 0;
 
   jlog("Stat: init_ngram: mapping dictonary words to n-gram entries\n");
   ndata->unk_num = 0;
   for (i = 0; i < winfo->num; i++) {
     winfo->wton[i] = make_ngram_ref(ndata, winfo->wname[i]);
+    if (winfo->wton[i] == WORD_INVALID) {
+      ok_flag = FALSE;
+      count++;
+      continue;
+    }
     if (winfo->wton[i] == ndata->unk_id) {
       (ndata->unk_num)++;
     }
   }
+  if (ok_flag == FALSE) {
+    jlog("Error: --- Failed to map %d words in dictionary to N-gram\n", count);
+    jlog("Error: --- Please fix the dict, or use open vocabulary N-gram that has either \"%s\" or \"%s\"\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2);
+    return FALSE;
+  }
+      
   if (ndata->unk_num == 0) {
     ndata->unk_num_log = 0.0;	/* for safe */
   } else {
     ndata->unk_num_log = (float)log10(ndata->unk_num);
   }
   jlog("Stat: init_ngram: finished word-to-ngram mapping\n");
+  return TRUE;
 }
 
 /** 
  * @brief  Set unknown word ID to the N-gram data.
  *
- * In CMU-Cam SLM toolkit, OOV words are always mapped to UNK, which
- * always appear at the very beginning of N-gram entry, so we fix the
- * unknown word ID at "0".
+ * The unknown word string should be UNK_WORD_DEFAULT or UNK_WORD_DEFAULT2,
+ * whose defaults are "<unk>" and "<UNK>".  If neither of them is found
+ * in the vocabulary, the LM is treated as a closed-vocabulary model.
  * 
  * @param ndata [out] N-gram data to set unknown word ID.
  */
 void
 set_unknown_id(NGRAM_INFO *ndata)
 {
-#if 0
-  ndata->unk_id = ngram_lookup_word(ndata, unkword);
+  ndata->isopen = TRUE;
+  ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT);
+  if (ndata->unk_id == WORD_INVALID) {
+    ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT2);
+  }
   if (ndata->unk_id == WORD_INVALID) {
-    jlog("word %s not found, so assume this is a closed vocabulary model\n",
-	    unkword);
+    jlog("Stat: \"%s\" or \"%s\" not found, assuming close vocabulary LM\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2);
     ndata->isopen = FALSE;
-  } else {
-    ndata->isopen = TRUE;
   }
-#endif
-  ndata->isopen = TRUE;
-  ndata->unk_id = 0;		/* unknown (OOV) words are always mapped to
-				   the number 0 (by CMU-TK)*/
 }
 
 /** 
Index: julius4/libsent/src/ngram/ngram_lookup.c
diff -u julius4/libsent/src/ngram/ngram_lookup.c:1.2 julius4/libsent/src/ngram/ngram_lookup.c:1.3
--- julius4/libsent/src/ngram/ngram_lookup.c:1.2	Tue Dec 18 17:45:54 2007
+++ julius4/libsent/src/ngram/ngram_lookup.c	Tue Feb 10 02:27:46 2009
@@ -12,7 +12,7 @@
  * @author Akinobu LEE
  * @date   Wed Feb 16 16:42:38 2005
  *
- * $Revision: 1.2 $
+ * $Revision: 1.3 $
  * 
  */
 /*
@@ -88,8 +88,13 @@
 
   nw = ngram_lookup_word(ndata, wstr);
   if (nw == WORD_INVALID) {	/* not found */
-    jlog("Warning: ngram_lookup: word %s not exist in N-gram vocabulary, treat as <UNK>\n", wstr);
-    return(ndata->unk_id);
+    if (ndata->isopen) {
+      jlog("Warning: ngram_lookup: \"%s\" not exist in N-gram, treat as unknown\n", wstr);
+      return(ndata->unk_id);
+    } else {
+      jlog("Error: ngram_lookup: \"%s\" not exist in N-gram\n", wstr);
+      return WORD_INVALID;
+    }
   } else {
     return(nw);
   }
Index: julius4/libsent/src/ngram/ngram_malloc.c
diff -u julius4/libsent/src/ngram/ngram_malloc.c:1.2 julius4/libsent/src/ngram/ngram_malloc.c:1.3
--- julius4/libsent/src/ngram/ngram_malloc.c:1.2	Tue Dec 18 17:45:54 2007
+++ julius4/libsent/src/ngram/ngram_malloc.c	Tue Feb 10 02:27:46 2009
@@ -12,7 +12,7 @@
  * @author Akinobu LEE
  * @date   Wed Feb 16 16:48:56 2005
  *
- * $Revision: 1.2 $
+ * $Revision: 1.3 $
  * 
  */
 /*
@@ -38,6 +38,7 @@
   new = (NGRAM_INFO *)mymalloc(sizeof(NGRAM_INFO));
   new->bo_wt_1 = NULL;
   new->p_2 = NULL;
+  new->bos_eos_swap = FALSE;
 
   return(new);
 }
Index: julius4/libsent/src/ngram/ngram_read_arpa.c
diff -u julius4/libsent/src/ngram/ngram_read_arpa.c:1.13 julius4/libsent/src/ngram/ngram_read_arpa.c:1.14
--- julius4/libsent/src/ngram/ngram_read_arpa.c:1.13	Sat Jan 31 00:04:18 2009
+++ julius4/libsent/src/ngram/ngram_read_arpa.c	Tue Feb 10 02:27:46 2009
@@ -20,7 +20,7 @@
  * @author Akinobu LEE
  * @date   Wed Feb 16 16:52:24 2005
  *
- * $Revision: 1.13 $
+ * $Revision: 1.14 $
  * 
  */
 /*
@@ -30,7 +30,7 @@
  * All rights reserved
  */
 
-/* $Id: ngram_read_arpa.c,v 1.13 2009/01/30 15:04:18 sumomo Exp $ */
+/* $Id: ngram_read_arpa.c,v 1.14 2009/02/09 17:27:46 sumomo Exp $ */
 
 /* words should be alphabetically sorted */
 
@@ -129,12 +129,12 @@
   
   while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
     if ((p = strtok(buf, DELM)) == NULL) {
-      jlog("Error: ngram_read_arpa: LR 1-gram: failed to parse, corrupted or invalid data?\n");
+      jlog("Error: ngram_read_arpa: 1-gram: failed to parse, corrupted or invalid data?\n");
       return FALSE;
     }
     prob = (LOGPROB)atof(p);
     if ((p = strtok(NULL, DELM)) == NULL) {
-      jlog("Error: ngram_read_arpa: LR 1-gram: failed to parse, corrupted or invalid data?\n");
+      jlog("Error: ngram_read_arpa: 1-gram: failed to parse, corrupted or invalid data?\n");
       return FALSE;
     }
     name = strcpy((char *)mymalloc(strlen(p)+1), p);
@@ -222,7 +222,7 @@
     } else {
       bo_wt = (LOGPROB)atof(p);
     }
-  
+
     /* add bo_wt_rl to existing 1-gram entry */
     nid = ngram_lookup_word(ndata, name);
     if (nid == WORD_INVALID) {
@@ -251,9 +251,8 @@
 }
 
 /** 
- * Read reverse 2-gram data from RL 3-gram file, and set RL 2-gram
- * probabilities and back-off values for RL 3-gram to the corresponding
- * LR 2-gram data.
+ * Read forward 2-gram data and set the LR 2-gram probabilities to the
+ * already loaded RL N-gram.
  * 
  * @param fp [in] file pointer
  * @param ndata [i/o] N-gram to set the read data.
@@ -381,7 +380,7 @@
   cid = cid_last = NNID_INVALID;
   for(i=0;i<n;i++) w_last[i] = WORD_INVALID;
 
-  /* read in 2-gram */
+  /* read in N-gram */
   for (;;) {
     if (getl(buf, sizeof(buf), fp) == NULL || buf[0] == '\\') break;
     strcpy(pbuf, buf);
@@ -389,7 +388,7 @@
       jlog("Stat: ngram_read_arpa: %d-gram read %d (%d%%)\n", n, nnid, nnid * 100 / t->totalnum);
     }
 
-    /* 2-gram probability */
+    /* N-gram probability */
     if ((s = strtok(buf, DELM)) == NULL) {
       jlog("Error: ngram_read_arpa: %d-gram: failed to parse, corrupted or invalid data?\n", n);
       return FALSE;
@@ -656,7 +655,43 @@
     
     /* set unknown (=OOV) word id */
     set_unknown_id(ndata);
-    
+
+    /* swap <s> and </s> for backward SRILM N-gram */
+    if (ndata->dir == DIR_RL) {
+      WORD_ID bos, eos;
+      char *p;
+      bos = ngram_lookup_word(ndata, BEGIN_WORD_DEFAULT);
+      eos = ngram_lookup_word(ndata, END_WORD_DEFAULT);
+      if (!ndata->bos_eos_swap) {
+	/* check */
+	if (bos != WORD_INVALID && eos != WORD_INVALID && ndata->d[0].prob[bos] == -99) {
+	  jlog("Stat: \"P(%s) = -99\" in reverse N-gram, may be trained by SRILM\n", BEGIN_WORD_DEFAULT);
+	  jlog("Stat: going to swap \"%s\" and \"%s\"\n", BEGIN_WORD_DEFAULT, END_WORD_DEFAULT);
+	  ndata->bos_eos_swap = TRUE;
+	}
+      }
+      if (ndata->bos_eos_swap) {
+	if (bos == WORD_INVALID) {
+	  jlog("Error: ngram_read_arpa: try to swap bos/eos but \"%s\" not found in N-gram\n", BEGIN_WORD_DEFAULT);
+	}
+	if (eos == WORD_INVALID) {
+	  jlog("Error: ngram_read_arpa: try to swap bos/eos but \"%s\" not found in N-gram\n", END_WORD_DEFAULT);
+	}
+	if (bos == WORD_INVALID || eos == WORD_INVALID) {
+	  return FALSE;
+	}
+	/* do swap */
+	jlog("Stat: ngram_read_arpa: swap \"%s\" and \"%s\" at backward N-gram\n", BEGIN_WORD_DEFAULT, END_WORD_DEFAULT);
+	/* swap name buffer */
+	p = ndata->wname[bos];
+	ndata->wname[bos] = ndata->wname[eos];
+	ndata->wname[eos] = p;
+	/* replace index */
+	ptree_replace_data(BEGIN_WORD_DEFAULT, eos, ndata->root);
+	ptree_replace_data(END_WORD_DEFAULT, bos, ndata->root);
+      }
+    }
+
   }
     
 #ifdef CLASS_NGRAM
Index: julius4/libsent/src/ngram/ngram_util.c
diff -u julius4/libsent/src/ngram/ngram_util.c:1.4 julius4/libsent/src/ngram/ngram_util.c:1.5
--- julius4/libsent/src/ngram/ngram_util.c:1.4	Wed Jan 21 00:48:04 2009
+++ julius4/libsent/src/ngram/ngram_util.c	Tue Feb 10 02:27:46 2009
@@ -12,7 +12,7 @@
  * @author Akinobu LEE
  * @date   Wed Feb 16 17:18:55 2005
  *
- * $Revision: 1.4 $
+ * $Revision: 1.5 $
  * 
  */
 /*
@@ -92,7 +92,7 @@
     fprintf(fp, "\t        OOV word = %s(id=%d)\n", ndata->wname[ndata->unk_id],ndata->unk_id);
     fprintf(fp, "\t        OOV size = %d words in dict\n", ndata->unk_num);
   } else {
-    fprintf(fp, "\t        OOV word = none\n");
+    fprintf(fp, "\t        OOV word = none (assume close vocabulary)\n");
   }
   fprintf(fp, "\t    wordset size = %d\n", ndata->max_word_num);
   for(i=0;i<ndata->n;i++) {


Julius

[Julius-cvs 382] CVS update: julius4/libsent/src/ngram