[RavenclawDev 238] [15] Tools: Moved makelm into tools.

tk@edam.speech.cs.cmu.edu tk at edam.speech.cs.cmu.edu
Thu Apr 5 10:26:19 EDT 2007


An HTML attachment was scrubbed...
URL: http://mailman.srv.cs.cmu.edu/pipermail/ravenclaw-developers/attachments/20070405/c0857a14/attachment-0001.html
-------------- next part --------------
Added: Tools/CMU-Cam_Toolkit_v2/README
===================================================================
--- Tools/CMU-Cam_Toolkit_v2/README	                        (rev 0)
+++ Tools/CMU-Cam_Toolkit_v2/README	2007-04-05 14:26:18 UTC (rev 15)
@@ -0,0 +1,84 @@
+CMU-Cambridge Statistical Language Modeling Tookit v2
+=====================================================
+
+Documentation:
+--------------
+
+For installation and usage instructions for the toolkit, see 
+
+doc/toolkit_documentation.html
+
+(for the sake of convenience, the installation instructions are also
+given below).
+
+Installation:
+-------------
+
+For "big-endian" machines (eg those running HP-UX, IRIX, SunOS,
+Solaris) the installation procedure is simply to type
+
+  cd src
+  make install
+
+The executables will then be copied into the bin/ directory, and the
+library file SLM2.a will be copied into the lib/ directory.
+
+For "little-endian" machines (eg those running Ultrix, Linux) the
+variable "BYTESWAP_FLAG" will need to be set in the Makefile. This can
+be done by editing src/Makefile directly, so that the line
+
+#BYTESWAP_FLAG  = -DSLM_SWAP_BYTES
+
+is changed to 
+
+BYTESWAP_FLAG  = -DSLM_SWAP_BYTES
+
+Then the program can be installed as before.
+
+If you are unsure of the "endian-ness" of your machine, then the shell
+script endian.sh should be able to provide some assistance.
+
+In case of problems, then more information can be found by examining
+src/Makefile.
+
+Files:
+------
+
+endian.sh  Shell script to report "endian-ness" (see installation 
+   instructions). Not terribly robust; needs to be able to see gcc, 
+   for example.
+
+doc/toolkit_documentation.html   The standard html documentation for the 
+   toolkit. View using netscape or equivalent.
+
+doc/toolkit_documentation_no_tables.html   As above, but doesn't use 
+   tables, so is suitable for use with browsers which don't support
+   tables (eg lynx).
+
+doc/toolkit_documentation.txt   The documentation in flat text.
+
+doc/change_log.html   List of changes from version to version.
+
+doc/change_log.txt   The above in flat text. 
+
+src/*.c src/*.h  The toolkit source files
+
+src/Makefile  The standard make file.
+
+src/install-sh  Shell script to install executables in the appropriate
+   directory. An improvement on cp, as it will check to see whether it is 
+   about to overwrite an execuatable which is already in use.
+
+include/SLM2.h   File containing all of src/*.h, allowing 
+   functions from the toolkit to be included in new software.
+
+bin/   Directory where executables will be installed.
+
+lib/   Directory where SLM2.a will be stored (useful in conjunction with 
+   include/SLM2.h for including functions from the toolkit to be included 
+   in new software.)
+
+
+
+
+

Added: Tools/CMU-Cam_Toolkit_v2/bin/binlm2arpa.exe
===================================================================
(Binary files differ)


Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/binlm2arpa.exe
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Added: Tools/CMU-Cam_Toolkit_v2/bin/blah.idngram
===================================================================
(Binary files differ)


Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/blah.idngram
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Added: Tools/CMU-Cam_Toolkit_v2/bin/evallm.exe
===================================================================
(Binary files differ)


Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/evallm.exe
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Added: Tools/CMU-Cam_Toolkit_v2/bin/idngram2lm.exe
===================================================================
(Binary files differ)


Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/idngram2lm.exe
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Added: Tools/CMU-Cam_Toolkit_v2/bin/idngram2stats.exe
===================================================================
(Binary files differ)


Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/idngram2stats.exe
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Added: Tools/CMU-Cam_Toolkit_v2/bin/interpolate.exe
===================================================================
(Binary files differ)


Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/interpolate.exe
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Added: Tools/CMU-Cam_Toolkit_v2/bin/mergeidngram.exe
===================================================================
(Binary files differ)


Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/mergeidngram.exe
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Added: Tools/CMU-Cam_Toolkit_v2/bin/ngram2mgram.exe
===================================================================
(Binary files differ)


Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/ngram2mgram.exe
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Added: Tools/CMU-Cam_Toolkit_v2/bin/text2idngram.exe
===================================================================
(Binary files differ)


Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/text2idngram.exe
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Added: Tools/CMU-Cam_Toolkit_v2/bin/text2wfreq.exe
===================================================================
(Binary files differ)


Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/text2wfreq.exe
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Added: Tools/CMU-Cam_Toolkit_v2/bin/text2wngram.exe
===================================================================
(Binary files differ)


Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/text2wngram.exe
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Added: Tools/CMU-Cam_Toolkit_v2/bin/wfreq2vocab.exe
===================================================================
(Binary files differ)


Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/wfreq2vocab.exe
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Added: Tools/CMU-Cam_Toolkit_v2/bin/wngram2idngram.exe
===================================================================
(Binary files differ)


Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/wngram2idngram.exe
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Added: Tools/CMU-Cam_Toolkit_v2/endian.sh
===================================================================
--- Tools/CMU-Cam_Toolkit_v2/endian.sh	                        (rev 0)
+++ Tools/CMU-Cam_Toolkit_v2/endian.sh	2007-04-05 14:26:18 UTC (rev 15)
@@ -0,0 +1,29 @@
+#!/bin/sh
+
+# Dumb shell script to report endianness.
+
+echo "  
+  #define BIG_ENDIAN      0
+  #define LITTLE_ENDIAN   1
+
+  int little_endian(void)
+  {
+      short int w = 0x0001;
+      char *byte = (char *) &w;
+      return(byte[0] ? LITTLE_ENDIAN : BIG_ENDIAN);
+  }
+
+main () {  
+  if(!little_endian()) {
+     printf(\"Big-endian, DO NOT set -DSLM_SWAP_BYTES in Makefile\\n\");
+  } 
+  else {
+     printf(\"Little-endian, set -DSLM_SWAP_BYTES in Makefile\\n\");
+  } 
+}" > test_endian.c
+gcc test_endian.c -o test_endian
+
+# Can use cc if gcc not available.
+
+./test_endian
+rm -f test_endian test_endian.c

Added: Tools/CMU-Cam_Toolkit_v2/test_endian.exe
===================================================================
(Binary files differ)


Property changes on: Tools/CMU-Cam_Toolkit_v2/test_endian.exe
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Added: Tools/generate_random_samples.pl
===================================================================
--- Tools/generate_random_samples.pl	                        (rev 0)
+++ Tools/generate_random_samples.pl	2007-04-05 14:26:18 UTC (rev 15)
@@ -0,0 +1,1015 @@
+#!/usr/bin/perl
+# This line tells emacs, that this is a -*-Perl-*- program
+
+$licence = <<LICENCE;
+
+Copyright (C) 1996 Interactive System Labs and Klaus Ries
+
+LICENCE
+
+$n_default=10;
+$break_prob_default = 0;
+$leave_out_prob_default = 0.1;
+$repeat_prob_default = 0.25;
+
+$n=$n_default;
+$break_prob = $break_prob_default;
+$leave_out_prob = $leave_out_prob_default;
+$repeat_prob = $repeat_prob_default;
+
+
+while($ARGV[0] =~ /^-.*$/) {	# PROCESS FLAGS
+$_ = shift;
+if($_ eq "-h") {
+$help++;
+next;
+}
+if($_ eq "-n") {
+$n=shift;
+next;
+}
+if($_ eq "-leave_out") {
+$leave_out_prob=shift;
+next;
+}
+if($_ eq "-repeat") {
+$repeat_prob=shift;
+next;
+}
+if($_ eq "-show_break") {
+$show_break++;
+next;
+}
+if($_ eq "-d") {
+$dir=shift;
+next;
+}
+if($_ eq "-forms") {
+$forms=shift;
+next;
+}
+if($_ eq "-warn") {
+$warn++;
+next;
+}
+if($_ eq "-capital") {
+$capital++;
+next;
+}
+if($_ eq "-dictionary") {
+$dictionary++;
+next;
+}
+if($_ eq "-makedict") {
+$makedict++;
+next;
+}
+if($_ eq "-unkdict") {
+$unkdict++;
+next;
+}
+if($_ eq "-vocabulary") {
+$vocabulary++;
+next;
+}
+if($_ eq "-class") {
+$classes++;
+next;
+}
+if($_ eq "-noclass") {
+$noclasses++;
+next;
+}
+if($_ eq "-novariants") {
+$novariants++;
+next;
+}
+if($_ eq "-iclass") {
+$inverse_map++;
+next;
+}
+if($_ eq "-modifyLM") {
+$modify++;
+next;
+}
+if($_ eq "-clausi") {
+$clausi++;
+next;
+}
+die "Unrecognized command line option $_\nUse generate_random_samples -h for help\n";
+}
+
+help() if($help);
+
+if(defined $dir) {
+$currdir=$ENV{PWD};
+die "Cannot change to $dir\n" unless chdir $dir;
+warn "Changed to $dir\n";
+}
+
+
+# First load the forms of that grammer:
+
+$FORMS = "forms";
+$FORMS .= ".$forms" if defined $forms;
+
+open FORMS;
+
+while(<FORMS>) {
+
+    $random_perm++ if /%%randomperm%%/;
+    
+    $prob=get_prob();
+    
+    if(s/^\s*FUNCTION:\s*(\S+)//) {
+	my @F = ();
+	my @G = ();
+	$name=$1;
+	print STDERR "The name of the function is >>$name<<\n";
+	undef $nets;
+	push(@top_level,$name);
+	push(@top_level_prob,$prob);
+	push(@top_level_slots,\@F);
+	push(@top_level_probs,\@G);
+	push(@top_level_perm,$random_perm);
+	undef $random_perm;
+	$top_level{$name}++;
+    }    
+    next unless defined $name;
+    
+    if(/^\s*NETS:/) {
+	$nets++; next;
+    }
+    elsif($nets) {
+	1;
+    }
+    else {
+	next;
+    }
+    
+    if(s/^(\t|\s{2,})(\S+)/$2/) {
+	$entry=$2;
+	if(defined $top_level{$name,$entry}) {
+	    warn "Multiple entry for $entry in function $name\n" ;
+	    next;
+	}
+	push(@{$top_level_slots[$#top_level_slots]},$entry);
+	push(@{$top_level_probs[$#top_level_probs]},$prob);
+	$top_level{$name,$entry}++;
+	$public{$entry}++;
+    }
+}
+close FORMS;
+
+#warn join($/, @{$top_level_slots[$#top_level_slots]}), $/;
+
+normalize_random_entry(\@top_level_prob);
+
+foreach(@top_level_prob) {
+normalize_random_entry($_);
+}
+
+
+die "No top-level slots" unless () ne keys %top_level;
+
+# Now load the nets themselves
+
+$NETS = "nets";
+
+open NETS;
+while(<NETS>) {
+    next unless /\S/;
+    chomp;
+    s/\s+$//;
+    
+    if(defined $nets{$_} || defined $nets{"[$_]"}) {
+	warn "Multiple entry for >>$_<< in nets\n" ;
+	next;
+    }
+    $nets{$_}++;
+    
+#if(/[^A-Za-z]/) {
+    $_="[$_]";
+#}
+    $public{$_}++;
+}
+close NETS;
+
+$nets{"noises"}++ if -r "noises.gra";
+
+ at noise=("SegmentNoise","BeginNoise","EndNoise","BreakNoise","RandomNoise");
+
+if($nets{"noises"}) {
+    foreach(@noise) {	
+	$public{$_}++;
+    }
+}
+
+# Next thing: Load all the grammars
+
+#foreach(keys %nets) {
+#    print STDERR "Loading grammar file $_ ............\r";
+#    
+#    $NAME=$_;
+#    load_grammar_file("$NAME.gra");
+#}
+load_grammar_file('zap2flat.gra');
+
+if($nets{"noises"}) {
+    foreach(@noise) {	
+	undef $public{$_}  unless defined $rules{$_};
+    }
+}
+
+chdir $currdir if defined $currdir;
+print STDERR "Finished loading grammar file                                         \n";
+
+
+# Next thing: Generate sentences
+
+srand;
+
+check_grammar() if $warn;
+
+
+if($dictionary || $vocabulary || $makedict || $unkdict) { # Print all the defined dictionary/vocabulary entries
+foreach $slot_ref (@top_level_slots) {
+    foreach(@{$slot_ref}) {
+	$slots{$_}++;
+    }
+}
+ at netname = keys %slots;
+foreach(@noise) {
+push(@netname,$_) if defined $public{$_};
+}
+
+build_reachable_dictionary();
+
+
+open(OUT,"| sort -u | perl -pe 's/\\(0\\)\\}/\\}/;'");
+if($vocabulary) {
+foreach(keys %reachable_t) {
+    print OUT "$_\n";
+}
+print OUT "<s>\n</s>\n";
+}
+elsif($dictionary) {
+foreach(keys %reachable_t) {
+    if($class{$_}) {
+	$_=join("\n",print_net_all($_));
+    }
+    print OUT "$_\n";
+}
+}
+else {			# Makedict !!
+			# or unkdict !!
+# Calculate all words that we need from the dictionary
+# Assign an empty list to all of them to get
+# the effective entries
+
+foreach(keys %reachable_t) {
+    next if $class{$_};
+    if(/\266/) {	# In case we have entered that phrase to
+			# the dictionary
+	next if defined $vocab{$_};
+	my @F=();
+	$vocab{$_}=\@F;
+    }
+    if(/\266S$/) {
+	my $word2=$_;
+	$word2=~s/\266S$/\'S/;
+	next if defined $vocab{$word2};
+	my @F=();
+	$vocab{$word2}=\@F;
+    }
+    foreach(split(/\266/,$_)) {
+	next if defined $vocab{$_};
+	my @F=();
+	$vocab{$_}=\@F;
+    }
+}
+foreach(keys %class) {
+    next unless $reachable{$_};
+    foreach(print_net_all($_)) {
+	($word,$class)=split(/:/,$_,2);
+	if($word=~/\266/) { # In case we have entered that phrase to
+			    # the dictionary
+	    next if defined $vocab{$word};
+	    my @F=();
+	    $vocab{$word}=\@F;
+	}
+	if($word=~/\266S$/) {
+	    my $word2=$word;
+	    $word2=~s/\266S$/\'S/;
+	    next if defined $vocab{$word2};
+	    my @F=();
+	    $vocab{$word2}=\@F;
+	}
+	foreach(split(/\266/,$word)) {
+	    next if defined $vocab{$_};
+	    my @F=();
+	    $vocab{$_}=\@F;
+	}
+    }
+}
+
+# Now selectively read the real dictionary from stdin/commandline
+# Also read pronounciation variants
+
+while(<>) {
+    s/^\{?\s*([^\s\}\(]+)(\([^\)]+\))?\s*\}?\s+//;
+    $word=$1;
+    $ref = $vocab{$word};	    
+    s/^\s*\{?\s*//;
+    s/\s*\}?\s*$//;	    
+    foreach $phonem (split) {
+	next unless $phonem=~/\S/ && $phonem=~/^[a-z]/i;
+	$phonem{$phonem}++;
+    }
+    next unless defined $ref;
+    next if $novariants && @{$ref}>0;
+    $pron=$_;
+    push(@{$ref},$pron) unless grep($_ eq $pron,@{$ref})>0;
+}
+
+
+if($unkdict) {
+    while(($word,$pron) = each %vocab) {
+	next if $word=~/\266/;
+	print OUT "$word\n" unless @{$pron} ;
+    }
+}
+else {
+    @phonem = sort { $phonem{$a} <=> $phonem{$b} }
+    grep($phonem{$_}>2 && $_ ne "SIL",keys %phonem);
+    
+    # Pick a very seldom phonem for the words that are artifically in the dictionary
+    # to keep the vocabulary module happy
+    # These are specifically the class symbols
+
+    $badphonem=$phonem[0];
+    $badphonem.=" $badphonem";$badphonem.=" $badphonem";
+    $badphonem.=" $badphonem";$badphonem.=" $badphonem";
+
+    foreach(keys %reachable_t) {
+	next if $class{$_};
+	print OUT lookup_pronounciation($_,split(/\266/))
+    }
+    foreach(keys %class) {
+	next unless $reachable{$_};
+	print OUT "\{$_\} \{$badphonem\}\n";
+	foreach $symbol (print_net_all($_)) {
+	    ($word,$class)=split(/:/,$symbol,2);
+	    print OUT lookup_pronounciation($symbol,split(/\266/,$word));
+	}
+    }
+    print OUT "{\$} {SIL}\n{(} {SIL}\n{)} {SIL}\n";
+}
+close OUT;
+}
+}
+elsif($classes) {
+foreach $class (keys %class) {
+$_=join(" ",$class,print_net_all($class));
+print "$_\n";
+}
+}
+elsif($inverse_map) {
+foreach $class (keys %class) {
+foreach $classword (print_net_all($class)) {
+    ($word)=split(/:/,$classword);
+    $imap=$inversemap{$word};
+    if(defined $imap) {
+       push(@{$imap},$classword);
+    }
+    else {
+       my(@F)=($classword);
+       $inversemap{$word}=\@F;
+    }
+}
+}
+while(($word,$imap)=each %inversemap) {
+print join("\#",@{$imap})." $word\n";
+}
+}
+elsif($modify) {
+
+while(<>) {
+if(/^\\subsmodel:/) {
+    $skipsmodel++;
+    last;
+}
+print;
+}			       
+if($skipsmodel) {
+while(<>) {
+    if(/^\\end\\/) {
+	last;
+    }
+}	
+print while <>;
+}
+
+print "\\subsmodel:\n";
+foreach $class (sort keys %class) {
+print "-99.9 $class $class\n";
+ at classmembers = print_net_all($class);
+$classpenalty = -log(scalar(@classmembers))/log(10);
+foreach $classmember (sort @classmembers) {
+    $_="$classpenalty $classmember $class\n";
+    print;
+}
+}
+print "\\end\\\n";
+}
+else {				# Print random sentences
+foreach($i=0;$i<$n;$i++) { # Select a top-level frame and generate a sentence for it
+print_random_sentence(select_random_entry(\@top_level_prob,"FORM"));    
+}
+}
+
+
+sub print_random_sentence {
+
+    my($select) = @_;
+    my($select_local,$parse,$parse_local);
+    
+    #warn "print_random_sentence: top_level[select] -> $top_level[$select] >> $/";
+    
+    if($top_level_perm[$select]) {
+	begin_noise();
+	$break=1;
+	while($break) {
+	    $break=0;
+	    print_net_random(${$top_level_slots[$select]}[$select_local]);
+	    if($break) {
+		break_noise();
+	    }
+	    else {
+		segment_noise();
+	    }
+	}
+	end_noise();
+    }
+    else {
+	begin_noise();
+	#foreach $select_local (@{$top_level_slots[$select]}) {
+	# tk - fix 
+	# I don't understand, but without this fix every line would be a sucession of each net.
+	$select_local = $top_level_slots[$select][int(rand(scalar(@{$top_level_slots[$select]})))];
+	$break=1;
+	while($break) {
+	    $break=0;
+	    print_net_random($select_local);
+	    if($break) {
+		break_noise();
+	    }
+	    else {
+		segment_noise();
+	    }
+	}
+	#}
+	end_noise();
+    }
+    print "\n";
+}
+
+sub begin_noise {
+print_net_random("BeginNoise") if defined $public{"BeginNoise"};
+}    
+
+
+sub end_noise {
+print_net_random("EndNoise") if defined $public{"EndNoise"};
+}    
+
+
+sub segment_noise {
+print_net_random("SegmentNoise") if defined $public{"SegmentNoise"};
+}    
+
+sub break_noise {
+print_net_random("BreakNoise") if defined $public{"BreakNoise"};
+}    
+
+sub random_noise {
+print_net_random("RandomNoise") if defined $public{"RandomNoise"};
+}    
+
+sub random_break {
+if(1-$break_prob<rand()) {
+$break++;
+print " **<** " if $show_break;
+}
+}    
+
+
+sub select_random_entry {
+    my($array,$name) = @_;
+    my $prob=0;
+    my $random = rand();
+    my $return=0;
+    my $i=0;
+    
+    foreach(@$array) {
+	$prob+=$_;
+	return $i if $random<$prob;
+	$i++;
+    }
+    die "$name not a probability distribution: ".join(" ",@{$array})."\n";
+}
+
+sub normalize_random_entry {
+local($array) = @_;
+my $i;
+my $prob = 0;
+my $zero = 0;
+
+if(@{$array}==1) {
+$$array[0]=1;
+return;
+}
+
+for( $i=0; $i<=$#$array; $i++ ) {
+$prob+=$$array[$i];
+if($$array[$i] == 0) {
+    $zero++;
+}
+}
+die "Not a probability distribution $prob>1\n" if $prob>1;
+
+if($zero>0) {
+$prob = (1-$prob)/$zero;
+for( $i=0; $i<=$#$array; $i++ ) {
+    $$array[$i] = $prob if $$array[$i]==0;
+}
+}	
+}
+
+sub print_net_random {
+
+    my($netname) = @_;
+    my($repeat,$type,$body);
+    
+    my($text) = "";
+    
+    unless(defined $rules{$netname}) {
+	warn  "Rule for $netname not defined\n" if $warn;
+	return "";
+    }
+    
+    my($rule_ref) = ${$rules{$netname}}[select_random_entry($rules_prob{$netname},$netname)];
+    
+    foreach(@$rule_ref) {
+	($repeat,$type,$body) = split(/,/,$_);
+	
+	while(1) {
+	    if($repeat=~s/\*//g) {
+		last if rand()<$leave_out_prob;
+		$repeat="1" unless $repeat=~/\S/;
+	    }
+	    if($type eq "T" || $class{$body}) {
+		if($clausi) {
+		    print "$body\n"; random_noise();
+		}
+		else {
+		    print "$body "; random_noise();
+		}
+	    }
+	    else {
+		print_net_random($body);
+		random_break();	    
+	    }
+	    return if $break;
+	    last if $repeat eq "1";
+	    last unless rand()<$repeat_prob;
+	}
+    }
+}    
+
+
+
+sub print_net_all {
+local($global_netname) = @_;
+local(%already_visited);
+
+my(@return) = print_net_all2($global_netname);
+my($ret_length)=scalar(@return);
+ at return=grep(/\S/, at return);
+warn "$global_netname could expand to empty, ignored\n"
+unless @return==$ret_length;
+ at return=grep(($_.=":$global_netname") || 1, at return);
+return @return;
+}
+
+
+sub print_net_all2 {
+
+my($netname) = shift @_;
+my($repeat,$type,$body,$ruleref);
+my(@returnarray)=();
+
+unless(defined $rules{$netname}) {
+warn  "Rule for $netname not defined\n" if $warn;
+return ();
+}
+
+return @_ if $already_visited{$netname}>1;
+$already_visited{$netname}++;
+foreach $ruleref (@{$rules{$netname}}) {
+my(@rulereturn)=@_;
+foreach(@{$ruleref}) {
+    ($repeat,$type,$body) = split(/,/,$_);
+
+    my(@localrulereturn)=();
+    
+    if($repeat eq "*") {
+	@localrulereturn=@rulereturn;
+	@localrulereturn=("") unless @localrulereturn>0;
+    }
+    warn "Rule for $netname contains repetition $repeat specification -- ignored"
+	unless $repeat eq "1" || $repeat eq "*";
+
+    if($type eq "T") {
+	$body = "+$body+" if $body =~ s/^&//;
+	if(@rulereturn) {		    
+	    for($i=0;$i<=$#rulereturn;$i++) {
+		if($rulereturn[$i]=~/\S/) {
+		    $rulereturn[$i].="\266$body";
+		} else {
+		    $rulereturn[$i]="$body";
+		}
+	    }
+	}
+	else {		    
+	    @rulereturn=($body);
+	}
+    }
+    else {		    
+	@rulereturn=print_net_all2($body, at rulereturn);
+    }
+    push(@rulereturn, at localrulereturn);
+}
+push(@returnarray, at rulereturn);
+}
+$already_visited{$netname}--;
+return @returnarray;
+}    
+
+
+sub build_reachable_dictionary {
+
+my($netname);
+my($repeat,$type,$body,$rule_ref);
+
+my($text) = "";
+
+while(@netname) {
+$netname = shift @netname;
+
+if($class{$netname}) {
+    $reachable_t{$netname}++;
+    $reachable{$netname}++;
+    next;
+}
+
+unless(defined $rules{$netname}) {
+    warn  "Rule for $netname not defined in build_reachable_dictionary\n" if $warn;
+    return "";
+}
+
+foreach $rule_ref (@{$rules{$netname}}) {
+    foreach(@{$rule_ref}) {
+	($repeat,$type,$body) = split(/,/,$_);
+	if($type eq "T") {
+	    $reachable_t{$body}++;
+	}
+	else {
+	    unless(exists $reachable{$body}) {
+		push(@netname,$body);
+		$reachable{$body}++;
+	    }
+	}
+    }
+}
+}
+}		
+
+
+sub check_grammar {
+
+while(($definition) = each %public) {
+warn "Net rule $definition not defined\n" unless defined $rules{$definition};
+}
+while(($definition,$count) = each %definition) {
+for($i=1;$i<=$count;$i++) {
+    warn "$definition:$count not defined" unless defined $rules{"$definition:$count"};
+}
+}
+}
+
+
+sub get_prob {
+
+if(s/[\#;]([\s\S]+)//) {
+my $comment=$1;
+
+if($comment=~/%%CLASS%%/) {
+    $class=1;
+} else {
+    $class=0;
+}
+
+return $1 if $comment=~/%%(\S+)%%/;
+}
+$class=0;
+return 0;
+}
+
+
+sub load_grammar_file {
+local($GRAMMAR_FILE) = @_;
+local($concept, at rules, at rules_prob);
+local(*GRAMMAR);
+
+#    print STDERR "Reading $GRAMMAR_FILE of $NAME\n";
+
+$concept = "";
+ at rules   = ();
+$conceptclass=0;
+
+open(GRAMMAR,$GRAMMAR_FILE);
+while(<GRAMMAR>) {
+if(/^\#include\s+(\S+)/) {
+    flush_concept();
+    $conceptclass=0;
+    load_grammar_file($1,$NAME);
+}
+else {	    
+    # Get rid of comments and empty lines
+
+    $prob=get_prob();  next unless /\S/;
+  
+    # Classify line	    
+    if(/^(\S+)/) {		# It's a concept
+	$new_concept=$1;
+	flush_concept();
+	$conceptclass=$class;
+	$concept=$new_concept;
+    }
+    else {
+	die "No concept defined" unless $concept=~/\S/;
+	die "No begin-parenthesis in $concept rules $_\n" unless s/^\s+\(\s*//;
+	die "No end-parenthesis in $concept rules $_\n"   unless s/\s*\)\s*$//;
+
+	push(@rules,$_);
+	push(@rules_prob,$prob);
+    }
+}
+}
+flush_concept();
+close GRAMMAR;
+}
+
+
+sub select_random_array {
+local($array_ref) =  $_[0];;
+
+return @$array_ref[int(rand(scalar(@$array_ref)))];
+}
+
+
+sub flush_concept {
+
+# Determine the symbol-table-entry for the concept defined
+# If it is a top-level entry, it is globally visible.
+# Otherwise, it is only local and may be multiple definded and
+# overwritten
+
+return unless $concept=~/\S/;
+
+if($public{$concept}) {
+$real_name=$concept;
+}
+else {
+$definition{$NAME,$concept};
+$real_name="$NAME:$concept:".($definition{"$NAME:$concept"}+1);
+}
+
+die "Multiple definition of $real_name\n" if defined $rules{$real_name};
+
+
+my @new_rules = ();
+
+foreach(@rules) {
+my @rule = ();
+foreach $body (split) {
+    $repeat = "1";
+    $repeat = $1 if $body=~s/^([\+\*]+)//;
+    $body =~ s/^\s+//; $body =~ s/\s+$//; 
+    if( ($body=~/^[^A-Z_\-]/) && !($body =~ /^\[.*\]$/)) {
+	$type="T";
+	$body=~s/\+/'/g; # '
+	$body=~tr/a-z/A-Z/ if $capital;
+	$body = "+$body+" if $body =~ s/^&//;
+	$dictionary{$body}++;
+    }
+    else {
+	$type="N";
+
+	unless($public{$body}) {
+	    $body="$NAME:$body:".($definition{"$NAME:$body"}+1);
+	}
+    }
+    push(@rule,"$repeat,$type,$body");
+}
+
+push(@new_rules,\@rule);
+}
+
+$rules{$real_name}=\@new_rules;
+
+my @new_rules_prob = @rules_prob;
+normalize_random_entry(\@new_rules_prob);
+$rules_prob{$real_name}=\@new_rules_prob;
+
+
+unless($public{$concept}) {
+$definition{"$NAME:$concept"}++;
+}
+
+ at rules = ();
+ at rules_prob = ();
+$class{$real_name}++ if $conceptclass && !$noclasses;
+
+$concept="";
+
+}
+
+sub lookup_pronounciation {
+
+local($word) = shift @_;
+
+local($error)="";
+
+my(@pron);
+my($pronword)=join("\266", at _);
+
+if(defined $vocab{join("\266", at _)} && @{$vocab{join("\266", at _)}}>0) {
+ at pron = allchunk(join("\266", at _));
+}
+elsif(($pronword=~s/\266S$/\'S/) && (defined $vocab{$pronword}) && @{$vocab{$pronword}}>0) {
+ at pron = allchunk($pronword);
+}
+else {
+ at pron = allchunk(@_);
+}
+return $error if $error=~/\S/;	# Error case
+my($ret) = "{$word(0)} {$pron[0]}\n";
+for($i=1;$i<=$#pron;$i++) {
+$ret .= "{$word(".($i+1).")} {$pron[$i]}\n";
+}
+return $ret;
+}
+
+sub allchunk {
+my $prefix = shift @_;
+my $dictref = $vocab{$prefix};
+my @ret=();
+
+unless((defined $dictref) && (@{$dictref}>0)) {
+$error .= "ERROR: Not all subwords ($prefix) known for $word\n";
+allchunk(@_) unless @_==0;
+
+return ();
+}
+
+return @{$dictref} if @_==0;
+
+my($suff,$pref);
+
+foreach $suff (allchunk(@_)) {
+foreach $pref (@{$dictref}) {
+    push(@ret,"$pref $suff");
+}
+}
+return @ret;	
+}
+    
+	
+	
+
+
+sub help {
+print <<EOT;
+generate_random_samples -h -n N -break prob -show_break -d dir -forms ext -noclass -warn -capital
+		-clausi -repeat prob -leave_out prob -vocabulary -class -iclass
+		-dictionary -modifyLM [LanguageModel]
+
+
+Generates random sentences from a PHOENIX grammar that is in the
+current directory as if it were a stochastic context free grammmar.
+
+
+-h             --  print this help
+-n N           --  print N random sentences ( default $n_default )
+-break prob    --  probability of a sentence break ( default $break_prob_default )
+-show_break    --  visualizes the occurence of breaks for debugging
+-d dir         --  change to directory dir as the PHOENIX grammar dir
+-forms ext     --  use forms.ext instead of forms to generate from
+-noclass       --  don\'t use classes when generating text, which is more human-readable
+-warn          --  only test the grammar
+-capital       --  capitalize all words
+-clausi        --  use special output format
+-repeat prob   --  repetition probability ( default $repeat_prob_default )
+-leave_out prob--  leaving out an optional item probability ( default $leave_out_prob_default )
+-vocabulary    --  print the vocabulary used in the grammar usable for the LM
+-class         --  print the classes used
+-iclass        --  print the inverse classes used
+-dictionary    --  only print all the words in the grammar for pronounciation
+	   dictionary construction
+-modifyLM      --  takes an LM from stdin or from files after the options
+	   (possibly with old classes, these will be eliminated)
+	   and adds classes according to the specification by the grammar 
+-makedict      --  takes a dictionary from stdin or from files after the options
+	   and generates a pronounciation dictionary
+-unkdict       --  takes a dictionary from stdin or from files after the options
+	   and generates a list of unknown words related to that dictionary
+	   The assumtion is, that all word strings are comosed into subwords
+
+Written by Klaus Ries <ries\@cs.cmu.edu> <kries\@ira.uka.de>
+Copyright (C) 1996,1997 Interactive System Labs and Klaus Ries
+
+
+Detailed description:
+
+All productions are assumed to be equally likely, unless one specifies
+probabilities after some of them ( the remain probability mass is
+distributed over the other prodcuctions ) in the form   %%0.5%% ,
+which is interpreted as a comment by PHOENIX.
+A network can have the specification %%CLASS%% in a comment like in
+
+[townname_l]  # %%CLASS%% 
+( split )
+( bihac )
+(banja-luka)
+(sarajevo) # %%0.9%% since we are always talk about it
+(tuzla)
+(drvar)
+
+Elements within one class are assumed to be equiprobable.
+The specification of \"+\" is ignored and recursions are broken after
+the second visit of a net.
+
+The probability for leaving out an optional event is $leave_out_prob,
+the probability for repeating an repeatable event is $repeat_prob.
+One can either change these probabilies in the code or can rewrite the
+grammar such that one can specify them with corresponding rules.
+
+The generation of noises is possible if a file "noises.gra" is placed in
+same directory with the other grammar files, not all slots have to be present.
+The file "noises.gra" should look like
+______________________________________
+
+
+# &garbage is expanded to +garbage+ by definition, since "+" at
+# the beginning of a tokebn always means repetition
+BeginNoise      # called before every utterance
+( )                             # %%0.6%%
+( &garbage )                    # %%0.25%%
+( &garbage &garbage )
+( &garbage &garbage &garbage )
+
+EndNoise      # called after every utterance
+( )                             # %%0.8%%
+( &garbage )                    # %%0.15%%
+( &garbage &garbage )
+( &garbage &garbage &garbage )
+
+SegmentNoise # called between slots in the forms-file
+( )                             # %%0.97%%
+( &garbage )                    # %%0.015%%
+( &garbage &garbage )           # %%0.01%%
+( &garbage &garbage &garbage )
+
+BreakNoise   # called after a break occured instead of SegmentNoise
+( )                             # %%0.3%%
+( &garbage )                    # %%0.6%%
+( &garbage &garbage )
+
+RandomNoise  # Called after every word         
+( )                             # %%0.99%%
+( &garbage )                    # %%0.008%%
+( &garbage &garbage )
+______________________________________
+
+A break is a reset of a slot that is entered in the forms file
+and simulates restarts resp. corrections.
+It may occur at every word and a probabilty for this event can be specified.
+Since it is usually marked by discourse markers a separate noise slot is defined for
+this event.
+
+
+EOT
+exit(0);
+}

Added: Tools/makelm.pl
===================================================================
--- Tools/makelm.pl	                        (rev 0)
+++ Tools/makelm.pl	2007-04-05 14:26:18 UTC (rev 15)
@@ -0,0 +1,277 @@
+#!/usr/bin/perl
+
+use LWP::UserAgent;
+use HTTP::Request::Common;
+use File::Spec;
+use File::Copy;
+use File::stat;
+$ENV{'LC_COLLATE'} = 'C';
+$ENV{'LC_ALL'} = 'C';
+use locale;
+
+my $project = 'zap2';
+my $SAMPSIZE = scalar @ARGV? shift: 30000;
+
+$GRAMMARDIR = File::Spec->catdir(File::Spec->updir(), 'Grammar');
+$GRAMMARFILE = File::Spec->catfile($GRAMMARDIR, $project.'.gra');
+$FLATGRAMMARFILE = File::Spec->catfile($GRAMMARDIR, $project.'flat.gra');
+$CORPUS = 'tempfile';
+$BASEDIC = File::Spec->catfile($GRAMMARDIR, 'base.dic');
+$VOCAB = 'vocab';
+$CCS = 'temp.ccs';
+$PHOENIX = File::Spec->catfile($GRAMMARDIR, 'compile.exe');
+$TEXT2IDNGRAM = File::Spec->catfile('CMU-Cam_Toolkit_v2', 'bin', 'text2idngram');
+$IDNGRAM2LM = File::Spec->catfile('CMU-Cam_Toolkit_v2', 'bin', 'idngram2lm');
+$RANDOMSAMPS = 'generate_random_samples.pl';
+$IDNGRAM = $project.'.idngram';
+$LM = File::Spec->catfile(File::Spec->updir(), 'DecoderConfig', 'LanguageModel', 
+                          $project.'LM.arpa');
+$DICT = File::Spec->catfile(File::Spec->updir(), 'DecoderConfig', 'Dictionary',
+                            $project.'.dict');
+$REDUCED_DICT = $DICT.'.reduced_phoneset';
+
+#get language model
+#copy('../Grammar/ZapTask.gra', '../Grammar/Zap.gra');
+#copy('../Grammar/ZapTask.forms', '../Grammar/forms');
+#chdir '../Grammar';
+#system('mk_nets2.pl Zap.gra');
+#chdir '../MakeLM';
+#system("$PHOENIX -g $GRAMMARDIR -f ..\\Grammar\\Zap");
+&say('compile', 'compiling grammar...');
+chdir($GRAMMARDIR);
+system('cmp.bat');
+&say('compile', 'compiling language model...');
+chdir('../MakeLM');
+&say('compile', 'generating corpus...');
+&getcorpus($CORPUS);
+&say('compile', 'getting vocabulary...');
+&getvocab($BASEDIC, $VOCAB, $CCS);
+&say('compile', 'computing ngrams...');
+my $cmd = "$TEXT2IDNGRAM -vocab $VOCAB -temp . -write_ascii < $CORPUS > $IDNGRAM";
+&say('compile', $cmd);
+die "text2idngram failed$/" if system($cmd);
+&say('compile', 'computing language model...');
+$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -good_turing -disc_ranges 0 0 0 -ascii_input";
+#$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -absolute -ascii_input";
+#$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -ascii_input";
+&say('compile', "$cmd$/");
+die "idngram2lm failed$/" if system($cmd);
+
+#get dictionary
+&say('compile', 'compiling dictionary...');
+&getdict($DICT, $REDUCED_DICT, $VOCAB);
+&say('compile', "done\n");
+
+sub getcorpus {
+    my $corpusfile = shift;
+    #flatten grammar
+    open(ZAPGRA, $GRAMMARFILE) || die "Can't open grammar file$/";
+    open(ZAPFLAT, ">$FLATGRAMMARFILE") || die "Can't open grammar flat file$/";
+    print ZAPFLAT &flat(<ZAPGRA>);
+    close ZAPGRA;
+    close ZAPFLAT;
+
+    open(RANDOM, "$RANDOMSAMPS -n $SAMPSIZE -d $GRAMMARDIR|") || 
+	die "Cannot execute $RANDOMSAMPS$/";
+    open(CORPUS, ">$corpusfile") || die "Can't open $corpusfile$/";
+    binmode CORPUS;
+    while (<RANDOM>) {
+	$_ = uc($_);
+	chomp;
+	s/<\/?S> //g;
+	print CORPUS "<s> $_ </s>\n";
+    }
+    close CORPUS;
+}
+
+sub getvocab {
+    my $basefile = shift;
+    my $vocab = shift;
+    my $ccs = shift;
+    open(VOCAB, ">$vocab") || die "Can't open $vocab$/";
+    binmode VOCAB;
+    open(BASE, "<$basefile") || die "Can't open $basefile$/";
+    my @base = map { /(.*) .*/? uc("$1\n"): () } <BASE>;
+    print VOCAB grep !/<\/?S>/, sort(@base);
+    print VOCAB "<s>\n";
+    print VOCAB "</s>\n";
+    close VOCAB;
+    open(CCS, ">$ccs") || die "Can't open $ccs$/";
+    binmode CCS;
+    print CCS "<s>\n";
+    close CCS;
+}
+
+sub getdict {
+    my $dict = shift;
+    my $reduced = shift;
+    my $vocab = shift;
+    my @long;
+    open(DICT, ">$dict") || die "Can't open $dict$/";
+    binmode DICT;
+    open(REDUCED, ">$reduced") || die "Can't open $reduced$/";
+    binmode REDUCED;
+    open(VOCAB, "<$vocab") || die "Can't open $vocab$/";
+    for (<VOCAB>) {
+	s/=/_/g;
+	push @long, $_;
+    }
+    foreach (&shorten(@long)) {
+	s/_/=/g;
+	print DICT "$_\n";
+	s/\bIX\b/AX/g;
+	print REDUCED "$_\n";
+    }
+    close DICT;
+    close REDUCED;
+}
+
+sub shorten {
+    open (OUTFILE, ">outfile.tmp");
+    open (LONGFILE, ">longfile.tmp");
+
+    my $counter = 0;
+    my $not_done = 1;
+    my @listlongwords;
+
+    foreach (@_) {
+	my @blah = split / /, $_;
+	for my $i (@blah) {
+	    my $len = length $i;
+	    if ($len >= 35) {
+		for my $m (@listlongwords) {
+		    if ($m =~ /$i/) {
+			$not_done = 0;
+		    }
+		}
+		
+		if ($not_done) {
+		    push(@listlongwords, $i);
+		}
+		
+		$not_done = 1;
+		my @words = split /_/, $i;
+		for my $j (@words) {
+		    print LONGFILE "$j ";
+		}
+		print LONGFILE "\n";
+	    } else {
+		print OUTFILE "$i ";
+	    }
+	}
+    }
+    
+    close(OUTFILE);
+    close(LONGFILE);
+
+    my @diclines = 
+	grep !(/CONTENT-TYPE/ || /TEXT\/PLAIN/), &getdic('outfile.tmp');
+    unlink 'outfile.tmp';
+    my @longlines = stat('longfile.tmp')->size? 
+	grep !(/CONTENT-TYPE/ || /TEXT\/PLAIN/), &getdic('longfile.tmp'): ();
+    unlink 'longfile.tmp';
+
+    my $not_used = 1;
+    
+    for my $i (@listlongwords) {
+	chomp $i;
+	my $lotsaphones = "$i ";
+	my @word_parts = split /_/, $i;
+	for my $word (@word_parts) {
+	    $word =~ s/(\W)/\\$1/;
+	    #warn "the word to look up is $word\n";
+	    for my $j (@longlines) {
+		if ($not_used) {
+		    if ($j =~ /^$word\t/) {
+			my @pronounciation = split /\t/, $j;
+			chomp $pronounciation[1];
+			$lotsaphones .= "$pronounciation[1] ";
+			$not_used = 0;
+		    }
+		}
+	    }
+	    $not_used = 1;
+	}
+	$lotsaphones .= "\n";
+	push @diclines, $lotsaphones;
+    }	
+    return @diclines;
+}
+
+sub getdic {
+    if ($#_ == -1) {
+	die "Need the corpus location as an argument\n";
+    }
+
+    my $ua = new LWP::UserAgent;
+    my $res = $ua->request(POST 'http://fife.speech.cs.cmu.edu/cgi-bin/tools/lmtool.2.pl', 
+			   Content_Type => 'form-data', 
+			   Content => [formtype => 'simple', 
+				       corpus => [$_[0]], 
+				       #handdict => undef,
+				       #extrawords => undef,
+				       #phoneset => '40',
+				       #bracket => 'Yes',
+				       #model => 'Bigram',
+				       #class => 'nil',
+				       #discount => '0.5',
+				       submit => 'COMPILE KNOWLEDGE BASE']);
+
+    my $result;
+    if ($res->is_success) {
+	$result = $res->content;
+    } else {
+	die "Couldn't execute the perl script, probably error in the form$/";
+    }
+    
+    if ($result =~ /\!-- DIC.*ct\/\/(.*)\">/) {
+	
+	my $blah = "http://fife.speech.cs.cmu.edu/tools/product//$1";
+	$res = $ua->request(GET $blah);
+    
+	if ($res->is_success) {
+	    return split(/\n/, $res->content);
+	} else {
+	    die "Can't find dictionary file$/";
+	}
+    } else {
+	die "Couldn't parse the result: $result$/";
+    }
+}
+
+sub say {
+    print shift, ": ", shift, $/;
+}
+
+sub flat {
+    my @unflat = @_;
+    my @result;
+    for (@unflat) {
+	if (!s/^\s*\((.*)\)/$1/) {
+	    push @result, $_;
+	} else {
+	    my @stack;
+	    my %flathash;
+	    push(@stack, [split]);
+	    while (my $buffref = shift @stack) {
+		my $i = 0;
+		my @buff = @$buffref;
+		my $flat;
+		for (@buff) {
+		    if (/^\*(.*)/) {
+			$flat .= "$1 ";
+			push(@stack, [ @buff[0..$i-1], @buff[$i+1..$#buff] ]);
+		    } else {
+			$flat .= "$_ ";
+		    }
+		    $i++;
+		}
+		$flathash{$flat} = 1;
+	    }
+	    foreach (keys %flathash) {
+		push @result, "\t( $_)\n";
+	    }
+	}
+    }
+    @result;
+}


More information about the Ravenclaw-developers mailing list