[RavenclawDev 238] [15] Tools: Moved makelm into tools.
tk@edam.speech.cs.cmu.edu
Thu Apr 5 10:26:19 EDT 2007
Added: Tools/CMU-Cam_Toolkit_v2/README
===================================================================
--- Tools/CMU-Cam_Toolkit_v2/README (rev 0)
+++ Tools/CMU-Cam_Toolkit_v2/README 2007-04-05 14:26:18 UTC (rev 15)
@@ -0,0 +1,84 @@
+CMU-Cambridge Statistical Language Modeling Toolkit v2
+=====================================================
+
+Documentation:
+--------------
+
+For installation and usage instructions for the toolkit, see
+
+doc/toolkit_documentation.html
+
+(for the sake of convenience, the installation instructions are also
+given below).
+
+Installation:
+-------------
+
+For "big-endian" machines (eg those running HP-UX, IRIX, SunOS,
+Solaris) the installation procedure is simply to type
+
+ cd src
+ make install
+
+The executables will then be copied into the bin/ directory, and the
+library file SLM2.a will be copied into the lib/ directory.
+
+For "little-endian" machines (eg those running Ultrix, Linux) the
+variable "BYTESWAP_FLAG" will need to be set in the Makefile. This can
+be done by editing src/Makefile directly, so that the line
+
+#BYTESWAP_FLAG = -DSLM_SWAP_BYTES
+
+is changed to
+
+BYTESWAP_FLAG = -DSLM_SWAP_BYTES
+
+Then the program can be installed as before.
+
+If you are unsure of the "endian-ness" of your machine, then the shell
+script endian.sh should be able to provide some assistance.
+
+In case of problems, more information can be found by examining
+src/Makefile.
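+
+As a rough end-to-end sketch for a little-endian machine (this assumes
+a POSIX shell and GNU sed for the one-line Makefile edit; editing
+src/Makefile by hand works just as well):
+
+ sh endian.sh
+ sed -i 's/^#BYTESWAP_FLAG/BYTESWAP_FLAG/' src/Makefile
+ cd src
+ make install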
+
+Files:
+------
+
+endian.sh Shell script to report "endian-ness" (see installation
+ instructions). Not terribly robust; needs to be able to see gcc,
+ for example.
+
+doc/toolkit_documentation.html The standard html documentation for the
+ toolkit. View using netscape or equivalent.
+
+doc/toolkit_documentation_no_tables.html As above, but doesn't use
+ tables, so is suitable for use with browsers which don't support
+ tables (eg lynx).
+
+doc/toolkit_documentation.txt The documentation in flat text.
+
+doc/change_log.html List of changes from version to version.
+
+doc/change_log.txt The above in flat text.
+
+src/*.c src/*.h The toolkit source files
+
+src/Makefile The standard make file.
+
+src/install-sh Shell script to install executables in the appropriate
+ directory. An improvement on cp, as it will check to see whether it is
+ about to overwrite an executable which is already in use.
+
+include/SLM2.h File containing all of src/*.h, allowing
+ functions from the toolkit to be included in new software.
+
+bin/ Directory where executables will be installed.
+
+lib/ Directory where SLM2.a will be stored (useful in conjunction with
+ include/SLM2.h for linking functions from the toolkit into new
+ software; see the example below.)
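+
+ As a rough usage sketch (assuming gcc, and with myprog.c standing in
+ for your own source file), a program could be built against the
+ library from the toolkit's top-level directory with something like:
+
+ gcc -Iinclude myprog.c lib/SLM2.a -lm -o myprog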
+
+
+
+
+
Added: Tools/CMU-Cam_Toolkit_v2/bin/binlm2arpa.exe
===================================================================
(Binary files differ)
Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/binlm2arpa.exe
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: Tools/CMU-Cam_Toolkit_v2/bin/blah.idngram
===================================================================
(Binary files differ)
Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/blah.idngram
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: Tools/CMU-Cam_Toolkit_v2/bin/evallm.exe
===================================================================
(Binary files differ)
Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/evallm.exe
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: Tools/CMU-Cam_Toolkit_v2/bin/idngram2lm.exe
===================================================================
(Binary files differ)
Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/idngram2lm.exe
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: Tools/CMU-Cam_Toolkit_v2/bin/idngram2stats.exe
===================================================================
(Binary files differ)
Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/idngram2stats.exe
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: Tools/CMU-Cam_Toolkit_v2/bin/interpolate.exe
===================================================================
(Binary files differ)
Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/interpolate.exe
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: Tools/CMU-Cam_Toolkit_v2/bin/mergeidngram.exe
===================================================================
(Binary files differ)
Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/mergeidngram.exe
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: Tools/CMU-Cam_Toolkit_v2/bin/ngram2mgram.exe
===================================================================
(Binary files differ)
Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/ngram2mgram.exe
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: Tools/CMU-Cam_Toolkit_v2/bin/text2idngram.exe
===================================================================
(Binary files differ)
Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/text2idngram.exe
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: Tools/CMU-Cam_Toolkit_v2/bin/text2wfreq.exe
===================================================================
(Binary files differ)
Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/text2wfreq.exe
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: Tools/CMU-Cam_Toolkit_v2/bin/text2wngram.exe
===================================================================
(Binary files differ)
Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/text2wngram.exe
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: Tools/CMU-Cam_Toolkit_v2/bin/wfreq2vocab.exe
===================================================================
(Binary files differ)
Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/wfreq2vocab.exe
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: Tools/CMU-Cam_Toolkit_v2/bin/wngram2idngram.exe
===================================================================
(Binary files differ)
Property changes on: Tools/CMU-Cam_Toolkit_v2/bin/wngram2idngram.exe
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: Tools/CMU-Cam_Toolkit_v2/endian.sh
===================================================================
--- Tools/CMU-Cam_Toolkit_v2/endian.sh (rev 0)
+++ Tools/CMU-Cam_Toolkit_v2/endian.sh 2007-04-05 14:26:18 UTC (rev 15)
@@ -0,0 +1,29 @@
+#!/bin/sh
+
+# Dumb shell script to report endianness.
+
+echo "
+ #define BIG_ENDIAN 0
+ #define LITTLE_ENDIAN 1
+
+ int little_endian(void)
+ {
+ short int w = 0x0001;
+ char *byte = (char *) &w;
+ return(byte[0] ? LITTLE_ENDIAN : BIG_ENDIAN);
+ }
+
+main () {
+ if(!little_endian()) {
+ printf(\"Big-endian, DO NOT set -DSLM_SWAP_BYTES in Makefile\\n\");
+ }
+ else {
+ printf(\"Little-endian, set -DSLM_SWAP_BYTES in Makefile\\n\");
+ }
+}" > test_endian.c
+gcc test_endian.c -o test_endian
+
+# Can use cc if gcc not available.
+
+./test_endian
+rm -f test_endian test_endian.c
Added: Tools/CMU-Cam_Toolkit_v2/test_endian.exe
===================================================================
(Binary files differ)
Property changes on: Tools/CMU-Cam_Toolkit_v2/test_endian.exe
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: Tools/generate_random_samples.pl
===================================================================
--- Tools/generate_random_samples.pl (rev 0)
+++ Tools/generate_random_samples.pl 2007-04-05 14:26:18 UTC (rev 15)
@@ -0,0 +1,1015 @@
+#!/usr/bin/perl
+# This line tells emacs that this is a -*-Perl-*- program
+
+$licence = <<LICENCE;
+
+Copyright (C) 1996 Interactive System Labs and Klaus Ries
+
+LICENCE
+
+$n_default=10;
+$break_prob_default = 0;
+$leave_out_prob_default = 0.1;
+$repeat_prob_default = 0.25;
+
+$n=$n_default;
+$break_prob = $break_prob_default;
+$leave_out_prob = $leave_out_prob_default;
+$repeat_prob = $repeat_prob_default;
+
+
+while($ARGV[0] =~ /^-.*$/) { # PROCESS FLAGS
+$_ = shift;
+if($_ eq "-h") {
+$help++;
+next;
+}
+if($_ eq "-n") {
+$n=shift;
+next;
+}
+if($_ eq "-leave_out") {
+$leave_out_prob=shift;
+next;
+}
+if($_ eq "-repeat") {
+$repeat_prob=shift;
+next;
+}
+if($_ eq "-show_break") {
+$show_break++;
+next;
+}
+if($_ eq "-d") {
+$dir=shift;
+next;
+}
+if($_ eq "-forms") {
+$forms=shift;
+next;
+}
+if($_ eq "-warn") {
+$warn++;
+next;
+}
+if($_ eq "-capital") {
+$capital++;
+next;
+}
+if($_ eq "-dictionary") {
+$dictionary++;
+next;
+}
+if($_ eq "-makedict") {
+$makedict++;
+next;
+}
+if($_ eq "-unkdict") {
+$unkdict++;
+next;
+}
+if($_ eq "-vocabulary") {
+$vocabulary++;
+next;
+}
+if($_ eq "-class") {
+$classes++;
+next;
+}
+if($_ eq "-noclass") {
+$noclasses++;
+next;
+}
+if($_ eq "-novariants") {
+$novariants++;
+next;
+}
+if($_ eq "-iclass") {
+$inverse_map++;
+next;
+}
+if($_ eq "-modifyLM") {
+$modify++;
+next;
+}
+if($_ eq "-clausi") {
+$clausi++;
+next;
+}
+die "Unrecognized command line option $_\nUse generate_random_samples -h for help\n";
+}
+
+help() if($help);
+
+if(defined $dir) {
+$currdir=$ENV{PWD};
+die "Cannot change to $dir\n" unless chdir $dir;
+warn "Changed to $dir\n";
+}
+
+
+# First load the forms of that grammar:
+
+$FORMS = "forms";
+$FORMS .= ".$forms" if defined $forms;
+
+open FORMS;
+
+while(<FORMS>) {
+
+ $random_perm++ if /%%randomperm%%/;
+
+ $prob=get_prob();
+
+ if(s/^\s*FUNCTION:\s*(\S+)//) {
+ my @F = ();
+ my @G = ();
+ $name=$1;
+ print STDERR "The name of the function is >>$name<<\n";
+ undef $nets;
+ push(@top_level,$name);
+ push(@top_level_prob,$prob);
+ push(@top_level_slots,\@F);
+ push(@top_level_probs,\@G);
+ push(@top_level_perm,$random_perm);
+ undef $random_perm;
+ $top_level{$name}++;
+ }
+ next unless defined $name;
+
+ if(/^\s*NETS:/) {
+ $nets++; next;
+ }
+ elsif($nets) {
+ 1;
+ }
+ else {
+ next;
+ }
+
+ if(s/^(\t|\s{2,})(\S+)/$2/) {
+ $entry=$2;
+ if(defined $top_level{$name,$entry}) {
+ warn "Multiple entry for $entry in function $name\n" ;
+ next;
+ }
+ push(@{$top_level_slots[$#top_level_slots]},$entry);
+ push(@{$top_level_probs[$#top_level_probs]},$prob);
+ $top_level{$name,$entry}++;
+ $public{$entry}++;
+ }
+}
+close FORMS;
+
+#warn join($/, @{$top_level_slots[$#top_level_slots]}), $/;
+
+normalize_random_entry(\@top_level_prob);
+
+foreach(@top_level_prob) {
+normalize_random_entry($_);
+}
+
+
+die "No top-level slots" unless () ne keys %top_level;
+
+# Now load the nets themselves
+
+$NETS = "nets";
+
+open NETS;
+while(<NETS>) {
+ next unless /\S/;
+ chomp;
+ s/\s+$//;
+
+ if(defined $nets{$_} || defined $nets{"[$_]"}) {
+ warn "Multiple entry for >>$_<< in nets\n" ;
+ next;
+ }
+ $nets{$_}++;
+
+#if(/[^A-Za-z]/) {
+ $_="[$_]";
+#}
+ $public{$_}++;
+}
+close NETS;
+
+$nets{"noises"}++ if -r "noises.gra";
+
+@noise=("SegmentNoise","BeginNoise","EndNoise","BreakNoise","RandomNoise");
+
+if($nets{"noises"}) {
+ foreach(@noise) {
+ $public{$_}++;
+ }
+}
+
+# Next thing: Load all the grammars
+
+#foreach(keys %nets) {
+# print STDERR "Loading grammar file $_ ............\r";
+#
+# $NAME=$_;
+# load_grammar_file("$NAME.gra");
+#}
+load_grammar_file('zap2flat.gra');
+
+if($nets{"noises"}) {
+ foreach(@noise) {
+ undef $public{$_} unless defined $rules{$_};
+ }
+}
+
+chdir $currdir if defined $currdir;
+print STDERR "Finished loading grammar file \n";
+
+
+# Next thing: Generate sentences
+
+srand;
+
+check_grammar() if $warn;
+
+
+if($dictionary || $vocabulary || $makedict || $unkdict) { # Print all the defined dictionary/vocabulary entries
+foreach $slot_ref (@top_level_slots) {
+ foreach(@{$slot_ref}) {
+ $slots{$_}++;
+ }
+}
+@netname = keys %slots;
+foreach(@noise) {
+push(@netname,$_) if defined $public{$_};
+}
+
+build_reachable_dictionary();
+
+
+open(OUT,"| sort -u | perl -pe 's/\\(0\\)\\}/\\}/;'");
+if($vocabulary) {
+foreach(keys %reachable_t) {
+ print OUT "$_\n";
+}
+print OUT "<s>\n</s>\n";
+}
+elsif($dictionary) {
+foreach(keys %reachable_t) {
+ if($class{$_}) {
+ $_=join("\n",print_net_all($_));
+ }
+ print OUT "$_\n";
+}
+}
+else { # Makedict !!
+ # or unkdict !!
+# Calculate all words that we need from the dictionary
+# Assign an empty list to all of them to get
+# the effective entries
+
+foreach(keys %reachable_t) {
+ next if $class{$_};
+ if(/\266/) { # In case we have entered that phrase to
+ # the dictionary
+ next if defined $vocab{$_};
+ my @F=();
+ $vocab{$_}=\@F;
+ }
+ if(/\266S$/) {
+ my $word2=$_;
+ $word2=~s/\266S$/\'S/;
+ next if defined $vocab{$word2};
+ my @F=();
+ $vocab{$word2}=\@F;
+ }
+ foreach(split(/\266/,$_)) {
+ next if defined $vocab{$_};
+ my @F=();
+ $vocab{$_}=\@F;
+ }
+}
+foreach(keys %class) {
+ next unless $reachable{$_};
+ foreach(print_net_all($_)) {
+ ($word,$class)=split(/:/,$_,2);
+ if($word=~/\266/) { # In case we have entered that phrase to
+ # the dictionary
+ next if defined $vocab{$word};
+ my @F=();
+ $vocab{$word}=\@F;
+ }
+ if($word=~/\266S$/) {
+ my $word2=$word;
+ $word2=~s/\266S$/\'S/;
+ next if defined $vocab{$word2};
+ my @F=();
+ $vocab{$word2}=\@F;
+ }
+ foreach(split(/\266/,$word)) {
+ next if defined $vocab{$_};
+ my @F=();
+ $vocab{$_}=\@F;
+ }
+ }
+}
+
+# Now selectively read the real dictionary from stdin/commandline
+# Also read pronunciation variants
+
+while(<>) {
+ s/^\{?\s*([^\s\}\(]+)(\([^\)]+\))?\s*\}?\s+//;
+ $word=$1;
+ $ref = $vocab{$word};
+ s/^\s*\{?\s*//;
+ s/\s*\}?\s*$//;
+ foreach $phonem (split) {
+ next unless $phonem=~/\S/ && $phonem=~/^[a-z]/i;
+ $phonem{$phonem}++;
+ }
+ next unless defined $ref;
+ next if $novariants && @{$ref}>0;
+ $pron=$_;
+ push(@{$ref},$pron) unless grep($_ eq $pron,@{$ref})>0;
+}
+
+
+if($unkdict) {
+ while(($word,$pron) = each %vocab) {
+ next if $word=~/\266/;
+ print OUT "$word\n" unless @{$pron} ;
+ }
+}
+else {
+ @phonem = sort { $phonem{$a} <=> $phonem{$b} }
+ grep($phonem{$_}>2 && $_ ne "SIL",keys %phonem);
+
+ # Pick a very rare phoneme for the words that are artificially in the dictionary
+ # to keep the vocabulary module happy
+ # These are specifically the class symbols
+
+ $badphonem=$phonem[0];
+ $badphonem.=" $badphonem";$badphonem.=" $badphonem";
+ $badphonem.=" $badphonem";$badphonem.=" $badphonem";
+
+ foreach(keys %reachable_t) {
+ next if $class{$_};
+ print OUT lookup_pronounciation($_,split(/\266/))
+ }
+ foreach(keys %class) {
+ next unless $reachable{$_};
+ print OUT "\{$_\} \{$badphonem\}\n";
+ foreach $symbol (print_net_all($_)) {
+ ($word,$class)=split(/:/,$symbol,2);
+ print OUT lookup_pronounciation($symbol,split(/\266/,$word));
+ }
+ }
+ print OUT "{\$} {SIL}\n{(} {SIL}\n{)} {SIL}\n";
+}
+close OUT;
+}
+}
+elsif($classes) {
+foreach $class (keys %class) {
+$_=join(" ",$class,print_net_all($class));
+print "$_\n";
+}
+}
+elsif($inverse_map) {
+foreach $class (keys %class) {
+foreach $classword (print_net_all($class)) {
+ ($word)=split(/:/,$classword);
+ $imap=$inversemap{$word};
+ if(defined $imap) {
+ push(@{$imap},$classword);
+ }
+ else {
+ my(@F)=($classword);
+ $inversemap{$word}=\@F;
+ }
+}
+}
+while(($word,$imap)=each %inversemap) {
+print join("\#",@{$imap})." $word\n";
+}
+}
+elsif($modify) {
+
+while(<>) {
+if(/^\\subsmodel:/) {
+ $skipsmodel++;
+ last;
+}
+print;
+}
+if($skipsmodel) {
+while(<>) {
+ if(/^\\end\\/) {
+ last;
+ }
+}
+print while <>;
+}
+
+print "\\subsmodel:\n";
+foreach $class (sort keys %class) {
+print "-99.9 $class $class\n";
+@classmembers = print_net_all($class);
+$classpenalty = -log(scalar(@classmembers))/log(10);
+foreach $classmember (sort @classmembers) {
+ $_="$classpenalty $classmember $class\n";
+ print;
+}
+}
+print "\\end\\\n";
+}
+else { # Print random sentences
+foreach($i=0;$i<$n;$i++) { # Select a top-level frame and generate a sentence for it
+print_random_sentence(select_random_entry(\@top_level_prob,"FORM"));
+}
+}
+
+
+sub print_random_sentence {
+
+ my($select) = @_;
+ my($select_local,$parse,$parse_local);
+
+ #warn "print_random_sentence: top_level[select] -> $top_level[$select] >> $/";
+
+ if($top_level_perm[$select]) {
+ begin_noise();
+ $break=1;
+ while($break) {
+ $break=0;
+ print_net_random(${$top_level_slots[$select]}[$select_local]);
+ if($break) {
+ break_noise();
+ }
+ else {
+ segment_noise();
+ }
+ }
+ end_noise();
+ }
+ else {
+ begin_noise();
+ #foreach $select_local (@{$top_level_slots[$select]}) {
+ # tk - fix
+ # I don't understand, but without this fix every line would be a succession of each net.
+ $select_local = $top_level_slots[$select][int(rand(scalar(@{$top_level_slots[$select]})))];
+ $break=1;
+ while($break) {
+ $break=0;
+ print_net_random($select_local);
+ if($break) {
+ break_noise();
+ }
+ else {
+ segment_noise();
+ }
+ }
+ #}
+ end_noise();
+ }
+ print "\n";
+}
+
+sub begin_noise {
+print_net_random("BeginNoise") if defined $public{"BeginNoise"};
+}
+
+
+sub end_noise {
+print_net_random("EndNoise") if defined $public{"EndNoise"};
+}
+
+
+sub segment_noise {
+print_net_random("SegmentNoise") if defined $public{"SegmentNoise"};
+}
+
+sub break_noise {
+print_net_random("BreakNoise") if defined $public{"BreakNoise"};
+}
+
+sub random_noise {
+print_net_random("RandomNoise") if defined $public{"RandomNoise"};
+}
+
+sub random_break {
+if(1-$break_prob<rand()) {
+$break++;
+print " **<** " if $show_break;
+}
+}
+
+
+sub select_random_entry {
+ my($array,$name) = @_;
+ my $prob=0;
+ my $random = rand();
+ my $return=0;
+ my $i=0;
+
+ foreach(@$array) {
+ $prob+=$_;
+ return $i if $random<$prob;
+ $i++;
+ }
+ die "$name not a probability distribution: ".join(" ",@{$array})."\n";
+}
+
+sub normalize_random_entry {
+local($array) = @_;
+my $i;
+my $prob = 0;
+my $zero = 0;
+
+if(@{$array}==1) {
+$$array[0]=1;
+return;
+}
+
+for( $i=0; $i<=$#$array; $i++ ) {
+$prob+=$$array[$i];
+if($$array[$i] == 0) {
+ $zero++;
+}
+}
+die "Not a probability distribution $prob>1\n" if $prob>1;
+
+if($zero>0) {
+$prob = (1-$prob)/$zero;
+for( $i=0; $i<=$#$array; $i++ ) {
+ $$array[$i] = $prob if $$array[$i]==0;
+}
+}
+}
+
+sub print_net_random {
+
+ my($netname) = @_;
+ my($repeat,$type,$body);
+
+ my($text) = "";
+
+ unless(defined $rules{$netname}) {
+ warn "Rule for $netname not defined\n" if $warn;
+ return "";
+ }
+
+ my($rule_ref) = ${$rules{$netname}}[select_random_entry($rules_prob{$netname},$netname)];
+
+ foreach(@$rule_ref) {
+ ($repeat,$type,$body) = split(/,/,$_);
+
+ while(1) {
+ if($repeat=~s/\*//g) {
+ last if rand()<$leave_out_prob;
+ $repeat="1" unless $repeat=~/\S/;
+ }
+ if($type eq "T" || $class{$body}) {
+ if($clausi) {
+ print "$body\n"; random_noise();
+ }
+ else {
+ print "$body "; random_noise();
+ }
+ }
+ else {
+ print_net_random($body);
+ random_break();
+ }
+ return if $break;
+ last if $repeat eq "1";
+ last unless rand()<$repeat_prob;
+ }
+ }
+}
+
+
+
+sub print_net_all {
+local($global_netname) = @_;
+local(%already_visited);
+
+my(@return) = print_net_all2($global_netname);
+my($ret_length)=scalar(@return);
+@return=grep(/\S/,@return);
+warn "$global_netname could expand to empty, ignored\n"
+unless @return==$ret_length;
+@return=grep(($_.=":$global_netname") || 1,@return);
+return @return;
+}
+
+
+sub print_net_all2 {
+
+my($netname) = shift @_;
+my($repeat,$type,$body,$ruleref);
+my(@returnarray)=();
+
+unless(defined $rules{$netname}) {
+warn "Rule for $netname not defined\n" if $warn;
+return ();
+}
+
+return @_ if $already_visited{$netname}>1;
+$already_visited{$netname}++;
+foreach $ruleref (@{$rules{$netname}}) {
+my(@rulereturn)=@_;
+foreach(@{$ruleref}) {
+ ($repeat,$type,$body) = split(/,/,$_);
+
+ my(@localrulereturn)=();
+
+ if($repeat eq "*") {
+ @localrulereturn=@rulereturn;
+ @localrulereturn=("") unless @localrulereturn>0;
+ }
+ warn "Rule for $netname contains repetition $repeat specification -- ignored"
+ unless $repeat eq "1" || $repeat eq "*";
+
+ if($type eq "T") {
+ $body = "+$body+" if $body =~ s/^&//;
+ if(@rulereturn) {
+ for($i=0;$i<=$#rulereturn;$i++) {
+ if($rulereturn[$i]=~/\S/) {
+ $rulereturn[$i].="\266$body";
+ } else {
+ $rulereturn[$i]="$body";
+ }
+ }
+ }
+ else {
+ @rulereturn=($body);
+ }
+ }
+ else {
+ @rulereturn=print_net_all2($body,@rulereturn);
+ }
+ push(@rulereturn,@localrulereturn);
+}
+push(@returnarray,@rulereturn);
+}
+$already_visited{$netname}--;
+return @returnarray;
+}
+
+
+sub build_reachable_dictionary {
+
+my($netname);
+my($repeat,$type,$body,$rule_ref);
+
+my($text) = "";
+
+while(@netname) {
+$netname = shift @netname;
+
+if($class{$netname}) {
+ $reachable_t{$netname}++;
+ $reachable{$netname}++;
+ next;
+}
+
+unless(defined $rules{$netname}) {
+ warn "Rule for $netname not defined in build_reachable_dictionary\n" if $warn;
+ return "";
+}
+
+foreach $rule_ref (@{$rules{$netname}}) {
+ foreach(@{$rule_ref}) {
+ ($repeat,$type,$body) = split(/,/,$_);
+ if($type eq "T") {
+ $reachable_t{$body}++;
+ }
+ else {
+ unless(exists $reachable{$body}) {
+ push(@netname,$body);
+ $reachable{$body}++;
+ }
+ }
+ }
+}
+}
+}
+
+
+sub check_grammar {
+
+while(($definition) = each %public) {
+warn "Net rule $definition not defined\n" unless defined $rules{$definition};
+}
+while(($definition,$count) = each %definition) {
+for($i=1;$i<=$count;$i++) {
+ warn "$definition:$count not defined" unless defined $rules{"$definition:$count"};
+}
+}
+}
+
+
+sub get_prob {
+
+if(s/[\#;]([\s\S]+)//) {
+my $comment=$1;
+
+if($comment=~/%%CLASS%%/) {
+ $class=1;
+} else {
+ $class=0;
+}
+
+return $1 if $comment=~/%%(\S+)%%/;
+}
+$class=0;
+return 0;
+}
+
+
+sub load_grammar_file {
+local($GRAMMAR_FILE) = @_;
+local($concept,@rules,@rules_prob);
+local(*GRAMMAR);
+
+# print STDERR "Reading $GRAMMAR_FILE of $NAME\n";
+
+$concept = "";
+@rules = ();
+$conceptclass=0;
+
+open(GRAMMAR,$GRAMMAR_FILE);
+while(<GRAMMAR>) {
+if(/^\#include\s+(\S+)/) {
+ flush_concept();
+ $conceptclass=0;
+ load_grammar_file($1,$NAME);
+}
+else {
+ # Get rid of comments and empty lines
+
+ $prob=get_prob(); next unless /\S/;
+
+ # Classify line
+ if(/^(\S+)/) { # It's a concept
+ $new_concept=$1;
+ flush_concept();
+ $conceptclass=$class;
+ $concept=$new_concept;
+ }
+ else {
+ die "No concept defined" unless $concept=~/\S/;
+ die "No begin-parenthesis in $concept rules $_\n" unless s/^\s+\(\s*//;
+ die "No end-parenthesis in $concept rules $_\n" unless s/\s*\)\s*$//;
+
+ push(@rules,$_);
+ push(@rules_prob,$prob);
+ }
+}
+}
+flush_concept();
+close GRAMMAR;
+}
+
+
+sub select_random_array {
+local($array_ref) = $_[0];
+
+return @$array_ref[int(rand(scalar(@$array_ref)))];
+}
+
+
+sub flush_concept {
+
+# Determine the symbol-table-entry for the concept defined
+# If it is a top-level entry, it is globally visible.
+# Otherwise, it is only local and may be multiply defined and
+# overwritten
+
+return unless $concept=~/\S/;
+
+if($public{$concept}) {
+$real_name=$concept;
+}
+else {
+$definition{$NAME,$concept};
+$real_name="$NAME:$concept:".($definition{"$NAME:$concept"}+1);
+}
+
+die "Multiple definition of $real_name\n" if defined $rules{$real_name};
+
+
+my @new_rules = ();
+
+foreach(@rules) {
+my @rule = ();
+foreach $body (split) {
+ $repeat = "1";
+ $repeat = $1 if $body=~s/^([\+\*]+)//;
+ $body =~ s/^\s+//; $body =~ s/\s+$//;
+ if( ($body=~/^[^A-Z_\-]/) && !($body =~ /^\[.*\]$/)) {
+ $type="T";
+ $body=~s/\+/'/g; # '
+ $body=~tr/a-z/A-Z/ if $capital;
+ $body = "+$body+" if $body =~ s/^&//;
+ $dictionary{$body}++;
+ }
+ else {
+ $type="N";
+
+ unless($public{$body}) {
+ $body="$NAME:$body:".($definition{"$NAME:$body"}+1);
+ }
+ }
+ push(@rule,"$repeat,$type,$body");
+}
+
+push(@new_rules,\@rule);
+}
+
+$rules{$real_name}=\@new_rules;
+
+my @new_rules_prob = @rules_prob;
+normalize_random_entry(\@new_rules_prob);
+$rules_prob{$real_name}=\@new_rules_prob;
+
+
+unless($public{$concept}) {
+$definition{"$NAME:$concept"}++;
+}
+
+@rules = ();
+@rules_prob = ();
+$class{$real_name}++ if $conceptclass && !$noclasses;
+
+$concept="";
+
+}
+
+sub lookup_pronounciation {
+
+local($word) = shift @_;
+
+local($error)="";
+
+my(@pron);
+my($pronword)=join("\266",@_);
+
+if(defined $vocab{join("\266",@_)} && @{$vocab{join("\266",@_)}}>0) {
+@pron = allchunk(join("\266",@_));
+}
+elsif(($pronword=~s/\266S$/\'S/) && (defined $vocab{$pronword}) && @{$vocab{$pronword}}>0) {
+@pron = allchunk($pronword);
+}
+else {
+@pron = allchunk(@_);
+}
+return $error if $error=~/\S/; # Error case
+my($ret) = "{$word(0)} {$pron[0]}\n";
+for($i=1;$i<=$#pron;$i++) {
+$ret .= "{$word(".($i+1).")} {$pron[$i]}\n";
+}
+return $ret;
+}
+
+sub allchunk {
+my $prefix = shift @_;
+my $dictref = $vocab{$prefix};
+my @ret=();
+
+unless((defined $dictref) && (@{$dictref}>0)) {
+$error .= "ERROR: Not all subwords ($prefix) known for $word\n";
+allchunk(@_) unless @_==0;
+
+return ();
+}
+
+return @{$dictref} if @_==0;
+
+my($suff,$pref);
+
+foreach $suff (allchunk(@_)) {
+foreach $pref (@{$dictref}) {
+ push(@ret,"$pref $suff");
+}
+}
+return @ret;
+}
+
+
+
+
+
+sub help {
+print <<EOT;
+generate_random_samples -h -n N -break prob -show_break -d dir -forms ext -noclass -warn -capital
+ -clausi -repeat prob -leave_out prob -vocabulary -class -iclass
+ -dictionary -modifyLM [LanguageModel]
+
+
+Generates random sentences from a PHOENIX grammar that is in the
+current directory as if it were a stochastic context-free grammar.
+
+
+-h -- print this help
+-n N -- print N random sentences ( default $n_default )
+-break prob -- probability of a sentence break ( default $break_prob_default )
+-show_break -- visualizes the occurrence of breaks for debugging
+-d dir -- change to directory dir as the PHOENIX grammar dir
+-forms ext -- use forms.ext instead of forms to generate from
+-noclass -- don\'t use classes when generating text, which is more human-readable
+-warn -- only test the grammar
+-capital -- capitalize all words
+-clausi -- use special output format
+-repeat prob -- repetition probability ( default $repeat_prob_default )
+-leave_out prob -- leaving out an optional item probability ( default $leave_out_prob_default )
+-vocabulary -- print the vocabulary used in the grammar usable for the LM
+-class -- print the classes used
+-iclass -- print the inverse classes used
+-dictionary -- only print all the words in the grammar for pronunciation
+ dictionary construction
+-modifyLM -- takes an LM from stdin or from files after the options
+ (possibly with old classes, these will be eliminated)
+ and adds classes according to the specification by the grammar
+-makedict -- takes a dictionary from stdin or from files after the options
+ and generates a pronunciation dictionary
+-unkdict -- takes a dictionary from stdin or from files after the options
+ and generates a list of unknown words related to that dictionary
+ The assumption is that all word strings are composed of subwords
+
+Written by Klaus Ries <ries\@cs.cmu.edu> <kries\@ira.uka.de>
+Copyright (C) 1996,1997 Interactive System Labs and Klaus Ries
+
+
+Detailed description:
+
+All productions are assumed to be equally likely, unless one specifies
+probabilities after some of them (the remaining probability mass is
+distributed evenly over the other productions) in the form %%0.5%%,
+which is interpreted as a comment by PHOENIX. For example, if a net has
+four productions and one of them is marked %%0.4%%, the remaining 0.6
+is split evenly over the other three, giving 0.2 each.
+A network can have the specification %%CLASS%% in a comment like in
+
+[townname_l] # %%CLASS%%
+( split )
+( bihac )
+(banja-luka)
+(sarajevo) # %%0.9%% since we always talk about it
+(tuzla)
+(drvar)
+
+Elements within one class are assumed to be equiprobable.
+The specification of \"+\" is ignored and recursions are broken after
+the second visit of a net.
+
+The probability for leaving out an optional event is $leave_out_prob,
+the probability for repeating a repeatable event is $repeat_prob.
+One can either change these probabilities in the code or rewrite the
+grammar such that one can specify them with corresponding rules.
+
+The generation of noises is possible if a file "noises.gra" is placed in the
+same directory as the other grammar files; not all slots have to be present.
+The file "noises.gra" should look like
+______________________________________
+
+
+# &garbage is expanded to +garbage+ by definition, since "+" at
+# the beginning of a token always means repetition
+BeginNoise # called before every utterance
+( ) # %%0.6%%
+( &garbage ) # %%0.25%%
+( &garbage &garbage )
+( &garbage &garbage &garbage )
+
+EndNoise # called after every utterance
+( ) # %%0.8%%
+( &garbage ) # %%0.15%%
+( &garbage &garbage )
+( &garbage &garbage &garbage )
+
+SegmentNoise # called between slots in the forms-file
+( ) # %%0.97%%
+( &garbage ) # %%0.015%%
+( &garbage &garbage ) # %%0.01%%
+( &garbage &garbage &garbage )
+
+BreakNoise # called after a break occurred instead of SegmentNoise
+( ) # %%0.3%%
+( &garbage ) # %%0.6%%
+( &garbage &garbage )
+
+RandomNoise # Called after every word
+( ) # %%0.99%%
+( &garbage ) # %%0.008%%
+( &garbage &garbage )
+______________________________________
+
+A break is a reset of a slot that is entered in the forms file
+and simulates restarts and corrections.
+It may occur at every word, and a probability for this event can be specified.
+Since it is usually marked by discourse markers, a separate noise slot is defined for
+this event.
+
+
+EOT
+exit(0);
+}
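+
+# Example usage (a sketch; the flags are documented in help() above and
+# the output path corpus.txt is just a placeholder):
+#
+#   generate_random_samples.pl -n 30000 -d ../Grammar > corpus.txt
+#
+# makelm.pl (added below in Tools/) drives this script the same way when
+# it builds its language-model training corpus.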
Added: Tools/makelm.pl
===================================================================
--- Tools/makelm.pl (rev 0)
+++ Tools/makelm.pl 2007-04-05 14:26:18 UTC (rev 15)
@@ -0,0 +1,277 @@
+#!/usr/bin/perl
+
+use LWP::UserAgent;
+use HTTP::Request::Common;
+use File::Spec;
+use File::Copy;
+use File::stat;
+$ENV{'LC_COLLATE'} = 'C';
+$ENV{'LC_ALL'} = 'C';
+use locale;
+
+my $project = 'zap2';
+my $SAMPSIZE = scalar @ARGV? shift: 30000;
+
+$GRAMMARDIR = File::Spec->catdir(File::Spec->updir(), 'Grammar');
+$GRAMMARFILE = File::Spec->catfile($GRAMMARDIR, $project.'.gra');
+$FLATGRAMMARFILE = File::Spec->catfile($GRAMMARDIR, $project.'flat.gra');
+$CORPUS = 'tempfile';
+$BASEDIC = File::Spec->catfile($GRAMMARDIR, 'base.dic');
+$VOCAB = 'vocab';
+$CCS = 'temp.ccs';
+$PHOENIX = File::Spec->catfile($GRAMMARDIR, 'compile.exe');
+$TEXT2IDNGRAM = File::Spec->catfile('CMU-Cam_Toolkit_v2', 'bin', 'text2idngram');
+$IDNGRAM2LM = File::Spec->catfile('CMU-Cam_Toolkit_v2', 'bin', 'idngram2lm');
+$RANDOMSAMPS = 'generate_random_samples.pl';
+$IDNGRAM = $project.'.idngram';
+$LM = File::Spec->catfile(File::Spec->updir(), 'DecoderConfig', 'LanguageModel',
+ $project.'LM.arpa');
+$DICT = File::Spec->catfile(File::Spec->updir(), 'DecoderConfig', 'Dictionary',
+ $project.'.dict');
+$REDUCED_DICT = $DICT.'.reduced_phoneset';
+
+#get language model
+#copy('../Grammar/ZapTask.gra', '../Grammar/Zap.gra');
+#copy('../Grammar/ZapTask.forms', '../Grammar/forms');
+#chdir '../Grammar';
+#system('mk_nets2.pl Zap.gra');
+#chdir '../MakeLM';
+#system("$PHOENIX -g $GRAMMARDIR -f ..\\Grammar\\Zap");
+&say('compile', 'compiling grammar...');
+chdir($GRAMMARDIR);
+system('cmp.bat');
+&say('compile', 'compiling language model...');
+chdir('../MakeLM');
+&say('compile', 'generating corpus...');
+&getcorpus($CORPUS);
+&say('compile', 'getting vocabulary...');
+&getvocab($BASEDIC, $VOCAB, $CCS);
+&say('compile', 'computing ngrams...');
+my $cmd = "$TEXT2IDNGRAM -vocab $VOCAB -temp . -write_ascii < $CORPUS > $IDNGRAM";
+&say('compile', $cmd);
+die "text2idngram failed$/" if system($cmd);
+&say('compile', 'computing language model...');
+$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -good_turing -disc_ranges 0 0 0 -ascii_input";
+#$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -absolute -ascii_input";
+#$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -ascii_input";
+&say('compile', "$cmd$/");
+die "idngram2lm failed$/" if system($cmd);
+
+#get dictionary
+&say('compile', 'compiling dictionary...');
+&getdict($DICT, $REDUCED_DICT, $VOCAB);
+&say('compile', "done\n");
+
+sub getcorpus {
+ my $corpusfile = shift;
+ #flatten grammar
+ open(ZAPGRA, $GRAMMARFILE) || die "Can't open grammar file$/";
+ open(ZAPFLAT, ">$FLATGRAMMARFILE") || die "Can't open grammar flat file$/";
+ print ZAPFLAT &flat(<ZAPGRA>);
+ close ZAPGRA;
+ close ZAPFLAT;
+
+ open(RANDOM, "$RANDOMSAMPS -n $SAMPSIZE -d $GRAMMARDIR|") ||
+ die "Cannot execute $RANDOMSAMPS$/";
+ open(CORPUS, ">$corpusfile") || die "Can't open $corpusfile$/";
+ binmode CORPUS;
+ while (<RANDOM>) {
+ $_ = uc($_);
+ chomp;
+ s/<\/?S> //g;
+ print CORPUS "<s> $_ </s>\n";
+ }
+ close CORPUS;
+}
+
+sub getvocab {
+ my $basefile = shift;
+ my $vocab = shift;
+ my $ccs = shift;
+ open(VOCAB, ">$vocab") || die "Can't open $vocab$/";
+ binmode VOCAB;
+ open(BASE, "<$basefile") || die "Can't open $basefile$/";
+ my @base = map { /(.*) .*/? uc("$1\n"): () } <BASE>;
+ print VOCAB grep !/<\/?S>/, sort(@base);
+ print VOCAB "<s>\n";
+ print VOCAB "</s>\n";
+ close VOCAB;
+ open(CCS, ">$ccs") || die "Can't open $ccs$/";
+ binmode CCS;
+ print CCS "<s>\n";
+ close CCS;
+}
+
+sub getdict {
+ my $dict = shift;
+ my $reduced = shift;
+ my $vocab = shift;
+ my @long;
+ open(DICT, ">$dict") || die "Can't open $dict$/";
+ binmode DICT;
+ open(REDUCED, ">$reduced") || die "Can't open $reduced$/";
+ binmode REDUCED;
+ open(VOCAB, "<$vocab") || die "Can't open $vocab$/";
+ for (<VOCAB>) {
+ s/=/_/g;
+ push @long, $_;
+ }
+ foreach (&shorten(@long)) {
+ s/_/=/g;
+ print DICT "$_\n";
+ s/\bIX\b/AX/g;
+ print REDUCED "$_\n";
+ }
+ close DICT;
+ close REDUCED;
+}
+
+sub shorten {
+ open (OUTFILE, ">outfile.tmp");
+ open (LONGFILE, ">longfile.tmp");
+
+ my $counter = 0;
+ my $not_done = 1;
+ my @listlongwords;
+
+ foreach (@_) {
+ my @blah = split / /, $_;
+ for my $i (@blah) {
+ my $len = length $i;
+ if ($len >= 35) {
+ for my $m (@listlongwords) {
+ if ($m =~ /$i/) {
+ $not_done = 0;
+ }
+ }
+
+ if ($not_done) {
+ push(@listlongwords, $i);
+ }
+
+ $not_done = 1;
+ my @words = split /_/, $i;
+ for my $j (@words) {
+ print LONGFILE "$j ";
+ }
+ print LONGFILE "\n";
+ } else {
+ print OUTFILE "$i ";
+ }
+ }
+ }
+
+ close(OUTFILE);
+ close(LONGFILE);
+
+ my @diclines =
+ grep !(/CONTENT-TYPE/ || /TEXT\/PLAIN/), &getdic('outfile.tmp');
+ unlink 'outfile.tmp';
+ my @longlines = stat('longfile.tmp')->size?
+ grep !(/CONTENT-TYPE/ || /TEXT\/PLAIN/), &getdic('longfile.tmp'): ();
+ unlink 'longfile.tmp';
+
+ my $not_used = 1;
+
+ for my $i (@listlongwords) {
+ chomp $i;
+ my $lotsaphones = "$i ";
+ my @word_parts = split /_/, $i;
+ for my $word (@word_parts) {
+ $word =~ s/(\W)/\\$1/;
+ #warn "the word to look up is $word\n";
+ for my $j (@longlines) {
+ if ($not_used) {
+ if ($j =~ /^$word\t/) {
+ my @pronounciation = split /\t/, $j;
+ chomp $pronounciation[1];
+ $lotsaphones .= "$pronounciation[1] ";
+ $not_used = 0;
+ }
+ }
+ }
+ $not_used = 1;
+ }
+ $lotsaphones .= "\n";
+ push @diclines, $lotsaphones;
+ }
+ return @diclines;
+}
+
+sub getdic {
+ if ($#_ == -1) {
+ die "Need the corpus location as an argument\n";
+ }
+
+ my $ua = new LWP::UserAgent;
+ my $res = $ua->request(POST 'http://fife.speech.cs.cmu.edu/cgi-bin/tools/lmtool.2.pl',
+ Content_Type => 'form-data',
+ Content => [formtype => 'simple',
+ corpus => [$_[0]],
+ #handdict => undef,
+ #extrawords => undef,
+ #phoneset => '40',
+ #bracket => 'Yes',
+ #model => 'Bigram',
+ #class => 'nil',
+ #discount => '0.5',
+ submit => 'COMPILE KNOWLEDGE BASE']);
+
+ my $result;
+ if ($res->is_success) {
+ $result = $res->content;
+ } else {
+ die "Couldn't execute the perl script, probably error in the form$/";
+ }
+
+ if ($result =~ /\!-- DIC.*ct\/\/(.*)\">/) {
+
+ my $blah = "http://fife.speech.cs.cmu.edu/tools/product//$1";
+ $res = $ua->request(GET $blah);
+
+ if ($res->is_success) {
+ return split(/\n/, $res->content);
+ } else {
+ die "Can't find dictionary file$/";
+ }
+ } else {
+ die "Couldn't parse the result: $result$/";
+ }
+}
+
+sub say {
+ print shift, ": ", shift, $/;
+}
+
+sub flat {
+ my @unflat = @_;
+ my @result;
+ for (@unflat) {
+ if (!s/^\s*\((.*)\)/$1/) {
+ push @result, $_;
+ } else {
+ my @stack;
+ my %flathash;
+ push(@stack, [split]);
+ while (my $buffref = shift @stack) {
+ my $i = 0;
+ my @buff = @$buffref;
+ my $flat;
+ for (@buff) {
+ if (/^\*(.*)/) {
+ $flat .= "$1 ";
+ push(@stack, [ @buff[0..$i-1], @buff[$i+1..$#buff] ]);
+ } else {
+ $flat .= "$_ ";
+ }
+ $i++;
+ }
+ $flathash{$flat} = 1;
+ }
+ foreach (keys %flathash) {
+ push @result, "\t( $_)\n";
+ }
+ }
+ }
+ @result;
+}
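+
+# Example invocation (a sketch; it assumes a working directory that
+# satisfies the relative paths used above, i.e. ../Grammar,
+# ../DecoderConfig, ../MakeLM and CMU-Cam_Toolkit_v2/bin all exist):
+#
+#   perl makelm.pl 50000
+#
+# The single optional argument is the number of random sentences to
+# sample for the corpus; it defaults to 30000.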