[RavenclawDev 241] [18] Tools: Moved remotely
tk@edam.speech.cs.cmu.edu
tk at edam.speech.cs.cmu.edu
Thu Apr 5 10:28:59 EDT 2007
An HTML attachment was scrubbed...
URL: http://mailman.srv.cs.cmu.edu/pipermail/ravenclaw-developers/attachments/20070405/a1796fb7/attachment-0001.html
-------------- next part --------------
Copied: Tools/MakeLM/generate_random_samples.pl (from rev 17, Tools/generate_random_samples.pl)
===================================================================
--- Tools/MakeLM/generate_random_samples.pl (rev 0)
+++ Tools/MakeLM/generate_random_samples.pl 2007-04-05 14:28:59 UTC (rev 18)
@@ -0,0 +1,1015 @@
+#!/usr/bin/perl
+# This line tells emacs, that this is a -*-Perl-*- program
+
+$licence = <<LICENCE;
+
+Copyright (C) 1996 Interactive System Labs and Klaus Ries
+
+LICENCE
+
+$n_default=10;
+$break_prob_default = 0;
+$leave_out_prob_default = 0.1;
+$repeat_prob_default = 0.25;
+
+$n=$n_default;
+$break_prob = $break_prob_default;
+$leave_out_prob = $leave_out_prob_default;
+$repeat_prob = $repeat_prob_default;
+
+
+while($ARGV[0] =~ /^-.*$/) { # PROCESS FLAGS
+$_ = shift;
+if($_ eq "-h") {
+$help++;
+next;
+}
+if($_ eq "-n") {
+$n=shift;
+next;
+}
+if($_ eq "-leave_out") {
+$leave_out_prob=shift;
+next;
+}
+if($_ eq "-repeat") {
+$repeat_prob=shift;
+next;
+}
+if($_ eq "-show_break") {
+$show_break++;
+next;
+}
+if($_ eq "-d") {
+$dir=shift;
+next;
+}
+if($_ eq "-forms") {
+$forms=shift;
+next;
+}
+if($_ eq "-warn") {
+$warn++;
+next;
+}
+if($_ eq "-capital") {
+$capital++;
+next;
+}
+if($_ eq "-dictionary") {
+$dictionary++;
+next;
+}
+if($_ eq "-makedict") {
+$makedict++;
+next;
+}
+if($_ eq "-unkdict") {
+$unkdict++;
+next;
+}
+if($_ eq "-vocabulary") {
+$vocabulary++;
+next;
+}
+if($_ eq "-class") {
+$classes++;
+next;
+}
+if($_ eq "-noclass") {
+$noclasses++;
+next;
+}
+if($_ eq "-novariants") {
+$novariants++;
+next;
+}
+if($_ eq "-iclass") {
+$inverse_map++;
+next;
+}
+if($_ eq "-modifyLM") {
+$modify++;
+next;
+}
+if($_ eq "-clausi") {
+$clausi++;
+next;
+}
+die "Unrecognized command line option $_\nUse generate_random_samples -h for help\n";
+}
+
+help() if($help);
+
+if(defined $dir) {
+$currdir=$ENV{PWD};
+die "Cannot change to $dir\n" unless chdir $dir;
+warn "Changed to $dir\n";
+}
+
+
+# First load the forms of that grammer:
+
+$FORMS = "forms";
+$FORMS .= ".$forms" if defined $forms;
+
+open FORMS;
+
+while(<FORMS>) {
+
+ $random_perm++ if /%%randomperm%%/;
+
+ $prob=get_prob();
+
+ if(s/^\s*FUNCTION:\s*(\S+)//) {
+ my @F = ();
+ my @G = ();
+ $name=$1;
+ print STDERR "The name of the function is >>$name<<\n";
+ undef $nets;
+ push(@top_level,$name);
+ push(@top_level_prob,$prob);
+ push(@top_level_slots,\@F);
+ push(@top_level_probs,\@G);
+ push(@top_level_perm,$random_perm);
+ undef $random_perm;
+ $top_level{$name}++;
+ }
+ next unless defined $name;
+
+ if(/^\s*NETS:/) {
+ $nets++; next;
+ }
+ elsif($nets) {
+ 1;
+ }
+ else {
+ next;
+ }
+
+ if(s/^(\t|\s{2,})(\S+)/$2/) {
+ $entry=$2;
+ if(defined $top_level{$name,$entry}) {
+ warn "Multiple entry for $entry in function $name\n" ;
+ next;
+ }
+ push(@{$top_level_slots[$#top_level_slots]},$entry);
+ push(@{$top_level_probs[$#top_level_probs]},$prob);
+ $top_level{$name,$entry}++;
+ $public{$entry}++;
+ }
+}
+close FORMS;
+
+#warn join($/, @{$top_level_slots[$#top_level_slots]}), $/;
+
+normalize_random_entry(\@top_level_prob);
+
+foreach(@top_level_prob) {
+normalize_random_entry($_);
+}
+
+
+die "No top-level slots" unless () ne keys %top_level;
+
+# Now load the nets themselves
+
+$NETS = "nets";
+
+open NETS;
+while(<NETS>) {
+ next unless /\S/;
+ chomp;
+ s/\s+$//;
+
+ if(defined $nets{$_} || defined $nets{"[$_]"}) {
+ warn "Multiple entry for >>$_<< in nets\n" ;
+ next;
+ }
+ $nets{$_}++;
+
+#if(/[^A-Za-z]/) {
+ $_="[$_]";
+#}
+ $public{$_}++;
+}
+close NETS;
+
+$nets{"noises"}++ if -r "noises.gra";
+
+ at noise=("SegmentNoise","BeginNoise","EndNoise","BreakNoise","RandomNoise");
+
+if($nets{"noises"}) {
+ foreach(@noise) {
+ $public{$_}++;
+ }
+}
+
+# Next thing: Load all the grammars
+
+#foreach(keys %nets) {
+# print STDERR "Loading grammar file $_ ............\r";
+#
+# $NAME=$_;
+# load_grammar_file("$NAME.gra");
+#}
+load_grammar_file('zap2flat.gra');
+
+if($nets{"noises"}) {
+ foreach(@noise) {
+ undef $public{$_} unless defined $rules{$_};
+ }
+}
+
+chdir $currdir if defined $currdir;
+print STDERR "Finished loading grammar file \n";
+
+
+# Next thing: Generate sentences
+
+srand;
+
+check_grammar() if $warn;
+
+
+if($dictionary || $vocabulary || $makedict || $unkdict) { # Print all the defined dictionary/vocabulary entries
+foreach $slot_ref (@top_level_slots) {
+ foreach(@{$slot_ref}) {
+ $slots{$_}++;
+ }
+}
+ at netname = keys %slots;
+foreach(@noise) {
+push(@netname,$_) if defined $public{$_};
+}
+
+build_reachable_dictionary();
+
+
+open(OUT,"| sort -u | perl -pe 's/\\(0\\)\\}/\\}/;'");
+if($vocabulary) {
+foreach(keys %reachable_t) {
+ print OUT "$_\n";
+}
+print OUT "<s>\n</s>\n";
+}
+elsif($dictionary) {
+foreach(keys %reachable_t) {
+ if($class{$_}) {
+ $_=join("\n",print_net_all($_));
+ }
+ print OUT "$_\n";
+}
+}
+else { # Makedict !!
+ # or unkdict !!
+# Calculate all words that we need from the dictionary
+# Assign an empty list to all of them to get
+# the effective entries
+
+foreach(keys %reachable_t) {
+ next if $class{$_};
+ if(/\266/) { # In case we have entered that phrase to
+ # the dictionary
+ next if defined $vocab{$_};
+ my @F=();
+ $vocab{$_}=\@F;
+ }
+ if(/\266S$/) {
+ my $word2=$_;
+ $word2=~s/\266S$/\'S/;
+ next if defined $vocab{$word2};
+ my @F=();
+ $vocab{$word2}=\@F;
+ }
+ foreach(split(/\266/,$_)) {
+ next if defined $vocab{$_};
+ my @F=();
+ $vocab{$_}=\@F;
+ }
+}
+foreach(keys %class) {
+ next unless $reachable{$_};
+ foreach(print_net_all($_)) {
+ ($word,$class)=split(/:/,$_,2);
+ if($word=~/\266/) { # In case we have entered that phrase to
+ # the dictionary
+ next if defined $vocab{$word};
+ my @F=();
+ $vocab{$word}=\@F;
+ }
+ if($word=~/\266S$/) {
+ my $word2=$word;
+ $word2=~s/\266S$/\'S/;
+ next if defined $vocab{$word2};
+ my @F=();
+ $vocab{$word2}=\@F;
+ }
+ foreach(split(/\266/,$word)) {
+ next if defined $vocab{$_};
+ my @F=();
+ $vocab{$_}=\@F;
+ }
+ }
+}
+
+# Now selectively read the real dictionary from stdin/commandline
+# Also read pronounciation variants
+
+while(<>) {
+ s/^\{?\s*([^\s\}\(]+)(\([^\)]+\))?\s*\}?\s+//;
+ $word=$1;
+ $ref = $vocab{$word};
+ s/^\s*\{?\s*//;
+ s/\s*\}?\s*$//;
+ foreach $phonem (split) {
+ next unless $phonem=~/\S/ && $phonem=~/^[a-z]/i;
+ $phonem{$phonem}++;
+ }
+ next unless defined $ref;
+ next if $novariants && @{$ref}>0;
+ $pron=$_;
+ push(@{$ref},$pron) unless grep($_ eq $pron,@{$ref})>0;
+}
+
+
+if($unkdict) {
+ while(($word,$pron) = each %vocab) {
+ next if $word=~/\266/;
+ print OUT "$word\n" unless @{$pron} ;
+ }
+}
+else {
+ @phonem = sort { $phonem{$a} <=> $phonem{$b} }
+ grep($phonem{$_}>2 && $_ ne "SIL",keys %phonem);
+
+ # Pick a very seldom phonem for the words that are artifically in the dictionary
+ # to keep the vocabulary module happy
+ # These are specifically the class symbols
+
+ $badphonem=$phonem[0];
+ $badphonem.=" $badphonem";$badphonem.=" $badphonem";
+ $badphonem.=" $badphonem";$badphonem.=" $badphonem";
+
+ foreach(keys %reachable_t) {
+ next if $class{$_};
+ print OUT lookup_pronounciation($_,split(/\266/))
+ }
+ foreach(keys %class) {
+ next unless $reachable{$_};
+ print OUT "\{$_\} \{$badphonem\}\n";
+ foreach $symbol (print_net_all($_)) {
+ ($word,$class)=split(/:/,$symbol,2);
+ print OUT lookup_pronounciation($symbol,split(/\266/,$word));
+ }
+ }
+ print OUT "{\$} {SIL}\n{(} {SIL}\n{)} {SIL}\n";
+}
+close OUT;
+}
+}
+elsif($classes) {
+foreach $class (keys %class) {
+$_=join(" ",$class,print_net_all($class));
+print "$_\n";
+}
+}
+elsif($inverse_map) {
+foreach $class (keys %class) {
+foreach $classword (print_net_all($class)) {
+ ($word)=split(/:/,$classword);
+ $imap=$inversemap{$word};
+ if(defined $imap) {
+ push(@{$imap},$classword);
+ }
+ else {
+ my(@F)=($classword);
+ $inversemap{$word}=\@F;
+ }
+}
+}
+while(($word,$imap)=each %inversemap) {
+print join("\#",@{$imap})." $word\n";
+}
+}
+elsif($modify) {
+
+while(<>) {
+if(/^\\subsmodel:/) {
+ $skipsmodel++;
+ last;
+}
+print;
+}
+if($skipsmodel) {
+while(<>) {
+ if(/^\\end\\/) {
+ last;
+ }
+}
+print while <>;
+}
+
+print "\\subsmodel:\n";
+foreach $class (sort keys %class) {
+print "-99.9 $class $class\n";
+ at classmembers = print_net_all($class);
+$classpenalty = -log(scalar(@classmembers))/log(10);
+foreach $classmember (sort @classmembers) {
+ $_="$classpenalty $classmember $class\n";
+ print;
+}
+}
+print "\\end\\\n";
+}
+else { # Print random sentences
+foreach($i=0;$i<$n;$i++) { # Select a top-level frame and generate a sentence for it
+print_random_sentence(select_random_entry(\@top_level_prob,"FORM"));
+}
+}
+
+
+sub print_random_sentence {
+
+ my($select) = @_;
+ my($select_local,$parse,$parse_local);
+
+ #warn "print_random_sentence: top_level[select] -> $top_level[$select] >> $/";
+
+ if($top_level_perm[$select]) {
+ begin_noise();
+ $break=1;
+ while($break) {
+ $break=0;
+ print_net_random(${$top_level_slots[$select]}[$select_local]);
+ if($break) {
+ break_noise();
+ }
+ else {
+ segment_noise();
+ }
+ }
+ end_noise();
+ }
+ else {
+ begin_noise();
+ #foreach $select_local (@{$top_level_slots[$select]}) {
+ # tk - fix
+ # I don't understand, but without this fix every line would be a sucession of each net.
+ $select_local = $top_level_slots[$select][int(rand(scalar(@{$top_level_slots[$select]})))];
+ $break=1;
+ while($break) {
+ $break=0;
+ print_net_random($select_local);
+ if($break) {
+ break_noise();
+ }
+ else {
+ segment_noise();
+ }
+ }
+ #}
+ end_noise();
+ }
+ print "\n";
+}
+
+sub begin_noise {
+print_net_random("BeginNoise") if defined $public{"BeginNoise"};
+}
+
+
+sub end_noise {
+print_net_random("EndNoise") if defined $public{"EndNoise"};
+}
+
+
+sub segment_noise {
+print_net_random("SegmentNoise") if defined $public{"SegmentNoise"};
+}
+
+sub break_noise {
+print_net_random("BreakNoise") if defined $public{"BreakNoise"};
+}
+
+sub random_noise {
+print_net_random("RandomNoise") if defined $public{"RandomNoise"};
+}
+
+sub random_break {
+if(1-$break_prob<rand()) {
+$break++;
+print " **<** " if $show_break;
+}
+}
+
+
+sub select_random_entry {
+ my($array,$name) = @_;
+ my $prob=0;
+ my $random = rand();
+ my $return=0;
+ my $i=0;
+
+ foreach(@$array) {
+ $prob+=$_;
+ return $i if $random<$prob;
+ $i++;
+ }
+ die "$name not a probability distribution: ".join(" ",@{$array})."\n";
+}
+
+sub normalize_random_entry {
+local($array) = @_;
+my $i;
+my $prob = 0;
+my $zero = 0;
+
+if(@{$array}==1) {
+$$array[0]=1;
+return;
+}
+
+for( $i=0; $i<=$#$array; $i++ ) {
+$prob+=$$array[$i];
+if($$array[$i] == 0) {
+ $zero++;
+}
+}
+die "Not a probability distribution $prob>1\n" if $prob>1;
+
+if($zero>0) {
+$prob = (1-$prob)/$zero;
+for( $i=0; $i<=$#$array; $i++ ) {
+ $$array[$i] = $prob if $$array[$i]==0;
+}
+}
+}
+
+sub print_net_random {
+
+ my($netname) = @_;
+ my($repeat,$type,$body);
+
+ my($text) = "";
+
+ unless(defined $rules{$netname}) {
+ warn "Rule for $netname not defined\n" if $warn;
+ return "";
+ }
+
+ my($rule_ref) = ${$rules{$netname}}[select_random_entry($rules_prob{$netname},$netname)];
+
+ foreach(@$rule_ref) {
+ ($repeat,$type,$body) = split(/,/,$_);
+
+ while(1) {
+ if($repeat=~s/\*//g) {
+ last if rand()<$leave_out_prob;
+ $repeat="1" unless $repeat=~/\S/;
+ }
+ if($type eq "T" || $class{$body}) {
+ if($clausi) {
+ print "$body\n"; random_noise();
+ }
+ else {
+ print "$body "; random_noise();
+ }
+ }
+ else {
+ print_net_random($body);
+ random_break();
+ }
+ return if $break;
+ last if $repeat eq "1";
+ last unless rand()<$repeat_prob;
+ }
+ }
+}
+
+
+
+sub print_net_all {
+local($global_netname) = @_;
+local(%already_visited);
+
+my(@return) = print_net_all2($global_netname);
+my($ret_length)=scalar(@return);
+ at return=grep(/\S/, at return);
+warn "$global_netname could expand to empty, ignored\n"
+unless @return==$ret_length;
+ at return=grep(($_.=":$global_netname") || 1, at return);
+return @return;
+}
+
+
+sub print_net_all2 {
+
+my($netname) = shift @_;
+my($repeat,$type,$body,$ruleref);
+my(@returnarray)=();
+
+unless(defined $rules{$netname}) {
+warn "Rule for $netname not defined\n" if $warn;
+return ();
+}
+
+return @_ if $already_visited{$netname}>1;
+$already_visited{$netname}++;
+foreach $ruleref (@{$rules{$netname}}) {
+my(@rulereturn)=@_;
+foreach(@{$ruleref}) {
+ ($repeat,$type,$body) = split(/,/,$_);
+
+ my(@localrulereturn)=();
+
+ if($repeat eq "*") {
+ @localrulereturn=@rulereturn;
+ @localrulereturn=("") unless @localrulereturn>0;
+ }
+ warn "Rule for $netname contains repetition $repeat specification -- ignored"
+ unless $repeat eq "1" || $repeat eq "*";
+
+ if($type eq "T") {
+ $body = "+$body+" if $body =~ s/^&//;
+ if(@rulereturn) {
+ for($i=0;$i<=$#rulereturn;$i++) {
+ if($rulereturn[$i]=~/\S/) {
+ $rulereturn[$i].="\266$body";
+ } else {
+ $rulereturn[$i]="$body";
+ }
+ }
+ }
+ else {
+ @rulereturn=($body);
+ }
+ }
+ else {
+ @rulereturn=print_net_all2($body, at rulereturn);
+ }
+ push(@rulereturn, at localrulereturn);
+}
+push(@returnarray, at rulereturn);
+}
+$already_visited{$netname}--;
+return @returnarray;
+}
+
+
+sub build_reachable_dictionary {
+
+my($netname);
+my($repeat,$type,$body,$rule_ref);
+
+my($text) = "";
+
+while(@netname) {
+$netname = shift @netname;
+
+if($class{$netname}) {
+ $reachable_t{$netname}++;
+ $reachable{$netname}++;
+ next;
+}
+
+unless(defined $rules{$netname}) {
+ warn "Rule for $netname not defined in build_reachable_dictionary\n" if $warn;
+ return "";
+}
+
+foreach $rule_ref (@{$rules{$netname}}) {
+ foreach(@{$rule_ref}) {
+ ($repeat,$type,$body) = split(/,/,$_);
+ if($type eq "T") {
+ $reachable_t{$body}++;
+ }
+ else {
+ unless(exists $reachable{$body}) {
+ push(@netname,$body);
+ $reachable{$body}++;
+ }
+ }
+ }
+}
+}
+}
+
+
+sub check_grammar {
+
+while(($definition) = each %public) {
+warn "Net rule $definition not defined\n" unless defined $rules{$definition};
+}
+while(($definition,$count) = each %definition) {
+for($i=1;$i<=$count;$i++) {
+ warn "$definition:$count not defined" unless defined $rules{"$definition:$count"};
+}
+}
+}
+
+
+sub get_prob {
+
+if(s/[\#;]([\s\S]+)//) {
+my $comment=$1;
+
+if($comment=~/%%CLASS%%/) {
+ $class=1;
+} else {
+ $class=0;
+}
+
+return $1 if $comment=~/%%(\S+)%%/;
+}
+$class=0;
+return 0;
+}
+
+
+sub load_grammar_file {
+local($GRAMMAR_FILE) = @_;
+local($concept, at rules, at rules_prob);
+local(*GRAMMAR);
+
+# print STDERR "Reading $GRAMMAR_FILE of $NAME\n";
+
+$concept = "";
+ at rules = ();
+$conceptclass=0;
+
+open(GRAMMAR,$GRAMMAR_FILE);
+while(<GRAMMAR>) {
+if(/^\#include\s+(\S+)/) {
+ flush_concept();
+ $conceptclass=0;
+ load_grammar_file($1,$NAME);
+}
+else {
+ # Get rid of comments and empty lines
+
+ $prob=get_prob(); next unless /\S/;
+
+ # Classify line
+ if(/^(\S+)/) { # It's a concept
+ $new_concept=$1;
+ flush_concept();
+ $conceptclass=$class;
+ $concept=$new_concept;
+ }
+ else {
+ die "No concept defined" unless $concept=~/\S/;
+ die "No begin-parenthesis in $concept rules $_\n" unless s/^\s+\(\s*//;
+ die "No end-parenthesis in $concept rules $_\n" unless s/\s*\)\s*$//;
+
+ push(@rules,$_);
+ push(@rules_prob,$prob);
+ }
+}
+}
+flush_concept();
+close GRAMMAR;
+}
+
+
+sub select_random_array {
+local($array_ref) = $_[0];;
+
+return @$array_ref[int(rand(scalar(@$array_ref)))];
+}
+
+
+sub flush_concept {
+
+# Determine the symbol-table-entry for the concept defined
+# If it is a top-level entry, it is globally visible.
+# Otherwise, it is only local and may be multiple definded and
+# overwritten
+
+return unless $concept=~/\S/;
+
+if($public{$concept}) {
+$real_name=$concept;
+}
+else {
+$definition{$NAME,$concept};
+$real_name="$NAME:$concept:".($definition{"$NAME:$concept"}+1);
+}
+
+die "Multiple definition of $real_name\n" if defined $rules{$real_name};
+
+
+my @new_rules = ();
+
+foreach(@rules) {
+my @rule = ();
+foreach $body (split) {
+ $repeat = "1";
+ $repeat = $1 if $body=~s/^([\+\*]+)//;
+ $body =~ s/^\s+//; $body =~ s/\s+$//;
+ if( ($body=~/^[^A-Z_\-]/) && !($body =~ /^\[.*\]$/)) {
+ $type="T";
+ $body=~s/\+/'/g; # '
+ $body=~tr/a-z/A-Z/ if $capital;
+ $body = "+$body+" if $body =~ s/^&//;
+ $dictionary{$body}++;
+ }
+ else {
+ $type="N";
+
+ unless($public{$body}) {
+ $body="$NAME:$body:".($definition{"$NAME:$body"}+1);
+ }
+ }
+ push(@rule,"$repeat,$type,$body");
+}
+
+push(@new_rules,\@rule);
+}
+
+$rules{$real_name}=\@new_rules;
+
+my @new_rules_prob = @rules_prob;
+normalize_random_entry(\@new_rules_prob);
+$rules_prob{$real_name}=\@new_rules_prob;
+
+
+unless($public{$concept}) {
+$definition{"$NAME:$concept"}++;
+}
+
+ at rules = ();
+ at rules_prob = ();
+$class{$real_name}++ if $conceptclass && !$noclasses;
+
+$concept="";
+
+}
+
+sub lookup_pronounciation {
+
+local($word) = shift @_;
+
+local($error)="";
+
+my(@pron);
+my($pronword)=join("\266", at _);
+
+if(defined $vocab{join("\266", at _)} && @{$vocab{join("\266", at _)}}>0) {
+ at pron = allchunk(join("\266", at _));
+}
+elsif(($pronword=~s/\266S$/\'S/) && (defined $vocab{$pronword}) && @{$vocab{$pronword}}>0) {
+ at pron = allchunk($pronword);
+}
+else {
+ at pron = allchunk(@_);
+}
+return $error if $error=~/\S/; # Error case
+my($ret) = "{$word(0)} {$pron[0]}\n";
+for($i=1;$i<=$#pron;$i++) {
+$ret .= "{$word(".($i+1).")} {$pron[$i]}\n";
+}
+return $ret;
+}
+
+sub allchunk {
+my $prefix = shift @_;
+my $dictref = $vocab{$prefix};
+my @ret=();
+
+unless((defined $dictref) && (@{$dictref}>0)) {
+$error .= "ERROR: Not all subwords ($prefix) known for $word\n";
+allchunk(@_) unless @_==0;
+
+return ();
+}
+
+return @{$dictref} if @_==0;
+
+my($suff,$pref);
+
+foreach $suff (allchunk(@_)) {
+foreach $pref (@{$dictref}) {
+ push(@ret,"$pref $suff");
+}
+}
+return @ret;
+}
+
+
+
+
+
+sub help {
+print <<EOT;
+generate_random_samples -h -n N -break prob -show_break -d dir -forms ext -noclass -warn -capital
+ -clausi -repeat prob -leave_out prob -vocabulary -class -iclass
+ -dictionary -modifyLM [LanguageModel]
+
+
+Generates random sentences from a PHOENIX grammar that is in the
+current directory as if it were a stochastic context free grammmar.
+
+
+-h -- print this help
+-n N -- print N random sentences ( default $n_default )
+-break prob -- probability of a sentence break ( default $break_prob_default )
+-show_break -- visualizes the occurence of breaks for debugging
+-d dir -- change to directory dir as the PHOENIX grammar dir
+-forms ext -- use forms.ext instead of forms to generate from
+-noclass -- don\'t use classes when generating text, which is more human-readable
+-warn -- only test the grammar
+-capital -- capitalize all words
+-clausi -- use special output format
+-repeat prob -- repetition probability ( default $repeat_prob_default )
+-leave_out prob-- leaving out an optional item probability ( default $leave_out_prob_default )
+-vocabulary -- print the vocabulary used in the grammar usable for the LM
+-class -- print the classes used
+-iclass -- print the inverse classes used
+-dictionary -- only print all the words in the grammar for pronounciation
+ dictionary construction
+-modifyLM -- takes an LM from stdin or from files after the options
+ (possibly with old classes, these will be eliminated)
+ and adds classes according to the specification by the grammar
+-makedict -- takes a dictionary from stdin or from files after the options
+ and generates a pronounciation dictionary
+-unkdict -- takes a dictionary from stdin or from files after the options
+ and generates a list of unknown words related to that dictionary
+ The assumtion is, that all word strings are comosed into subwords
+
+Written by Klaus Ries <ries\@cs.cmu.edu> <kries\@ira.uka.de>
+Copyright (C) 1996,1997 Interactive System Labs and Klaus Ries
+
+
+Detailed description:
+
+All productions are assumed to be equally likely, unless one specifies
+probabilities after some of them ( the remain probability mass is
+distributed over the other prodcuctions ) in the form %%0.5%% ,
+which is interpreted as a comment by PHOENIX.
+A network can have the specification %%CLASS%% in a comment like in
+
+[townname_l] # %%CLASS%%
+( split )
+( bihac )
+(banja-luka)
+(sarajevo) # %%0.9%% since we are always talk about it
+(tuzla)
+(drvar)
+
+Elements within one class are assumed to be equiprobable.
+The specification of \"+\" is ignored and recursions are broken after
+the second visit of a net.
+
+The probability for leaving out an optional event is $leave_out_prob,
+the probability for repeating an repeatable event is $repeat_prob.
+One can either change these probabilies in the code or can rewrite the
+grammar such that one can specify them with corresponding rules.
+
+The generation of noises is possible if a file "noises.gra" is placed in
+same directory with the other grammar files, not all slots have to be present.
+The file "noises.gra" should look like
+______________________________________
+
+
+# &garbage is expanded to +garbage+ by definition, since "+" at
+# the beginning of a tokebn always means repetition
+BeginNoise # called before every utterance
+( ) # %%0.6%%
+( &garbage ) # %%0.25%%
+( &garbage &garbage )
+( &garbage &garbage &garbage )
+
+EndNoise # called after every utterance
+( ) # %%0.8%%
+( &garbage ) # %%0.15%%
+( &garbage &garbage )
+( &garbage &garbage &garbage )
+
+SegmentNoise # called between slots in the forms-file
+( ) # %%0.97%%
+( &garbage ) # %%0.015%%
+( &garbage &garbage ) # %%0.01%%
+( &garbage &garbage &garbage )
+
+BreakNoise # called after a break occured instead of SegmentNoise
+( ) # %%0.3%%
+( &garbage ) # %%0.6%%
+( &garbage &garbage )
+
+RandomNoise # Called after every word
+( ) # %%0.99%%
+( &garbage ) # %%0.008%%
+( &garbage &garbage )
+______________________________________
+
+A break is a reset of a slot that is entered in the forms file
+and simulates restarts resp. corrections.
+It may occur at every word and a probabilty for this event can be specified.
+Since it is usually marked by discourse markers a separate noise slot is defined for
+this event.
+
+
+EOT
+exit(0);
+}
Deleted: Tools/generate_random_samples.pl
===================================================================
--- Tools/generate_random_samples.pl 2007-04-05 14:28:48 UTC (rev 17)
+++ Tools/generate_random_samples.pl 2007-04-05 14:28:59 UTC (rev 18)
@@ -1,1015 +0,0 @@
-#!/usr/bin/perl
-# This line tells emacs, that this is a -*-Perl-*- program
-
-$licence = <<LICENCE;
-
-Copyright (C) 1996 Interactive System Labs and Klaus Ries
-
-LICENCE
-
-$n_default=10;
-$break_prob_default = 0;
-$leave_out_prob_default = 0.1;
-$repeat_prob_default = 0.25;
-
-$n=$n_default;
-$break_prob = $break_prob_default;
-$leave_out_prob = $leave_out_prob_default;
-$repeat_prob = $repeat_prob_default;
-
-
-while($ARGV[0] =~ /^-.*$/) { # PROCESS FLAGS
-$_ = shift;
-if($_ eq "-h") {
-$help++;
-next;
-}
-if($_ eq "-n") {
-$n=shift;
-next;
-}
-if($_ eq "-leave_out") {
-$leave_out_prob=shift;
-next;
-}
-if($_ eq "-repeat") {
-$repeat_prob=shift;
-next;
-}
-if($_ eq "-show_break") {
-$show_break++;
-next;
-}
-if($_ eq "-d") {
-$dir=shift;
-next;
-}
-if($_ eq "-forms") {
-$forms=shift;
-next;
-}
-if($_ eq "-warn") {
-$warn++;
-next;
-}
-if($_ eq "-capital") {
-$capital++;
-next;
-}
-if($_ eq "-dictionary") {
-$dictionary++;
-next;
-}
-if($_ eq "-makedict") {
-$makedict++;
-next;
-}
-if($_ eq "-unkdict") {
-$unkdict++;
-next;
-}
-if($_ eq "-vocabulary") {
-$vocabulary++;
-next;
-}
-if($_ eq "-class") {
-$classes++;
-next;
-}
-if($_ eq "-noclass") {
-$noclasses++;
-next;
-}
-if($_ eq "-novariants") {
-$novariants++;
-next;
-}
-if($_ eq "-iclass") {
-$inverse_map++;
-next;
-}
-if($_ eq "-modifyLM") {
-$modify++;
-next;
-}
-if($_ eq "-clausi") {
-$clausi++;
-next;
-}
-die "Unrecognized command line option $_\nUse generate_random_samples -h for help\n";
-}
-
-help() if($help);
-
-if(defined $dir) {
-$currdir=$ENV{PWD};
-die "Cannot change to $dir\n" unless chdir $dir;
-warn "Changed to $dir\n";
-}
-
-
-# First load the forms of that grammer:
-
-$FORMS = "forms";
-$FORMS .= ".$forms" if defined $forms;
-
-open FORMS;
-
-while(<FORMS>) {
-
- $random_perm++ if /%%randomperm%%/;
-
- $prob=get_prob();
-
- if(s/^\s*FUNCTION:\s*(\S+)//) {
- my @F = ();
- my @G = ();
- $name=$1;
- print STDERR "The name of the function is >>$name<<\n";
- undef $nets;
- push(@top_level,$name);
- push(@top_level_prob,$prob);
- push(@top_level_slots,\@F);
- push(@top_level_probs,\@G);
- push(@top_level_perm,$random_perm);
- undef $random_perm;
- $top_level{$name}++;
- }
- next unless defined $name;
-
- if(/^\s*NETS:/) {
- $nets++; next;
- }
- elsif($nets) {
- 1;
- }
- else {
- next;
- }
-
- if(s/^(\t|\s{2,})(\S+)/$2/) {
- $entry=$2;
- if(defined $top_level{$name,$entry}) {
- warn "Multiple entry for $entry in function $name\n" ;
- next;
- }
- push(@{$top_level_slots[$#top_level_slots]},$entry);
- push(@{$top_level_probs[$#top_level_probs]},$prob);
- $top_level{$name,$entry}++;
- $public{$entry}++;
- }
-}
-close FORMS;
-
-#warn join($/, @{$top_level_slots[$#top_level_slots]}), $/;
-
-normalize_random_entry(\@top_level_prob);
-
-foreach(@top_level_prob) {
-normalize_random_entry($_);
-}
-
-
-die "No top-level slots" unless () ne keys %top_level;
-
-# Now load the nets themselves
-
-$NETS = "nets";
-
-open NETS;
-while(<NETS>) {
- next unless /\S/;
- chomp;
- s/\s+$//;
-
- if(defined $nets{$_} || defined $nets{"[$_]"}) {
- warn "Multiple entry for >>$_<< in nets\n" ;
- next;
- }
- $nets{$_}++;
-
-#if(/[^A-Za-z]/) {
- $_="[$_]";
-#}
- $public{$_}++;
-}
-close NETS;
-
-$nets{"noises"}++ if -r "noises.gra";
-
- at noise=("SegmentNoise","BeginNoise","EndNoise","BreakNoise","RandomNoise");
-
-if($nets{"noises"}) {
- foreach(@noise) {
- $public{$_}++;
- }
-}
-
-# Next thing: Load all the grammars
-
-#foreach(keys %nets) {
-# print STDERR "Loading grammar file $_ ............\r";
-#
-# $NAME=$_;
-# load_grammar_file("$NAME.gra");
-#}
-load_grammar_file('zap2flat.gra');
-
-if($nets{"noises"}) {
- foreach(@noise) {
- undef $public{$_} unless defined $rules{$_};
- }
-}
-
-chdir $currdir if defined $currdir;
-print STDERR "Finished loading grammar file \n";
-
-
-# Next thing: Generate sentences
-
-srand;
-
-check_grammar() if $warn;
-
-
-if($dictionary || $vocabulary || $makedict || $unkdict) { # Print all the defined dictionary/vocabulary entries
-foreach $slot_ref (@top_level_slots) {
- foreach(@{$slot_ref}) {
- $slots{$_}++;
- }
-}
- at netname = keys %slots;
-foreach(@noise) {
-push(@netname,$_) if defined $public{$_};
-}
-
-build_reachable_dictionary();
-
-
-open(OUT,"| sort -u | perl -pe 's/\\(0\\)\\}/\\}/;'");
-if($vocabulary) {
-foreach(keys %reachable_t) {
- print OUT "$_\n";
-}
-print OUT "<s>\n</s>\n";
-}
-elsif($dictionary) {
-foreach(keys %reachable_t) {
- if($class{$_}) {
- $_=join("\n",print_net_all($_));
- }
- print OUT "$_\n";
-}
-}
-else { # Makedict !!
- # or unkdict !!
-# Calculate all words that we need from the dictionary
-# Assign an empty list to all of them to get
-# the effective entries
-
-foreach(keys %reachable_t) {
- next if $class{$_};
- if(/\266/) { # In case we have entered that phrase to
- # the dictionary
- next if defined $vocab{$_};
- my @F=();
- $vocab{$_}=\@F;
- }
- if(/\266S$/) {
- my $word2=$_;
- $word2=~s/\266S$/\'S/;
- next if defined $vocab{$word2};
- my @F=();
- $vocab{$word2}=\@F;
- }
- foreach(split(/\266/,$_)) {
- next if defined $vocab{$_};
- my @F=();
- $vocab{$_}=\@F;
- }
-}
-foreach(keys %class) {
- next unless $reachable{$_};
- foreach(print_net_all($_)) {
- ($word,$class)=split(/:/,$_,2);
- if($word=~/\266/) { # In case we have entered that phrase to
- # the dictionary
- next if defined $vocab{$word};
- my @F=();
- $vocab{$word}=\@F;
- }
- if($word=~/\266S$/) {
- my $word2=$word;
- $word2=~s/\266S$/\'S/;
- next if defined $vocab{$word2};
- my @F=();
- $vocab{$word2}=\@F;
- }
- foreach(split(/\266/,$word)) {
- next if defined $vocab{$_};
- my @F=();
- $vocab{$_}=\@F;
- }
- }
-}
-
-# Now selectively read the real dictionary from stdin/commandline
-# Also read pronounciation variants
-
-while(<>) {
- s/^\{?\s*([^\s\}\(]+)(\([^\)]+\))?\s*\}?\s+//;
- $word=$1;
- $ref = $vocab{$word};
- s/^\s*\{?\s*//;
- s/\s*\}?\s*$//;
- foreach $phonem (split) {
- next unless $phonem=~/\S/ && $phonem=~/^[a-z]/i;
- $phonem{$phonem}++;
- }
- next unless defined $ref;
- next if $novariants && @{$ref}>0;
- $pron=$_;
- push(@{$ref},$pron) unless grep($_ eq $pron,@{$ref})>0;
-}
-
-
-if($unkdict) {
- while(($word,$pron) = each %vocab) {
- next if $word=~/\266/;
- print OUT "$word\n" unless @{$pron} ;
- }
-}
-else {
- @phonem = sort { $phonem{$a} <=> $phonem{$b} }
- grep($phonem{$_}>2 && $_ ne "SIL",keys %phonem);
-
- # Pick a very seldom phonem for the words that are artifically in the dictionary
- # to keep the vocabulary module happy
- # These are specifically the class symbols
-
- $badphonem=$phonem[0];
- $badphonem.=" $badphonem";$badphonem.=" $badphonem";
- $badphonem.=" $badphonem";$badphonem.=" $badphonem";
-
- foreach(keys %reachable_t) {
- next if $class{$_};
- print OUT lookup_pronounciation($_,split(/\266/))
- }
- foreach(keys %class) {
- next unless $reachable{$_};
- print OUT "\{$_\} \{$badphonem\}\n";
- foreach $symbol (print_net_all($_)) {
- ($word,$class)=split(/:/,$symbol,2);
- print OUT lookup_pronounciation($symbol,split(/\266/,$word));
- }
- }
- print OUT "{\$} {SIL}\n{(} {SIL}\n{)} {SIL}\n";
-}
-close OUT;
-}
-}
-elsif($classes) {
-foreach $class (keys %class) {
-$_=join(" ",$class,print_net_all($class));
-print "$_\n";
-}
-}
-elsif($inverse_map) {
-foreach $class (keys %class) {
-foreach $classword (print_net_all($class)) {
- ($word)=split(/:/,$classword);
- $imap=$inversemap{$word};
- if(defined $imap) {
- push(@{$imap},$classword);
- }
- else {
- my(@F)=($classword);
- $inversemap{$word}=\@F;
- }
-}
-}
-while(($word,$imap)=each %inversemap) {
-print join("\#",@{$imap})." $word\n";
-}
-}
-elsif($modify) {
-
-while(<>) {
-if(/^\\subsmodel:/) {
- $skipsmodel++;
- last;
-}
-print;
-}
-if($skipsmodel) {
-while(<>) {
- if(/^\\end\\/) {
- last;
- }
-}
-print while <>;
-}
-
-print "\\subsmodel:\n";
-foreach $class (sort keys %class) {
-print "-99.9 $class $class\n";
- at classmembers = print_net_all($class);
-$classpenalty = -log(scalar(@classmembers))/log(10);
-foreach $classmember (sort @classmembers) {
- $_="$classpenalty $classmember $class\n";
- print;
-}
-}
-print "\\end\\\n";
-}
-else { # Print random sentences
-foreach($i=0;$i<$n;$i++) { # Select a top-level frame and generate a sentence for it
-print_random_sentence(select_random_entry(\@top_level_prob,"FORM"));
-}
-}
-
-
-sub print_random_sentence {
-
- my($select) = @_;
- my($select_local,$parse,$parse_local);
-
- #warn "print_random_sentence: top_level[select] -> $top_level[$select] >> $/";
-
- if($top_level_perm[$select]) {
- begin_noise();
- $break=1;
- while($break) {
- $break=0;
- print_net_random(${$top_level_slots[$select]}[$select_local]);
- if($break) {
- break_noise();
- }
- else {
- segment_noise();
- }
- }
- end_noise();
- }
- else {
- begin_noise();
- #foreach $select_local (@{$top_level_slots[$select]}) {
- # tk - fix
- # I don't understand, but without this fix every line would be a sucession of each net.
- $select_local = $top_level_slots[$select][int(rand(scalar(@{$top_level_slots[$select]})))];
- $break=1;
- while($break) {
- $break=0;
- print_net_random($select_local);
- if($break) {
- break_noise();
- }
- else {
- segment_noise();
- }
- }
- #}
- end_noise();
- }
- print "\n";
-}
-
-sub begin_noise {
-print_net_random("BeginNoise") if defined $public{"BeginNoise"};
-}
-
-
-sub end_noise {
-print_net_random("EndNoise") if defined $public{"EndNoise"};
-}
-
-
-sub segment_noise {
-print_net_random("SegmentNoise") if defined $public{"SegmentNoise"};
-}
-
-sub break_noise {
-print_net_random("BreakNoise") if defined $public{"BreakNoise"};
-}
-
-sub random_noise {
-print_net_random("RandomNoise") if defined $public{"RandomNoise"};
-}
-
-sub random_break {
-if(1-$break_prob<rand()) {
-$break++;
-print " **<** " if $show_break;
-}
-}
-
-
-sub select_random_entry {
- my($array,$name) = @_;
- my $prob=0;
- my $random = rand();
- my $return=0;
- my $i=0;
-
- foreach(@$array) {
- $prob+=$_;
- return $i if $random<$prob;
- $i++;
- }
- die "$name not a probability distribution: ".join(" ",@{$array})."\n";
-}
-
-sub normalize_random_entry {
-local($array) = @_;
-my $i;
-my $prob = 0;
-my $zero = 0;
-
-if(@{$array}==1) {
-$$array[0]=1;
-return;
-}
-
-for( $i=0; $i<=$#$array; $i++ ) {
-$prob+=$$array[$i];
-if($$array[$i] == 0) {
- $zero++;
-}
-}
-die "Not a probability distribution $prob>1\n" if $prob>1;
-
-if($zero>0) {
-$prob = (1-$prob)/$zero;
-for( $i=0; $i<=$#$array; $i++ ) {
- $$array[$i] = $prob if $$array[$i]==0;
-}
-}
-}
-
-sub print_net_random {
-
- my($netname) = @_;
- my($repeat,$type,$body);
-
- my($text) = "";
-
- unless(defined $rules{$netname}) {
- warn "Rule for $netname not defined\n" if $warn;
- return "";
- }
-
- my($rule_ref) = ${$rules{$netname}}[select_random_entry($rules_prob{$netname},$netname)];
-
- foreach(@$rule_ref) {
- ($repeat,$type,$body) = split(/,/,$_);
-
- while(1) {
- if($repeat=~s/\*//g) {
- last if rand()<$leave_out_prob;
- $repeat="1" unless $repeat=~/\S/;
- }
- if($type eq "T" || $class{$body}) {
- if($clausi) {
- print "$body\n"; random_noise();
- }
- else {
- print "$body "; random_noise();
- }
- }
- else {
- print_net_random($body);
- random_break();
- }
- return if $break;
- last if $repeat eq "1";
- last unless rand()<$repeat_prob;
- }
- }
-}
-
-
-
-sub print_net_all {
-local($global_netname) = @_;
-local(%already_visited);
-
-my(@return) = print_net_all2($global_netname);
-my($ret_length)=scalar(@return);
- at return=grep(/\S/, at return);
-warn "$global_netname could expand to empty, ignored\n"
-unless @return==$ret_length;
- at return=grep(($_.=":$global_netname") || 1, at return);
-return @return;
-}
-
-
-sub print_net_all2 {
-
-my($netname) = shift @_;
-my($repeat,$type,$body,$ruleref);
-my(@returnarray)=();
-
-unless(defined $rules{$netname}) {
-warn "Rule for $netname not defined\n" if $warn;
-return ();
-}
-
-return @_ if $already_visited{$netname}>1;
-$already_visited{$netname}++;
-foreach $ruleref (@{$rules{$netname}}) {
-my(@rulereturn)=@_;
-foreach(@{$ruleref}) {
- ($repeat,$type,$body) = split(/,/,$_);
-
- my(@localrulereturn)=();
-
- if($repeat eq "*") {
- @localrulereturn=@rulereturn;
- @localrulereturn=("") unless @localrulereturn>0;
- }
- warn "Rule for $netname contains repetition $repeat specification -- ignored"
- unless $repeat eq "1" || $repeat eq "*";
-
- if($type eq "T") {
- $body = "+$body+" if $body =~ s/^&//;
- if(@rulereturn) {
- for($i=0;$i<=$#rulereturn;$i++) {
- if($rulereturn[$i]=~/\S/) {
- $rulereturn[$i].="\266$body";
- } else {
- $rulereturn[$i]="$body";
- }
- }
- }
- else {
- @rulereturn=($body);
- }
- }
- else {
- @rulereturn=print_net_all2($body, at rulereturn);
- }
- push(@rulereturn, at localrulereturn);
-}
-push(@returnarray, at rulereturn);
-}
-$already_visited{$netname}--;
-return @returnarray;
-}
-
-
-sub build_reachable_dictionary {
-
-my($netname);
-my($repeat,$type,$body,$rule_ref);
-
-my($text) = "";
-
-while(@netname) {
-$netname = shift @netname;
-
-if($class{$netname}) {
- $reachable_t{$netname}++;
- $reachable{$netname}++;
- next;
-}
-
-unless(defined $rules{$netname}) {
- warn "Rule for $netname not defined in build_reachable_dictionary\n" if $warn;
- return "";
-}
-
-foreach $rule_ref (@{$rules{$netname}}) {
- foreach(@{$rule_ref}) {
- ($repeat,$type,$body) = split(/,/,$_);
- if($type eq "T") {
- $reachable_t{$body}++;
- }
- else {
- unless(exists $reachable{$body}) {
- push(@netname,$body);
- $reachable{$body}++;
- }
- }
- }
-}
-}
-}
-
-
-sub check_grammar {
-
-while(($definition) = each %public) {
-warn "Net rule $definition not defined\n" unless defined $rules{$definition};
-}
-while(($definition,$count) = each %definition) {
-for($i=1;$i<=$count;$i++) {
- warn "$definition:$count not defined" unless defined $rules{"$definition:$count"};
-}
-}
-}
-
-
-sub get_prob {
-
-if(s/[\#;]([\s\S]+)//) {
-my $comment=$1;
-
-if($comment=~/%%CLASS%%/) {
- $class=1;
-} else {
- $class=0;
-}
-
-return $1 if $comment=~/%%(\S+)%%/;
-}
-$class=0;
-return 0;
-}
-
-
-sub load_grammar_file {
-local($GRAMMAR_FILE) = @_;
-local($concept, at rules, at rules_prob);
-local(*GRAMMAR);
-
-# print STDERR "Reading $GRAMMAR_FILE of $NAME\n";
-
-$concept = "";
- at rules = ();
-$conceptclass=0;
-
-open(GRAMMAR,$GRAMMAR_FILE);
-while(<GRAMMAR>) {
-if(/^\#include\s+(\S+)/) {
- flush_concept();
- $conceptclass=0;
- load_grammar_file($1,$NAME);
-}
-else {
- # Get rid of comments and empty lines
-
- $prob=get_prob(); next unless /\S/;
-
- # Classify line
- if(/^(\S+)/) { # It's a concept
- $new_concept=$1;
- flush_concept();
- $conceptclass=$class;
- $concept=$new_concept;
- }
- else {
- die "No concept defined" unless $concept=~/\S/;
- die "No begin-parenthesis in $concept rules $_\n" unless s/^\s+\(\s*//;
- die "No end-parenthesis in $concept rules $_\n" unless s/\s*\)\s*$//;
-
- push(@rules,$_);
- push(@rules_prob,$prob);
- }
-}
-}
-flush_concept();
-close GRAMMAR;
-}
-
-
-sub select_random_array {
-local($array_ref) = $_[0];;
-
-return @$array_ref[int(rand(scalar(@$array_ref)))];
-}
-
-
-sub flush_concept {
-
-# Determine the symbol-table-entry for the concept defined
-# If it is a top-level entry, it is globally visible.
-# Otherwise, it is only local and may be multiple definded and
-# overwritten
-
-return unless $concept=~/\S/;
-
-if($public{$concept}) {
-$real_name=$concept;
-}
-else {
-$definition{$NAME,$concept};
-$real_name="$NAME:$concept:".($definition{"$NAME:$concept"}+1);
-}
-
-die "Multiple definition of $real_name\n" if defined $rules{$real_name};
-
-
-my @new_rules = ();
-
-foreach(@rules) {
-my @rule = ();
-foreach $body (split) {
- $repeat = "1";
- $repeat = $1 if $body=~s/^([\+\*]+)//;
- $body =~ s/^\s+//; $body =~ s/\s+$//;
- if( ($body=~/^[^A-Z_\-]/) && !($body =~ /^\[.*\]$/)) {
- $type="T";
- $body=~s/\+/'/g; # '
- $body=~tr/a-z/A-Z/ if $capital;
- $body = "+$body+" if $body =~ s/^&//;
- $dictionary{$body}++;
- }
- else {
- $type="N";
-
- unless($public{$body}) {
- $body="$NAME:$body:".($definition{"$NAME:$body"}+1);
- }
- }
- push(@rule,"$repeat,$type,$body");
-}
-
-push(@new_rules,\@rule);
-}
-
-$rules{$real_name}=\@new_rules;
-
-my @new_rules_prob = @rules_prob;
-normalize_random_entry(\@new_rules_prob);
-$rules_prob{$real_name}=\@new_rules_prob;
-
-
-unless($public{$concept}) {
-$definition{"$NAME:$concept"}++;
-}
-
- at rules = ();
- at rules_prob = ();
-$class{$real_name}++ if $conceptclass && !$noclasses;
-
-$concept="";
-
-}
-
-sub lookup_pronounciation {
-
-local($word) = shift @_;
-
-local($error)="";
-
-my(@pron);
-my($pronword)=join("\266", at _);
-
-if(defined $vocab{join("\266", at _)} && @{$vocab{join("\266", at _)}}>0) {
- at pron = allchunk(join("\266", at _));
-}
-elsif(($pronword=~s/\266S$/\'S/) && (defined $vocab{$pronword}) && @{$vocab{$pronword}}>0) {
- at pron = allchunk($pronword);
-}
-else {
- at pron = allchunk(@_);
-}
-return $error if $error=~/\S/; # Error case
-my($ret) = "{$word(0)} {$pron[0]}\n";
-for($i=1;$i<=$#pron;$i++) {
-$ret .= "{$word(".($i+1).")} {$pron[$i]}\n";
-}
-return $ret;
-}
-
-sub allchunk {
-my $prefix = shift @_;
-my $dictref = $vocab{$prefix};
-my @ret=();
-
-unless((defined $dictref) && (@{$dictref}>0)) {
-$error .= "ERROR: Not all subwords ($prefix) known for $word\n";
-allchunk(@_) unless @_==0;
-
-return ();
-}
-
-return @{$dictref} if @_==0;
-
-my($suff,$pref);
-
-foreach $suff (allchunk(@_)) {
-foreach $pref (@{$dictref}) {
- push(@ret,"$pref $suff");
-}
-}
-return @ret;
-}
-
-
-
-
-
-sub help {
-print <<EOT;
-generate_random_samples -h -n N -break prob -show_break -d dir -forms ext -noclass -warn -capital
- -clausi -repeat prob -leave_out prob -vocabulary -class -iclass
- -dictionary -modifyLM [LanguageModel]
-
-
-Generates random sentences from a PHOENIX grammar that is in the
-current directory as if it were a stochastic context free grammmar.
-
-
--h -- print this help
--n N -- print N random sentences ( default $n_default )
--break prob -- probability of a sentence break ( default $break_prob_default )
--show_break -- visualizes the occurence of breaks for debugging
--d dir -- change to directory dir as the PHOENIX grammar dir
--forms ext -- use forms.ext instead of forms to generate from
--noclass -- don\'t use classes when generating text, which is more human-readable
--warn -- only test the grammar
--capital -- capitalize all words
--clausi -- use special output format
--repeat prob -- repetition probability ( default $repeat_prob_default )
--leave_out prob-- leaving out an optional item probability ( default $leave_out_prob_default )
--vocabulary -- print the vocabulary used in the grammar usable for the LM
--class -- print the classes used
--iclass -- print the inverse classes used
--dictionary -- only print all the words in the grammar for pronounciation
- dictionary construction
--modifyLM -- takes an LM from stdin or from files after the options
- (possibly with old classes, these will be eliminated)
- and adds classes according to the specification by the grammar
--makedict -- takes a dictionary from stdin or from files after the options
- and generates a pronounciation dictionary
--unkdict -- takes a dictionary from stdin or from files after the options
- and generates a list of unknown words related to that dictionary
- The assumtion is, that all word strings are comosed into subwords
-
-Written by Klaus Ries <ries\@cs.cmu.edu> <kries\@ira.uka.de>
-Copyright (C) 1996,1997 Interactive System Labs and Klaus Ries
-
-
-Detailed description:
-
-All productions are assumed to be equally likely, unless one specifies
-probabilities after some of them ( the remain probability mass is
-distributed over the other prodcuctions ) in the form %%0.5%% ,
-which is interpreted as a comment by PHOENIX.
-A network can have the specification %%CLASS%% in a comment like in
-
-[townname_l] # %%CLASS%%
-( split )
-( bihac )
-(banja-luka)
-(sarajevo) # %%0.9%% since we are always talk about it
-(tuzla)
-(drvar)
-
-Elements within one class are assumed to be equiprobable.
-The specification of \"+\" is ignored and recursions are broken after
-the second visit of a net.
-
-The probability for leaving out an optional event is $leave_out_prob,
-the probability for repeating an repeatable event is $repeat_prob.
-One can either change these probabilies in the code or can rewrite the
-grammar such that one can specify them with corresponding rules.
-
-The generation of noises is possible if a file "noises.gra" is placed in
-same directory with the other grammar files, not all slots have to be present.
-The file "noises.gra" should look like
-______________________________________
-
-
-# &garbage is expanded to +garbage+ by definition, since "+" at
-# the beginning of a tokebn always means repetition
-BeginNoise # called before every utterance
-( ) # %%0.6%%
-( &garbage ) # %%0.25%%
-( &garbage &garbage )
-( &garbage &garbage &garbage )
-
-EndNoise # called after every utterance
-( ) # %%0.8%%
-( &garbage ) # %%0.15%%
-( &garbage &garbage )
-( &garbage &garbage &garbage )
-
-SegmentNoise # called between slots in the forms-file
-( ) # %%0.97%%
-( &garbage ) # %%0.015%%
-( &garbage &garbage ) # %%0.01%%
-( &garbage &garbage &garbage )
-
-BreakNoise # called after a break occured instead of SegmentNoise
-( ) # %%0.3%%
-( &garbage ) # %%0.6%%
-( &garbage &garbage )
-
-RandomNoise # Called after every word
-( ) # %%0.99%%
-( &garbage ) # %%0.008%%
-( &garbage &garbage )
-______________________________________
-
-A break is a reset of a slot that is entered in the forms file
-and simulates restarts resp. corrections.
-It may occur at every word and a probabilty for this event can be specified.
-Since it is usually marked by discourse markers a separate noise slot is defined for
-this event.
-
-
-EOT
-exit(0);
-}
More information about the Ravenclaw-developers
mailing list