[TeamTalk 280]: [816] trunk/TeamTalk/Resources: merge from branches/air/ Resources for class-based lm

tk@edam.speech.cs.cmu.edu tk at edam.speech.cs.cmu.edu
Mon Oct 8 13:06:21 EDT 2007


An HTML attachment was scrubbed...
URL: http://mailman.srv.cs.cmu.edu/pipermail/teamtalk-developers/attachments/20071008/03d8d376/attachment-0001.html
-------------- next part --------------
Modified: trunk/TeamTalk/Resources/DecoderConfig/male-16khz.arg
===================================================================
--- trunk/TeamTalk/Resources/DecoderConfig/male-16khz.arg	2007-10-08 16:09:47 UTC (rev 815)
+++ trunk/TeamTalk/Resources/DecoderConfig/male-16khz.arg	2007-10-08 17:06:21 UTC (rev 816)
@@ -23,7 +23,7 @@
  -fwdflatbeam 1e-8
  -fwdflatnwbeam 3e-4
  -rescorelw 9.5
- -lmfn	LanguageModel\TeamTalkLM.arpa
+ -lmctlfn	LanguageModel\TeamTalk.ctl
  -dictfn     Dictionary\TeamTalk.dict.reduced_phoneset
  -ndictfn    Dictionary\noise.dict
  -phnfn      HMM-16khz.ss/phone

Copied: trunk/TeamTalk/Resources/Grammar/DynamicRobotName.class (from rev 815, branches/air/Resources/Grammar/DynamicRobotName.class)
===================================================================
--- trunk/TeamTalk/Resources/Grammar/DynamicRobotName.class	                        (rev 0)
+++ trunk/TeamTalk/Resources/Grammar/DynamicRobotName.class	2007-10-08 17:06:21 UTC (rev 816)
@@ -0,0 +1,4 @@
+	(alphie)
+	(bashful)
+	(clyde)
+	(decker)

Copied: trunk/TeamTalk/Resources/Grammar/Number-20.class (from rev 815, branches/air/Resources/Grammar/Number-20.class)
===================================================================
--- trunk/TeamTalk/Resources/Grammar/Number-20.class	                        (rev 0)
+++ trunk/TeamTalk/Resources/Grammar/Number-20.class	2007-10-08 17:06:21 UTC (rev 816)
@@ -0,0 +1,21 @@
+	(zero)
+	(one)
+	(two)
+	(three)
+	(four)
+	(five)
+	(six)
+	(seven)
+	(eight)
+	(nine)
+	(ten)
+	(eleven)
+	(twelve)
+	(thirteen)
+	(fourteen)
+	(fifteen)
+	(sixteen)
+	(seventeen)
+	(eighteen)
+	(nineteen)
+	(twenty)

Copied: trunk/TeamTalk/Resources/Grammar/Number-80-by5.class (from rev 815, branches/air/Resources/Grammar/Number-80-by5.class)
===================================================================
--- trunk/TeamTalk/Resources/Grammar/Number-80-by5.class	                        (rev 0)
+++ trunk/TeamTalk/Resources/Grammar/Number-80-by5.class	2007-10-08 17:06:21 UTC (rev 816)
@@ -0,0 +1,16 @@
+	(five)
+	(ten)
+	(fifteen)
+	(twenty)
+	(twenty five)
+	(thirty)
+	(thirty five)
+	(forty)
+	(forty five)
+	(fifty)
+	(fifty five)
+	(sixty)
+	(sixty five)
+	(seventy)
+	(seventy five)
+	(eighty)

Copied: trunk/TeamTalk/Resources/Grammar/Number-95-by5.class (from rev 815, branches/air/Resources/Grammar/Number-95-by5.class)
===================================================================
--- trunk/TeamTalk/Resources/Grammar/Number-95-by5.class	                        (rev 0)
+++ trunk/TeamTalk/Resources/Grammar/Number-95-by5.class	2007-10-08 17:06:21 UTC (rev 816)
@@ -0,0 +1,19 @@
+	(five)
+	(ten)
+	(fifteen)
+	(twenty)
+	(twenty five)
+	(thirty)
+	(thirty five)
+	(forty)
+	(forty five)  #%%0.1%%  there are more likely?
+	(fifty)
+	(fifty five)
+	(sixty)
+	(sixty five)
+	(seventy)
+	(seventy five)
+	(eighty)
+	(eighty five)
+	(ninety) #%%0.1%%
+	(ninety five)

Modified: trunk/TeamTalk/Resources/Grammar/TeamTalkTask.forms
===================================================================
--- trunk/TeamTalk/Resources/Grammar/TeamTalkTask.forms	2007-10-08 16:09:47 UTC (rev 815)
+++ trunk/TeamTalk/Resources/Grammar/TeamTalkTask.forms	2007-10-08 17:06:21 UTC (rev 816)
@@ -18,20 +18,12 @@
 
 FUNCTION: Commands
     NETS:
-	[HumanReportCommand]
-	[HumanLocationQuery]
-	[HumanMoveCommand]
-#	[MoveVectorCardinal]
-	[MoveVectorRelative]
-	[MoveToGoal]
-	[HumanGoodbyeCommand]
-	[HumanTurnCommand]	
-	[HumanHaltCommand]
-	[HumanFollowCommand]
-	[HumanPauseCommand]
-	[HumanContinueCommand]
-	[HumanExploreCommand]
-	[HumanSearchCommand]
+	[InvokePlay]
+	[ControlRobot]
+	[QueryRobot]
+	[MoveRobot]
+	[RespondYesNo]
+#	[HumanGoodbyeCommand]
 ;
 
 FUNCTION: Features
@@ -39,43 +31,10 @@
 	[AbsoluteDistance]
 	[TurnDirection]
 	[MoveDirection]
-	[Units]
 ;
 
-FUNCTION: YesNo
-    NETS:
-#	[Neither]
-	[Yes]
-	[No]
-;
 
-FUNCTION: Cancel
-    NETS:
-	[Cancel]
-;
 
-#FUNCTION: Queries
-#    NETS: 
-#	[QueryProjector]
-#	[QueryWhiteboard]
-#	[QueryComputer]
-#	[QueryNetworking]
-#	[QueryLocation]
-#	[QueryRoomSize]
-#	[QueryRoomSizeSpec]
-#	[QueryOtherRooms]
-#	[QueryRoomDetails]
-#;
-
-#FUNCTION: Responses
-#    NETS:
-#	[Indifferent]
-#	[Satisfied]
-#	[SomewhatSatisfied]
-#	[FirstOne]
-#	[SecondOne]
-#;
-
 # these auxiliaries are defined in order to capture some parses like
 # next, this that, which o/w would parse as date-time
 #FUNCTION: Auxiliaries

Modified: trunk/TeamTalk/Resources/Grammar/TeamTalkTask.gra
===================================================================
--- trunk/TeamTalk/Resources/Grammar/TeamTalkTask.gra	2007-10-08 16:09:47 UTC (rev 815)
+++ trunk/TeamTalk/Resources/Grammar/TeamTalkTask.gra	2007-10-08 17:06:21 UTC (rev 816)
@@ -10,23 +10,66 @@
 
 [RobotName]
 	(everyone)
+	(%[DynamicRobotName]%)	# class stub
 ;
 
 [OBJ-Robot]
 	([RobotName])
 ;
 
+
+#########    Main Nets    ##############
+
+[InvokePlay]
+	([HumanExploreCommand])
+	([HumanSearchCommand])
+	([HumanFollowCommand])
+;
+
+[ControlRobot]
+	([HumanReportCommand])
+	([HumanPauseCommand])
+	([HumanContinueCommand])
+	([HumanHaltCommand])
+	([Cancel])
+;
+
+[QueryRobot]
+	([HumanLocationQuery])
+;
+
+[MoveRobot]
+	([HumanMoveCommand])
+	([HumanTurnCommand])
+	([MoveVectorRelative])
+#	([MoveVectorCardinal])
+	([MoveToGoal])
+;
+
+[RespondYesNo]
+	([Yes])
+	([No])
+;
+
+###########################################
+
+
+
 [HumanExploreCommand]
-	(explore)
+	(explore *[MapLocation])
 ;
 
 [HumanSearchCommand]
-	(search)
+	(search *[MapLocation])
 ;
 
+[MapLocation]
+	(this space)
+	(the area)
+;
+
 [HumanFollowCommand]
 	(*[RobotName] FOLLOW [OBJ-Robot])
-
 FOLLOW
 	(join)
 	(follow)
@@ -34,11 +77,11 @@
 ;
 
 [HumanPauseCommand]
-	(*[RobotName] pause)
+	(*[RobotName] pause *task)
 ;
 
 [HumanContinueCommand]
-	(*[RobotName] continue)
+	(*[RobotName] continue *task)
 ;
 
 [HumanReportCommand]
@@ -48,64 +91,59 @@
 
 [HumanLocationQuery]
 	(*[RobotName] where are you)
+	(*[RobotName] report location)
 ;
 
 [HumanHaltCommand]
 	(*[RobotName] all stop)
+	(*[RobotName] stop immediately)
 ;
 
+[Number-180-by5]
+#	equalize the relative proportion
+	( %[Number-95-by5]% ) #%%0.54%%
+	( HUNDRED *and %[Number-80-by5]% ) #%%0.46%%
+HUNDRED
+	( a hundred ) 
+	( one hundred ) 
+;
+
+
 [TurnDirection]
-	(right *[AngularQualifier])
-	(left *[AngularQualifier])
+	(*PREP SIDE *[AngularQualifier])
+	([AngularQualifier] *PREP *SIDE)
 	(around)
+SIDE
+	(right)
+	(left)
+PREP
+	(to the)
+	(to your)
 ;
 
 [MoveDirection]
-	(right *[AngularQualifier])
-	(left *[AngularQualifier])
+	(*PREP SIDE)
 	(straight)
 	(forward)
 	(forwards)
 	(back)
 	(backward)
 	(backwards)
+SIDE
+	(left)
+	(right)
+PREP
+	(to the)
+	(to your)
 ;
 
 [AngularQualifier]
 	([Number-180-by5] degrees)
 ;
 
-[Number-180-by5]
-	(five)
-	(ten)
-	(fifteen)
-	(twenty *five)
-	(thirty *five)
-	(forty *five)
-	(fifty *five)
-	(sixty *five)
-	(seventy *five)
-	(eighty *five)
-	(ninety *five)
-	(HUNDRED)
-	(HUNDRED *and five)
-	(HUNDRED *and ten)
-	(HUNDRED *and fifteen)
-	(HUNDRED *and twenty *five)
-	(HUNDRED *and thirty *five)
-	(HUNDRED *and fourty *five)
-	(HUNDRED *and fifty *five)
-	(HUNDRED *and sixty *five)
-	(HUNDRED *and seventy *five)
-	(HUNDRED *and eighty)
 
-HUNDRED
-	(a hundred)
-	(one hundred)
-;
-
 [AbsoluteDistance]
-	([Number-20] [Units])
+	(%[Number-20]% [Units])
 ;
 
 [RelativeDistance]
@@ -122,15 +160,17 @@
 ;
 
 [TeamTalkHalf]
-	(halfway)
-	(one half)
-	(a half)
-	(half)
+	(*MOD half)
+MOD
+	(one)
+	(a)
 ;
 
 [TeamTalkThird]
-	(one third)
-	(a third)
+	(MOD third)
+MOD
+	(one)
+	(a)
 ;
 
 [TeamTalkTwoThird]
@@ -139,20 +179,18 @@
 ;
 
 [TeamTalkOneQuarter]
-	(one quarters)
 	(one quarter)
-	(one forth)
-	(a quarters)
+	(one fourth)
 	(a quarter)
-	(a forth)
+	(a fourth)
 ;
 
 
 [TeamTalkThreeQuarter]
 	(three quarter)
 	(three quarters)
-	(three forth)
-	(three forths)
+	(three fourth)
+	(three fourths)
 ;
 
 [HumanMoveCommand]
@@ -177,7 +215,6 @@
 [MoveVectorRelative]
 	(*[RobotName] MOVE *[MoveDirection] [AbsoluteDistance])
 	(*[RobotName] MOVE *[AbsoluteDistance] [MoveDirection])
-
 MOVE
 	(move)
 	(go)
@@ -185,15 +222,21 @@
 	(return)
 ;
 
+[HumanTurnCommand]
+	(*[RobotName] TURN [TurnDirection])
+TURN
+	(turn)
+	(face)
+	(move)
+	(go)
+;
 [MoveToGoal]
 	(*[RobotName] MOVE *[RelativeDistance] PREP *[Side] [Goal])
 	(*[RobotName] MOVE [Home])
-	
 MOVE
-	(move)
-	(go)
-	(drive)
-
+	(move *to)
+	(go *to)
+	(drive *to)
 PREP
 	(toward)
 	(towards)
@@ -217,19 +260,26 @@
 
 [Home]
 	(home)
+	(base)
 ;
 
 [Xcoord]
-	(*negative [Number-20])
+	(*NEG %[Number-20]%)
+NEG
+	(negative)
+	(minus)
 ;
 
 [Ycoord]
-	(*negative [Number-20])
+	(*NEG %[Number-20]%)
+NEG
+	(negative)
+	(minus)
 ;
 
 [Units]
-	(meters)
-	(meter)
+	(metres)
+	(metre)
 #	(feet)
 #	(foot)
 #	(yards)
@@ -238,42 +288,13 @@
 
 [HumanGoodbyeCommand]
 	(goodbye)
-	(bye bye)
+	(bye)
 	(mission complete)
-	(that's it)
+#	(that's it)
 ;
 
-[HumanTurnCommand]
-	(TURN [TurnDirection])
 
-TURN
-	(turn)
-	(face)
-;
 
-[Number-20]
-	(zero)
-	(one)
-	(two)
-	(three)
-	(four)
-	(five)
-	(six)
-	(seven)
-	(eight)
-	(nine)
-	(ten)
-	(eleven)
-	(twelve)
-	(thirteen)
-	(fourteen)
-	(fifteen)
-	(sixteen)
-	(seventeen)
-	(eighteen)
-	(nineteen)
-	(twenty)
-;
 
 ###################################################################
 # YES/NO grammar
@@ -281,20 +302,17 @@
 
 [Yes]
 	(YES *MOD)
-	(STRONG_MOD)
 	(OKAY)
-	(WEAK_MOD)
+	(WEAK_MOD)  #%%0.10%%    # weaks don't seem likely in this domain
+	(STRONG_MOD) #%%0.10%%
 YES
 	(yes)
 	(yeah)
-	(yep)
-	(yup)
+#	(yup)
 MOD
 	(STRONG_MOD)
 	(WEAK_MOD)
 STRONG_MOD
-	(you betcha)
-#tk hack: interferes with "go forward"	(*let's go for it)
 	(absolutely)
 	(definitely)
 	(OKAY OKAY)
@@ -303,20 +321,18 @@
 	(i think so)
 	(i guess so)
 OKAY
+	(okay)
 	(sure)
 	(of course)
-	(ok)
-	(okay)
 	(correct)
-	(fine)
+#	(fine)
 	(perfect)
 	(great)
-	(wonderful)
+#	(wonderful)
 	(acceptable)
 	(good *enough)
-	(right)
-	(alright)
-	(cool)
+#	(right)
+#	(alright)
 ;
 
 [No]
@@ -327,9 +343,8 @@
 	(no way)
 	(*no i DONT)
 	(*no i DONT think so)
-	(never mind)
 	(nevermind)
-	(*no not really)
+	(not really)
 	(nowhere)
 	(negative)
 DONT
@@ -337,19 +352,18 @@
 	(do not)
 MOD
 	(thanks)
-	(thank you)
+	(thank=you)     # should be a lexeme
 	(not really)
-	(i *really don't want to)
 	(it's not)
 	(i'm not)
-NO
-	(no)
-	(not)
-GOOD
-	(right)
-	(correct)
-	(good)
-	(okay)
+#NO
+#	(no)
+#	(not)
+#GOOD
+#	(right)
+#	(correct)
+#	(good)
+#	(okay)
 ;
 
 
@@ -360,6 +374,7 @@
 [Cancel]
 	(CANCEL *COMMAND)
 CANCEL
+	(abort)
 	(cancel)
 	(quit)
 COMMAND

Modified: trunk/TeamTalk/Resources/Grammar/cmp.pl
===================================================================
--- trunk/TeamTalk/Resources/Grammar/cmp.pl	2007-10-08 16:09:47 UTC (rev 815)
+++ trunk/TeamTalk/Resources/Grammar/cmp.pl	2007-10-08 17:06:21 UTC (rev 816)
@@ -1,22 +1,37 @@
 #!/usr/local/bin/perl
 
 use strict;
+use Getopt::Long;
 
+my $classflag = 0;
+if (not GetOptions( "class" => \$classflag, )) { die "usage: cmp.pl [-class]\n"; }
+print STDERR "cmp.pl: class is $classflag\n";
 open(TTGRA, ">TeamTalk.gra");
 open(NETS, ">nets");
 open(TTTASKGRA, "TeamTalkTask.gra");
+
+# check if a robot names file is available, copy into class file
+if ( $classflag and -e 'TeamTalkRobots' ) {
+  system("copy","TeamTalkRobots","RobotName.class");
+}
+
+# substitute in the robot names
 while(<TTTASKGRA>) {
     print TTGRA $_;
     next unless (/^\[([^\]]+)\]/);
     print NETS "$1\n";
-    next unless $1 eq 'RobotName' && -e 'TeamTalkRobots';
-    open(TTROBOTS, "TeamTalkRobots");
-    for my $robot (grep /\S/, <TTROBOTS>) {
+    # backward compatible behavior
+    if ( not $classflag ) {
+      next unless $1 eq 'RobotName' && -e 'TeamTalkRobots';
+      print STDERR "cmp.pl: directly inserting Robot Names\n";
+      open(TTROBOTS, "TeamTalkRobots");
+      for my $robot (grep /\S/, <TTROBOTS>) {
 	chop $robot;
 	$robot =~ s/\r$//;
 	print TTGRA "\t($robot)\n";
+      }
+      close TTROBOTS;
     }
-    close TTROBOTS;
 }
 
 open(FORMS, ">forms");
@@ -30,4 +45,7 @@
 close COMPILE; close LOG;
 
 system("concept_leaf -grammar TeamTalk.net");
-1;
+
+
+# 1; # now a program
+exit 1;

Copied: trunk/TeamTalk/Resources/Grammar/compile_gra.pl (from rev 815, branches/air/Resources/Grammar/compile_gra.pl)
===================================================================
--- trunk/TeamTalk/Resources/Grammar/compile_gra.pl	                        (rev 0)
+++ trunk/TeamTalk/Resources/Grammar/compile_gra.pl	2007-10-08 17:06:21 UTC (rev 816)
@@ -0,0 +1,89 @@
+#!/usr/local/bin/perl
+# compile a grammar into forms and nets files
+# produce a "final" version of the grammar (after resolution)
+#
+# usage: compile_gra [-class] [-domain <domain>] [-project <project>]
+#                    [-ingra <.gra>] [-absgra <.grabs>]
+#
+#   -class    rebuild DynamicRobotName.class from TeamTalkRobots, if present
+#   -domain   base name of the forms file       (default: TeamTalkTask)
+#   -project  base name of the compiled grammar (default: TeamTalk)
+#   -ingra    grammar to resolve                (default: <domain>.gra)
+#   -absgra   abstracted grammar to write       (default: <project>.grabs)
+#
+# Writes <project>.gra, the abstracted grammar, "nets", "forms" and "log"
+# in the current directory, then runs concept_leaf on <project>.net.
+
+use strict;
+use Getopt::Long;
+
+# some defaults; the file names derived from them are filled in only after
+# option parsing, so that -domain and -project really take effect
+my $domain = "TeamTalkTask";
+my $project = "TeamTalk";
+my ($ingra, $absgra);
+
+my $classflag = 0;
+if (not GetOptions( "class" => \$classflag,
+                    "domain:s" => \$domain,
+                    "project:s" => \$project,
+                    "ingra:s" => \$ingra,
+                    "absgra:s" => \$absgra,
+                  ) )
+  { die "usage: compile_gra [-class] [-domain <domain>] [-project <project>] [-ingra <.gra>] [-absgra <.grabs>]\n"; }
+$ingra  = "$domain.gra"    unless defined $ingra  and length $ingra;
+$absgra = "$project.grabs" unless defined $absgra and length $absgra;
+my $outgra = "$project.gra";   # resolved grammar, named after the project
+print STDERR "compile_gra: class->$classflag  ingra->$ingra  outgra->$outgra\n";
+
+# check if a robot names file is available, copy into class file (note DOS)
+# HARDWIRED!!
+if ( $classflag and -e 'TeamTalkRobots' ) {
+  open(my $in, "<", "TeamTalkRobots") or die "compile_gra: can't open TeamTalkRobots!\n";
+  open(my $out, ">", "DynamicRobotName.class") or die "compile_gra: can't open DynamicRobotName.class!\n";
+  while (my $robot = <$in>) {
+    $robot =~ s/[\r\n]+\z//;      # drop LF or CRLF line ending
+    next unless $robot =~ /\S/;   # a blank line would give an empty "()" rule
+    print {$out} "\t($robot)\n";
+  }
+  close $in;
+  # the class file has to be complete on disk before resolve.pl reads it
+  close $out or die "compile_gra: can't write DynamicRobotName.class!\n";
+}
+
+# resolve classes to make "extended" and "abstracted" grammars
+# (list form: no shell involved; stop on failure, otherwise a stale
+# $outgra left over from an earlier run would get compiled)
+system("perl", "resolve.pl", "-i", $ingra, "-e", $outgra, "-a", $absgra) == 0
+  or die "compile_gra: resolve.pl failed on $ingra!\n";
+
+# fish out the net names
+open(my $ttgra, "<", $outgra) or die "compile_gra: can't open $outgra!\n";
+open(my $nets, ">", "nets") or die "compile_gra: can't open nets!\n";
+while (<$ttgra>) {
+  next unless (/^\[([^\]]+)\]/);
+  print {$nets} "$1\n";
+}
+close $ttgra;
+close $nets or die "compile_gra: can't write nets!\n";
+
+# copy over the forms file
+open(my $ttforms, "<", "$domain.forms") or die "compile_gra: no $domain.forms file!\n";
+open(my $forms, ">", "forms") or die "compile_gra: can't open forms!\n";
+print {$forms} <$ttforms>;
+close $ttforms;
+close $forms or die "compile_gra: can't write forms!\n";
+
+# compile Phoenix grammar, keeping the compiler's output in "log"
+open(my $compile, "-|", "compile -g . -f $project") or die "compile_gra: can't run compile!\n";
+open(my $log, ">", "log") or die "compile_gra: can't open log!\n";
+print {$log} <$compile>;
+close $log;
+close $compile;
+
+system("concept_leaf", "-grammar", "$project.net");
+
+# NOTE(review): the exit status is 1 even when everything worked; left
+# as is because callers are not visible here -- confirm, then use 0
+exit 1;
+#

Copied: trunk/TeamTalk/Resources/Grammar/resolve.pl (from rev 815, branches/air/Resources/Grammar/resolve.pl)
===================================================================
--- trunk/TeamTalk/Resources/Grammar/resolve.pl	                        (rev 0)
+++ trunk/TeamTalk/Resources/Grammar/resolve.pl	2007-10-08 17:06:21 UTC (rev 816)
@@ -0,0 +1,96 @@
+#!E:/Perl/bin/perl -w
+# resolve class references in a .gra file
+# produce an expanded version for Phoenix, abstracted version for generate_*
+#
+# path to a class file is notated as "%[File]%"  --> File.class
+# a file of that name should exist, in the folder with the .gra file
+# [20070923] (air)
+#
+# In the expanded grammar every "%[File]%" is rewritten to the net reference
+# "[File]" and a net [File], built from File.class, is appended at the end.
+# The abstracted grammar is the input with the markers passed through.
+
+use strict;
+use Getopt::Long;
+
+my ($infile,$expfile,$absfile);
+my $usage = "usage: resolve -infile <.gra> -expgra <_exp.gra> -absgra <abs_.gra> \n";
+if ( scalar @ARGV == 0 or
+     not GetOptions ( "infile:s" => \$infile,
+                      "expgra:s" => \$expfile,
+                      "absgra:s" => \$absfile,
+                    ) ) { die $usage; }
+# all three file names are needed; say so instead of failing in open()
+foreach my $name ($infile,$expfile,$absfile) {
+  die $usage unless defined $name and length $name;
+}
+print STDERR "resolve: infile-> $infile; graex-> $expfile; grabs-> $absfile\n";
+open(my $in, "<", $infile) or die "resolve: can't open $infile!\n";
+open(my $out, ">", $expfile) or die "resolve: can't open expgra: $expfile!\n";
+open(my $abs, ">", $absfile) or die "resolve: can't open absgra: $absfile!\n";
+
+my $postscript = <<EOS;
+
+
+############################################
+##   Automatically generated class nets   ##
+############################################
+############################################
+
+EOS
+
+my $preamble = <<EOS;
+#
+#
+########################################################################
+##   automatically generated intermediate grammar; DO NOT EDIT!       ##
+########################################################################
+#
+#
+
+EOS
+
+print {$out} $preamble;
+print {$abs} $preamble;
+
+my %classnet = ();  # class name -> text of its generated net
+while (my $line = <$in>) {
+  chomp $line;
+  print {$abs} "$line\n";  # pass the markers through
+
+  # markers only count in the rule text, not in a trailing # comment
+  my $cut = index($line,"#");
+  my $rule = $cut < 0 ? $line : substr($line,0,$cut);
+  my $rest = $cut < 0 ? "" : substr($line,$cut);
+  my @files = ();
+  # /g: a rule may refer to more than one class
+  $rule =~ s/%\[(.+?)\]%/push @files,$1; "[$1]"/ge;
+  print {$out} "$rule$rest\n";
+
+  foreach my $file (@files) {
+    next if defined $classnet{$file};
+    print STDERR "resolve: defining $file\n";
+    open(my $class, "<", "$file.class") or die "missing .class file: $file\n";
+    my $classset = "\n[$file]\n";
+    while (my $member = <$class>) {
+      chomp $member;
+      my ($text,$com,$div) = ($member,"","");
+      if ( $member =~ /#/ ) { ($text,$com) = split /\s*#\s*/,$member,2; $div="#"; }
+      $text =~ s/^\s*(.+?)\s*$/$1/;
+      $classset .= "\t$text\t$div$com\n";
+    }
+    $classset .= ";\n";
+    $classnet{$file} = $classset;
+    close($class);
+  }
+}
+close($in);
+
+# add class nets at the end of the file
+print {$out} $postscript;
+foreach my $net (sort keys %classnet) { print {$out} $classnet{$net}; }
+close($out) or die "resolve: can't write expgra: $expfile!\n";
+
+close($abs) or die "resolve: can't write absgra: $absfile!\n";
+
+#

Copied: trunk/TeamTalk/Resources/Grammar/tokenize.pl (from rev 815, branches/air/Resources/Grammar/tokenize.pl)
===================================================================
--- trunk/TeamTalk/Resources/Grammar/tokenize.pl	                        (rev 0)
+++ trunk/TeamTalk/Resources/Grammar/tokenize.pl	2007-10-08 17:06:21 UTC (rev 816)
@@ -0,0 +1,167 @@
+#!E:/Perl/bin/perl.exe -w
+# convert a .class file into:
+#    a) .probdef file   b) .token file (for dict)   c) .ctl file
+# [20070923] (air)
+#
+# usage: tokenize -grammar <file> -project <name>
+#
+# Every class "%[Name]%" referenced in the grammar is read from Name.class.
+# A class line is "(words ...)", optionally followed by "#%%p%%" giving the
+# probability of that member; members without one share the remaining mass.
+# Also writes <project>.words, the word list for lm compilation.
+
+use strict;
+use Getopt::Long;
+use File::Basename;
+
+my ($grafile,$project);
+my $usage="usage: tokenize -grammar <file> -project <name>\n";
+if (scalar @ARGV == 0
+    or not GetOptions (
+                      "grammar=s" => \$grafile,
+                      "project=s" => \$project,
+                      )
+    # both are needed: the output files are all named after the project
+    or not (defined $grafile and defined $project) ) { die $usage; }
+my $probdefile = "$project.probdef";
+my $tokenfile = "$project.token";
+my $wordfile = "$project.words";
+print STDERR "tokenize: grammar->$grafile; project->$project; wordfile->$wordfile\n";
+my $classcount = 0;
+
+my $epsilon = 0.0001;  # slop factor for probability distribution (10^-4)
+my $fault = 0;         # number of problems found in the class files
+
+# scan .gra file; make list of classes that need to be processed
+# also collect all terminals to make a wordlist (for lm compilation)
+my %classes = ();   # "[Name]" -> class id (C01, C02, ...)
+my %wordlist = ();  # word or "[Name]" -> type: w(ord), c(lass), t(oken)
+my %tokens = ();    # "text:classid" of every class member
+open(my $gra, "<", $grafile) or die "tokenize: $grafile not found!\n$usage\n";
+while (<$gra>) {
+  chomp;
+  if ( /^\s*#/ or /^\s+$/ ) { next; }  # skip comments, blank lines
+  next unless /\s+\(\s*(.+?)\)\s*/;    # look only at ()'s
+  my @toks = split /\s+/, $1;
+  foreach my $tok (@toks) {
+    $tok =~ s/^\**(.+)/$1/;  # strip off Kleene star
+    if ( $tok =~ /^[A-Z]+/ ) { next; }  # skip macros
+    if ( $tok =~ /%(\[.+?\])%/) { # keep protected net names, keep []'s
+      my $class = $1;
+      if ( not defined $classes{$class} ) {
+        print STDERR "tokenize: found $class\n";
+        $classcount++;
+        # give out the id once only: renumbering a class each time it is seen
+        # again can hand it the id of a class found in between
+        $classes{$class} = sprintf "C%02d",$classcount;
+      }
+      $wordlist{$class} = "c";  # remember type
+    } elsif ( $tok =~ /^\[.+?\]/ ) { next; }  # other net, ignore
+    else {
+      $wordlist{$tok} = "w";  # plain terminal
+    }
+  }
+}
+close($gra);
+
+
+# do each class
+open(my $prob, ">", $probdefile) or die "tokenize: can't open $probdefile";
+foreach my $classkey (sort keys %classes) {
+  my $classid = $classes{$classkey};
+  (my $classfil = $classkey) =~ s/\[(.+?)\]/$1/;  # strip []'s
+  open(my $class, "<", "$classfil.class") or die "tokenize: class file $classfil not found";
+  my ($classname) = fileparse($classfil,qr/\.[^.]*/);
+  my %lexset = ();  # "text:classid" -> explicit probability, or undef
+  while (my $line = <$class>) {
+    chomp $line;
+    my ($text,$com) = split /\s*#\s*/,$line,2;
+    # nothing but white space or a comment: not a class member
+    next unless defined $text and $text =~ /\S/;
+    my $p;  # stays undef when unspecified: "implicit"
+    if ( defined $com ) { # has a comment, necessarily a prob
+      if ( $com =~ /%%(\d\.\d+)%%/ ) { $p = $1; }
+      else { # bad
+        print STDERR "tokenize: possible malformed probability in $classfil \"$line\" --> ignored\n";
+        $fault++;
+      }
+    }
+    $text =~ s/^\s*\(\s*(.+?)\s*\)\s*$/$1/;  # trim spaces, strip ()'s
+    $text =~ s/\s+/=/g;  # tokenize the text by substituting spaces
+    $tokens{"$text:$classid"}++;
+    $lexset{"$text:$classid"} = $p;
+  }
+  close($class);
+
+  # evaluate probabilities
+  my $mass = 0.0; my $empty = 0;
+  foreach my $lex (keys %lexset) {
+    if ( defined $lexset{$lex}) { $mass += $lexset{$lex}; }
+    else { $empty++; }
+  }
+  if ($mass<0.0 or $mass>1.0) {
+    print STDERR "tokenize: $classfil -> explicit probs add up to $mass!\n";
+    $fault++;
+  }
+  # fix up the probabilities so that everything adds up right
+  # (numeric comparisons throughout: the string operators sort "1e-05"
+  # after "1")
+  my $adjust = 1.0; my $dist = 0.0;
+  if ($empty == 0 and $mass > 0.0 and $mass < (1.0-$epsilon)) { # all probs explicit
+    $adjust = 1.0 / $mass; # not enough mass: scale all probs upwards
+    print STDERR "tokenize: $classfil -> explicit probs scaled by $adjust\n";
+  } elsif ($mass < 1.0 and $empty > 0) {
+    $dist = (1.0 - $mass)/$empty; # some probs not specified: split remaining mass
+    print STDERR "tokenize: $classfil -> token implicit probabilities set to $dist\n";
+  } elsif ( $mass > 1.0) {  # something not right...
+    $adjust = 1.0 / ($mass+($epsilon*$empty)); # too much mass: scale all probs down
+    print STDERR "tokenize: $classfil -> explicit probs scaled by $adjust\n";
+    $dist = $epsilon; # but set all other tokens to min prob
+    print STDERR "tokenize: $classfil -> $empty token probs set to $epsilon\n";
+  }
+
+  # readjust the class member probabilities
+  foreach my $lex (keys %lexset) {
+    if ( defined $lexset{$lex} ) { $lexset{$lex} *= $adjust; }
+    else { $lexset{$lex} = $dist; }
+  }
+
+  # add to the .probdef file
+  print {$prob} "LMCLASS [$classname]\n";
+  foreach my $lex (sort keys %lexset) {
+    printf {$prob} "%s\t%8.6f\n", uc($lex),$lexset{$lex};
+  }
+  print {$prob} "END [$classname]\n\n";
+}
+close($prob);
+
+# create .words file (for lm compilation); includes class []'s  --> UPPERCASE
+open(my $wrd, ">", $wordfile) or die "tokenize: can't open $wordfile!\n";
+foreach my $t  (sort keys %wordlist) {
+  if ( $t =~ /\[.+?\]/ ) { print {$wrd} "$t\n"; } else { print {$wrd} "\U$t\n"; }
+}
+close($wrd);
+
+# create the .token file (for pronunciation dict); excludes []'s -> UPPERCASE
+open(my $tokfh, ">", $tokenfile) or die "tokenize: can't write to $tokenfile\n";
+foreach (keys %tokens) { $wordlist{$_}="t";}  # add in the wordlist
+foreach my $tok (sort keys %wordlist) {
+  if ($tok =~ /\[.+?\]/ ) { next; }  # but ignore nets []'s
+  print {$tokfh} "\U$tok\n";  # for compatibility with pronounce
+}
+close($tokfh);
+
+# create a .ctl file
+open(my $ctl, ">", "$project.ctl") or die "tokenize: can't write to .ctl file!\n";
+print {$ctl} "{ LanguageModel\\$project.probdef }\nLanguageModel\\$project.arpa general {\n";
+foreach my $class (sort keys %classes) {
+  my ($classname) = fileparse($class,qr/\.[^.]*/);
+  print {$ctl} "$classname\n";
+}
+print {$ctl} "}\n";
+close($ctl);
+
+# the problems were already reported one by one; give the total
+print STDERR "tokenize: $fault problem(s) found in the class files\n" if $fault;
+
+#


More information about the TeamTalk-developers mailing list