[TeamTalk 280]: [816] trunk/TeamTalk/Resources: merge from branches/air/ Resources for class-based lm
tk@edam.speech.cs.cmu.edu
tk at edam.speech.cs.cmu.edu
Mon Oct 8 13:06:21 EDT 2007
An HTML attachment was scrubbed...
URL: http://mailman.srv.cs.cmu.edu/pipermail/teamtalk-developers/attachments/20071008/03d8d376/attachment-0001.html
-------------- next part --------------
Modified: trunk/TeamTalk/Resources/DecoderConfig/male-16khz.arg
===================================================================
--- trunk/TeamTalk/Resources/DecoderConfig/male-16khz.arg 2007-10-08 16:09:47 UTC (rev 815)
+++ trunk/TeamTalk/Resources/DecoderConfig/male-16khz.arg 2007-10-08 17:06:21 UTC (rev 816)
@@ -23,7 +23,7 @@
-fwdflatbeam 1e-8
-fwdflatnwbeam 3e-4
-rescorelw 9.5
- -lmfn LanguageModel\TeamTalkLM.arpa
+ -lmctlfn LanguageModel\TeamTalk.ctl
-dictfn Dictionary\TeamTalk.dict.reduced_phoneset
-ndictfn Dictionary\noise.dict
-phnfn HMM-16khz.ss/phone
Copied: trunk/TeamTalk/Resources/Grammar/DynamicRobotName.class (from rev 815, branches/air/Resources/Grammar/DynamicRobotName.class)
===================================================================
--- trunk/TeamTalk/Resources/Grammar/DynamicRobotName.class (rev 0)
+++ trunk/TeamTalk/Resources/Grammar/DynamicRobotName.class 2007-10-08 17:06:21 UTC (rev 816)
@@ -0,0 +1,4 @@
+ (alphie)
+ (bashful)
+ (clyde)
+ (decker)
Copied: trunk/TeamTalk/Resources/Grammar/Number-20.class (from rev 815, branches/air/Resources/Grammar/Number-20.class)
===================================================================
--- trunk/TeamTalk/Resources/Grammar/Number-20.class (rev 0)
+++ trunk/TeamTalk/Resources/Grammar/Number-20.class 2007-10-08 17:06:21 UTC (rev 816)
@@ -0,0 +1,21 @@
+ (zero)
+ (one)
+ (two)
+ (three)
+ (four)
+ (five)
+ (six)
+ (seven)
+ (eight)
+ (nine)
+ (ten)
+ (eleven)
+ (twelve)
+ (thirteen)
+ (fourteen)
+ (fifteen)
+ (sixteen)
+ (seventeen)
+ (eighteen)
+ (nineteen)
+ (twenty)
Copied: trunk/TeamTalk/Resources/Grammar/Number-80-by5.class (from rev 815, branches/air/Resources/Grammar/Number-80-by5.class)
===================================================================
--- trunk/TeamTalk/Resources/Grammar/Number-80-by5.class (rev 0)
+++ trunk/TeamTalk/Resources/Grammar/Number-80-by5.class 2007-10-08 17:06:21 UTC (rev 816)
@@ -0,0 +1,16 @@
+ (five)
+ (ten)
+ (fifteen)
+ (twenty)
+ (twenty five)
+ (thirty)
+ (thirty five)
+ (forty)
+ (forty five)
+ (fifty)
+ (fifty five)
+ (sixty)
+ (sixty five)
+ (seventy)
+ (seventy five)
+ (eighty)
Copied: trunk/TeamTalk/Resources/Grammar/Number-95-by5.class (from rev 815, branches/air/Resources/Grammar/Number-95-by5.class)
===================================================================
--- trunk/TeamTalk/Resources/Grammar/Number-95-by5.class (rev 0)
+++ trunk/TeamTalk/Resources/Grammar/Number-95-by5.class 2007-10-08 17:06:21 UTC (rev 816)
@@ -0,0 +1,19 @@
+ (five)
+ (ten)
+ (fifteen)
+ (twenty)
+ (twenty five)
+ (thirty)
+ (thirty five)
+ (forty)
+ (forty five) #%%0.1%% there are more likely?
+ (fifty)
+ (fifty five)
+ (sixty)
+ (sixty five)
+ (seventy)
+ (seventy five)
+ (eighty)
+ (eighty five)
+ (ninety) #%%0.1%%
+ (ninety five)
Modified: trunk/TeamTalk/Resources/Grammar/TeamTalkTask.forms
===================================================================
--- trunk/TeamTalk/Resources/Grammar/TeamTalkTask.forms 2007-10-08 16:09:47 UTC (rev 815)
+++ trunk/TeamTalk/Resources/Grammar/TeamTalkTask.forms 2007-10-08 17:06:21 UTC (rev 816)
@@ -18,20 +18,12 @@
FUNCTION: Commands
NETS:
- [HumanReportCommand]
- [HumanLocationQuery]
- [HumanMoveCommand]
-# [MoveVectorCardinal]
- [MoveVectorRelative]
- [MoveToGoal]
- [HumanGoodbyeCommand]
- [HumanTurnCommand]
- [HumanHaltCommand]
- [HumanFollowCommand]
- [HumanPauseCommand]
- [HumanContinueCommand]
- [HumanExploreCommand]
- [HumanSearchCommand]
+ [InvokePlay]
+ [ControlRobot]
+ [QueryRobot]
+ [MoveRobot]
+ [RespondYesNo]
+# [HumanGoodbyeCommand]
;
FUNCTION: Features
@@ -39,43 +31,10 @@
[AbsoluteDistance]
[TurnDirection]
[MoveDirection]
- [Units]
;
-FUNCTION: YesNo
- NETS:
-# [Neither]
- [Yes]
- [No]
-;
-FUNCTION: Cancel
- NETS:
- [Cancel]
-;
-#FUNCTION: Queries
-# NETS:
-# [QueryProjector]
-# [QueryWhiteboard]
-# [QueryComputer]
-# [QueryNetworking]
-# [QueryLocation]
-# [QueryRoomSize]
-# [QueryRoomSizeSpec]
-# [QueryOtherRooms]
-# [QueryRoomDetails]
-#;
-
-#FUNCTION: Responses
-# NETS:
-# [Indifferent]
-# [Satisfied]
-# [SomewhatSatisfied]
-# [FirstOne]
-# [SecondOne]
-#;
-
# these auxiliaries are defined in order to capture some parses like
# next, this that, which o/w would parse as date-time
#FUNCTION: Auxiliaries
Modified: trunk/TeamTalk/Resources/Grammar/TeamTalkTask.gra
===================================================================
--- trunk/TeamTalk/Resources/Grammar/TeamTalkTask.gra 2007-10-08 16:09:47 UTC (rev 815)
+++ trunk/TeamTalk/Resources/Grammar/TeamTalkTask.gra 2007-10-08 17:06:21 UTC (rev 816)
@@ -10,23 +10,66 @@
[RobotName]
(everyone)
+ (%[DynamicRobotName]%) # class stub
;
[OBJ-Robot]
([RobotName])
;
+
+######### Main Nets ##############
+
+[InvokePlay]
+ ([HumanExploreCommand])
+ ([HumanSearchCommand])
+ ([HumanFollowCommand])
+;
+
+[ControlRobot]
+ ([HumanReportCommand])
+ ([HumanPauseCommand])
+ ([HumanContinueCommand])
+ ([HumanHaltCommand])
+ ([Cancel])
+;
+
+[QueryRobot]
+ ([HumanLocationQuery])
+;
+
+[MoveRobot]
+ ([HumanMoveCommand])
+ ([HumanTurnCommand])
+ ([MoveVectorRelative])
+# ([MoveVectorCardinal])
+ ([MoveToGoal])
+;
+
+[RespondYesNo]
+ ([Yes])
+ ([No])
+;
+
+###########################################
+
+
+
[HumanExploreCommand]
- (explore)
+ (explore *[MapLocation])
;
[HumanSearchCommand]
- (search)
+ (search *[MapLocation])
;
+[MapLocation]
+ (this space)
+ (the area)
+;
+
[HumanFollowCommand]
(*[RobotName] FOLLOW [OBJ-Robot])
-
FOLLOW
(join)
(follow)
@@ -34,11 +77,11 @@
;
[HumanPauseCommand]
- (*[RobotName] pause)
+ (*[RobotName] pause *task)
;
[HumanContinueCommand]
- (*[RobotName] continue)
+ (*[RobotName] continue *task)
;
[HumanReportCommand]
@@ -48,64 +91,59 @@
[HumanLocationQuery]
(*[RobotName] where are you)
+ (*[RobotName] report location)
;
[HumanHaltCommand]
(*[RobotName] all stop)
+ (*[RobotName] stop immediately)
;
+[Number-180-by5]
+# equalize the relative proportion
+ ( %[Number-95-by5]% ) #%%0.54%%
+ ( HUNDRED *and %[Number-80-by5]% ) #%%0.46%%
+HUNDRED
+ ( a hundred )
+ ( one hundred )
+;
+
+
[TurnDirection]
- (right *[AngularQualifier])
- (left *[AngularQualifier])
+ (*PREP SIDE *[AngularQualifier])
+ ([AngularQualifier] *PREP *SIDE)
(around)
+SIDE
+ (right)
+ (left)
+PREP
+ (to the)
+ (to your)
;
[MoveDirection]
- (right *[AngularQualifier])
- (left *[AngularQualifier])
+ (*PREP SIDE)
(straight)
(forward)
(forwards)
(back)
(backward)
(backwards)
+SIDE
+ (left)
+ (right)
+PREP
+ (to the)
+ (to your)
;
[AngularQualifier]
([Number-180-by5] degrees)
;
-[Number-180-by5]
- (five)
- (ten)
- (fifteen)
- (twenty *five)
- (thirty *five)
- (forty *five)
- (fifty *five)
- (sixty *five)
- (seventy *five)
- (eighty *five)
- (ninety *five)
- (HUNDRED)
- (HUNDRED *and five)
- (HUNDRED *and ten)
- (HUNDRED *and fifteen)
- (HUNDRED *and twenty *five)
- (HUNDRED *and thirty *five)
- (HUNDRED *and fourty *five)
- (HUNDRED *and fifty *five)
- (HUNDRED *and sixty *five)
- (HUNDRED *and seventy *five)
- (HUNDRED *and eighty)
-HUNDRED
- (a hundred)
- (one hundred)
-;
-
[AbsoluteDistance]
- ([Number-20] [Units])
+ (%[Number-20]% [Units])
;
[RelativeDistance]
@@ -122,15 +160,17 @@
;
[TeamTalkHalf]
- (halfway)
- (one half)
- (a half)
- (half)
+ (*MOD half)
+MOD
+ (one)
+ (a)
;
[TeamTalkThird]
- (one third)
- (a third)
+ (MOD third)
+MOD
+ (one)
+ (a)
;
[TeamTalkTwoThird]
@@ -139,20 +179,18 @@
;
[TeamTalkOneQuarter]
- (one quarters)
(one quarter)
- (one forth)
- (a quarters)
+ (one fourth)
(a quarter)
- (a forth)
+ (a fourth)
;
[TeamTalkThreeQuarter]
(three quarter)
(three quarters)
- (three forth)
- (three forths)
+ (three fourth)
+ (three fourths)
;
[HumanMoveCommand]
@@ -177,7 +215,6 @@
[MoveVectorRelative]
(*[RobotName] MOVE *[MoveDirection] [AbsoluteDistance])
(*[RobotName] MOVE *[AbsoluteDistance] [MoveDirection])
-
MOVE
(move)
(go)
@@ -185,15 +222,21 @@
(return)
;
+[HumanTurnCommand]
+ (*[RobotName] TURN [TurnDirection])
+TURN
+ (turn)
+ (face)
+ (move)
+ (go)
+;
[MoveToGoal]
(*[RobotName] MOVE *[RelativeDistance] PREP *[Side] [Goal])
(*[RobotName] MOVE [Home])
-
MOVE
- (move)
- (go)
- (drive)
-
+ (move *to)
+ (go *to)
+ (drive *to)
PREP
(toward)
(towards)
@@ -217,19 +260,26 @@
[Home]
(home)
+ (base)
;
[Xcoord]
- (*negative [Number-20])
+ (*NEG %[Number-20]%)
+NEG
+ (negative)
+ (minus)
;
[Ycoord]
- (*negative [Number-20])
+ (*NEG %[Number-20]%)
+NEG
+ (negative)
+ (minus)
;
[Units]
- (meters)
- (meter)
+ (metres)
+ (metre)
# (feet)
# (foot)
# (yards)
@@ -238,42 +288,13 @@
[HumanGoodbyeCommand]
(goodbye)
- (bye bye)
+ (bye)
(mission complete)
- (that's it)
+# (that's it)
;
-[HumanTurnCommand]
- (TURN [TurnDirection])
-TURN
- (turn)
- (face)
-;
-[Number-20]
- (zero)
- (one)
- (two)
- (three)
- (four)
- (five)
- (six)
- (seven)
- (eight)
- (nine)
- (ten)
- (eleven)
- (twelve)
- (thirteen)
- (fourteen)
- (fifteen)
- (sixteen)
- (seventeen)
- (eighteen)
- (nineteen)
- (twenty)
-;
###################################################################
# YES/NO grammar
@@ -281,20 +302,17 @@
[Yes]
(YES *MOD)
- (STRONG_MOD)
(OKAY)
- (WEAK_MOD)
+ (WEAK_MOD) #%%0.10%% # weaks don't seem likely in this domain
+ (STRONG_MOD) #%%0.10%%
YES
(yes)
(yeah)
- (yep)
- (yup)
+# (yup)
MOD
(STRONG_MOD)
(WEAK_MOD)
STRONG_MOD
- (you betcha)
-#tk hack: interferes with "go forward" (*let's go for it)
(absolutely)
(definitely)
(OKAY OKAY)
@@ -303,20 +321,18 @@
(i think so)
(i guess so)
OKAY
+ (okay)
(sure)
(of course)
- (ok)
- (okay)
(correct)
- (fine)
+# (fine)
(perfect)
(great)
- (wonderful)
+# (wonderful)
(acceptable)
(good *enough)
- (right)
- (alright)
- (cool)
+# (right)
+# (alright)
;
[No]
@@ -327,9 +343,8 @@
(no way)
(*no i DONT)
(*no i DONT think so)
- (never mind)
(nevermind)
- (*no not really)
+ (not really)
(nowhere)
(negative)
DONT
@@ -337,19 +352,18 @@
(do not)
MOD
(thanks)
- (thank you)
+ (thank=you) # should be a lexeme
(not really)
- (i *really don't want to)
(it's not)
(i'm not)
-NO
- (no)
- (not)
-GOOD
- (right)
- (correct)
- (good)
- (okay)
+#NO
+# (no)
+# (not)
+#GOOD
+# (right)
+# (correct)
+# (good)
+# (okay)
;
@@ -360,6 +374,7 @@
[Cancel]
(CANCEL *COMMAND)
CANCEL
+ (abort)
(cancel)
(quit)
COMMAND
Modified: trunk/TeamTalk/Resources/Grammar/cmp.pl
===================================================================
--- trunk/TeamTalk/Resources/Grammar/cmp.pl 2007-10-08 16:09:47 UTC (rev 815)
+++ trunk/TeamTalk/Resources/Grammar/cmp.pl 2007-10-08 17:06:21 UTC (rev 816)
@@ -1,22 +1,37 @@
#!/usr/local/bin/perl
use strict;
+use Getopt::Long;
+my $classflag = 0;
+if (not GetOptions( "class" => \$classflag, )) { die "usage: cmp.pl [-class]\n"; }
+print STDERR "cmp.pl: class is $classflag\n";
open(TTGRA, ">TeamTalk.gra");
open(NETS, ">nets");
open(TTTASKGRA, "TeamTalkTask.gra");
+
+# check if a robot names file is available, copy into class file
+if ( $classflag and -e 'TeamTalkRobots' ) {
+ system("copy","TeamTalkRobots","RobotName.class");
+}
+
+# substitute in the robot names
while(<TTTASKGRA>) {
print TTGRA $_;
next unless (/^\[([^\]]+)\]/);
print NETS "$1\n";
- next unless $1 eq 'RobotName' && -e 'TeamTalkRobots';
- open(TTROBOTS, "TeamTalkRobots");
- for my $robot (grep /\S/, <TTROBOTS>) {
+ # backward compatible behavior
+ if ( not $classflag ) {
+ next unless $1 eq 'RobotName' && -e 'TeamTalkRobots';
+ print STDERR "cmp.pl: directly inserting Robot Names\n";
+ open(TTROBOTS, "TeamTalkRobots");
+ for my $robot (grep /\S/, <TTROBOTS>) {
chop $robot;
$robot =~ s/\r$//;
print TTGRA "\t($robot)\n";
+ }
+ close TTROBOTS;
}
- close TTROBOTS;
}
open(FORMS, ">forms");
@@ -30,4 +45,7 @@
close COMPILE; close LOG;
system("concept_leaf -grammar TeamTalk.net");
-1;
+
+
+# 1; # now a program
+exit 1;
Copied: trunk/TeamTalk/Resources/Grammar/compile_gra.pl (from rev 815, branches/air/Resources/Grammar/compile_gra.pl)
===================================================================
--- trunk/TeamTalk/Resources/Grammar/compile_gra.pl (rev 0)
+++ trunk/TeamTalk/Resources/Grammar/compile_gra.pl 2007-10-08 17:06:21 UTC (rev 816)
@@ -0,0 +1,59 @@
+#!/usr/local/bin/perl
+# compile a grammar into forms and nets files
+# produce a "final" version of the grammar (after resolution)
+
+use strict;
+use Getopt::Long;
+
+# some defaults
+my $domain = "TeamTalkTask";
+my $ingra = "$domain.gra";
+my $project = "TeamTalk";
+my $outgra = "$project.gra";
+my $absgra = "$project.grabs";
+
+my $classflag = 0;
+if (not GetOptions( "class" => \$classflag,
+ "domain:s" => \$domain,
+ "project:s" => \$project,
+ "ingra:s" => \$ingra,
+ "absgra:s" => \$absgra,
+ ) )
+ { die "usage: compile_gra [-class] [-project <project> -ingra <.gra> -absgra <.absgra>\n"; }
+print STDERR "compile_gra: class->$classflag ingra->$ingra outgra->$outgra\n";
+
+# check if a robot names file is available, copy into class file (note DOS)
+# HARDWIRED!!
+if ( $classflag and -e 'TeamTalkRobots' ) {
+ open(IN,"TeamTalkRobots") or die "compile_gra: can't open TeamTalkRobots!\n";
+ open(OUT,">DynamicRobotName.class") or die "compile_gra: can't open DynamicRobotName.class!\n";
+ while (<IN>) { chomp; print OUT "\t($_)\n"; }
+}
+
+# resolve classes to make "extended" and "abstracted" grammars
+system("perl resolve.pl -i $ingra -e $outgra -a $absgra");
+
+# fish out the net names
+open(TTGRA, "$outgra") or die "compile_gra: can't open $outgra!\n";;
+open(NETS, ">nets") or die "compile_gra: can't open nets!\n";;
+while(<TTGRA>) {
+ next unless (/^\[([^\]]+)\]/);
+ print NETS "$1\n";
+}
+
+# copy over the forms file
+open(TTFORMS, "$domain.forms") or die "compile_gra: no $ingra.forms file!\n";
+open(FORMS, ">forms");
+print FORMS <TTFORMS>;
+close TTFORMS; close FORMS;
+
+# compile Phoenix grammar
+open(COMPILE, "compile -g . -f $project |");
+open(LOG, ">log"); print LOG <COMPILE>; close LOG;
+close COMPILE;
+
+system("concept_leaf -grammar $project.net");
+
+
+exit 1;
+#
Copied: trunk/TeamTalk/Resources/Grammar/resolve.pl (from rev 815, branches/air/Resources/Grammar/resolve.pl)
===================================================================
--- trunk/TeamTalk/Resources/Grammar/resolve.pl (rev 0)
+++ trunk/TeamTalk/Resources/Grammar/resolve.pl 2007-10-08 17:06:21 UTC (rev 816)
@@ -0,0 +1,79 @@
+#!E:/Perl/bin/perl -w
+# resolve class references in a .gra file
+# produce an expanded version for Phoenix, abstracted version for generate_*
+#
+# path to a class file is notated as "%[File]%" --> File.class
+# a file of that name should exist, in the folder with the .gra file
+# [20070923] (air)
+
+use Getopt::Long;
+
+my ($infile,$expfile);
+my $usage = "usage: resolve -infile <.gra> -expgra <_exp.gra> -abstgra <abs_.gra> \n";
+if ( scalar @ARGV eq 0 or
+ not GetOptions ( "infile:s" => \$infile,
+ "expgra:s" => \$expfile,
+ "absgra:s" => \$absfile,
+ ) ) { die $usage; }
+print STDERR "resolve: infile-> $infile; graex-> $expfile; grabs-> $absfile\n";
+open(IN,$infile) or die "resolve: can't open $infile!\n";
+open(OUT,">$expfile") or die "resolve: can't open expgra: $expfile!\n";
+open(ABS,">$absfile") or die "resolve: can't open absgra: $absfile!\n";
+
+my $postscript = <<EOS;
+
+
+############################################
+## Automatically generated class nets ##
+############################################
+############################################
+
+EOS
+
+my $preamble = <<EOS;
+#
+#
+########################################################################
+## automatically generated intermediate grammar; DO NOT EDIT! ##
+########################################################################
+#
+#
+
+EOS
+
+print OUT $preamble;
+print ABS $preamble;
+
+while (<IN>) {
+ chomp;
+ if ( /(.+?)%\[(.+?)\]%(.*?)$/) {
+ $pre = $1; $file=$2; $post=$3;
+ print OUT "$pre\[$file\]$post\n";
+ print ABS "$pre%\[$file\]%$post\n"; # pass the marker through
+ } else { print OUT "$_\n"; print ABS "$_\n"; next; }
+ if ( not defined $classnet{$file} ) {
+ print STDERR "resolve: defining $file\n";
+ open(CLASS,"$file.class") or die "missing .class file: $file\n";
+ my $classset = "\n[$file]\n";
+ while (<CLASS>) {
+ chomp;
+ if ( /#/ ) { ($text,$com) = split /\s*#\s*/,$_,2; $div="#"; }
+ else { $text = $_; $com = ""; $div = "";}
+ $text =~ s/^\s*(.+?)\s*$/$1/;
+ $classset .= "\t$text\t$div$com\n";
+ }
+ $classset .= ";\n";
+ $classnet{$file} = $classset;
+ close(CLASS);
+ }
+}
+close(IN);
+
+# add class nets at the end of the file
+print OUT $postscript;
+foreach $net (sort keys %classnet) { print OUT $classnet{$net}; }
+close(OUT);
+
+close(ABS);
+
+#
Copied: trunk/TeamTalk/Resources/Grammar/tokenize.pl (from rev 815, branches/air/Resources/Grammar/tokenize.pl)
===================================================================
--- trunk/TeamTalk/Resources/Grammar/tokenize.pl (rev 0)
+++ trunk/TeamTalk/Resources/Grammar/tokenize.pl 2007-10-08 17:06:21 UTC (rev 816)
@@ -0,0 +1,155 @@
+#!E:/Perl/bin/perl.exe -w
+# convert a .class file into:
+# a) .probdef file b) .token file (for dict) c) .ctl file
+# [20070923] (air)
+
+use Getopt::Long;
+use File::Basename;
+
+my ($grafile,$project,$wordfile);
+my $usage="usage: tokenize -grammar <file> -project <name>\n";
+if (scalar @ARGV eq 0
+ or not GetOptions (
+ "grammar=s" => \$grafile,
+ "project=s" => \$project,
+ ) ) { die $usage; }
+$probdefile = "$project.probdef";
+$tokenfile = "$project.token";
+$wordfile = "$project.words";
+print STDERR "tokenize: grammar->$grafile; project->$project; wordfile->$wordfile\n";
+my $classcount = 0;
+
+my $epsilon = 0.0001; # slop factor for probability distribution (10^-4)
+my $fault = 0;
+
+# scan .gra file; make list of classes that need to be processed
+# also collect all terminals to make a wordlist (for lm compilation)
+my %classes = (); my %wordlist = ();
+open(GRA,$grafile) or die "tokenize: $grafile not found!\n$usage\n";
+while (<GRA>) {
+ chomp;
+ if ( /^\s*#/ or /^\s+$/ ) { next; } # skip comments, blank lines
+ if ( /\s+\(\s*(.+?)\)\s*/) { # look only at ()'s
+ @toks = split /\s+/, $1;
+ } else { next; }
+#print STDERR "$_\n ->";
+ foreach $tok (@toks) {
+ $tok =~ s/^\**(.+)/$1/; # strip off Kleene star
+# print STDERR " '$tok'";
+ if ( $tok =~ /^[A-Z]+/ ) { next; } # skip macros
+ if ( $tok =~ /%(\[.+?\])%/) { # keep protected net names, keep []'s
+ if ( not defined $classes{$1} ) {
+ print STDERR "tokenize: found $1\n";
+ $classcount++;
+ }
+ $classes{$1} = sprintf "C%02d",$classcount;
+ $wordlist{$1} = "c"; # remember type
+# print STDERR " $1($wordlist{$1})";
+ } elsif ( $tok =~ /^\[.+?\]/ ) { next; } # other net, ignore
+ else {
+ $w = $1;
+ $wordlist{$w} = "w";
+# print STDERR " {$w}($wordlist{$w})";
+ }
+ }
+# print STDERR "\n";
+}
+close(GRA);
+
+
+# do each class
+open(PROB,">$probdefile") or die "tokenize: can't open $probdefile";
+foreach $classfil (sort keys %classes) {
+ $classid = $classes{$classfil};
+ $classfil =~ s/\[(.+?)\]/$1/; # strip []'s
+ open(CLASS,"$classfil.class") or die "tokenize: class file $classfil not found";
+ ($classname,$dirn,$suffix) = fileparse($classfil,qr/\.[^.]*/);
+ my %lexset = ();
+ while (<CLASS>) {
+ chomp;
+ $line = $_;
+ if ( /#/ ) { # has a comment, necessarily a prob
+ ($text,$com) = split /\s*#\s*/,$line,2;
+ if ( $com =~ /%%(\d\.\d+)%%/ ) { $prob = $1; }
+ else { # bad
+ print STDERR "tokenize: possible malformed probability in $classfil \"$line\" --> ignored\n";
+ $prob = undef;
+ $fault++;
+ }
+ } else { # unspecified: "implicit"
+ $text = $line; $prob = undef;
+ }
+ $text =~ s/^\s*\((.+?)\)\s*$/$1/; # trim spaces from ends, strip ()'s
+ $text =~ s/\s+/=/g; # tokenize the text by substituting spaces
+ $tokens{"$text:$classid"}++;
+ $lexset{"$text:$classid"} = $prob;
+ }
+ close(CLASS);
+
+ # evaluate probabilities
+ $mass = 0.0; $empty = 0;
+ foreach $lex (keys %lexset) {
+ if ( defined $lexset{$lex}) { $mass += $lexset{$lex}; }
+ else { $empty++; }
+ }
+ if ($mass<0.0 or $mass>1.0) {
+ print STDERR "tokenize: $classfil -> explicit probs add up to $mass!\n";
+ $fault++;
+ }
+ # fix up the probabilities so that everything adds up right
+ $adjust = 1.0; $dist = 0.0;
+ if ($empty eq 0 and $mass gt 0.0 and $mass lt (1.0-$epsilon)) { # all probs explicit
+ $adjust = 1.0 / $mass; # not enough mass: scale all probs upwards
+ print STDERR "tokenize: $classfil -> explicit probs scaled by $adjust\n";
+ } elsif ($mass lt 1.0 and $empty gt 0) {
+ $dist = (1.0 - $mass)/$empty; # some probs not specified: split remaining mass
+ print STDERR "tokenize: $classfil -> token implicit probabilities set to $dist\n";
+ } elsif ( $mass gt 1.0) { # something not right...
+ $adjust = 1.0 / ($mass+($epsilon*$empty)); # too much mass: scale all probs down
+ print STDERR "tokenize: $classfil -> explicit probs scaled by $adjust\n";
+ $dist = $epsilon; # but set all other tokens to min prob
+ print STDERR "tokenize: $classfil -> $empty token probs set to $epsilon\n";
+ }
+
+ # readjust the class member probabilities
+ foreach $lex (keys %lexset) {
+ if ( defined $lexset{$lex} ) { $lexset{$lex} *= $adjust; }
+ else { $lexset{$lex} = $dist; }
+ }
+
+ # add to the .probdef file
+ print PROB "LMCLASS [$classname]\n";
+ foreach $lex (sort keys %lexset) {
+ printf PROB "%s\t%8.6f\n", uc($lex),$lexset{$lex};
+ }
+ print PROB "END [$classname]\n\n";
+}
+close(PROB);
+
+# create .words file (for lm compilation); includes class []'s --> UPPERCASE
+open(WRD,">$wordfile") or die "tokenize: can't open $wordfile!\n";
+foreach $t (sort keys %wordlist) {
+ if ( $t =~ /\[.+?\]/ ) { print WRD "$t\n"; } else { print WRD "\U$t\n"; }
+}
+close(WRD);
+
+# create the .token file (for pronunciation dict); excludes []'s -> UPPERCASE
+open(TOK,">$tokenfile") or die "tokenize: can't write to $tokenfile\n";
+foreach (keys %tokens) { $wordlist{$_}="t";} # add in the wordlist
+foreach $tok (sort keys %wordlist) {
+ if ($tok =~ /\[.+?\]/ ) { next; } # but ignore nets []'s
+ print TOK "\U$tok\n"; # for compatibility with pronounce
+}
+close(TOK);
+
+# create a .ctl file
+open(CTL,">$project.ctl") or die "tokenize: can't write to .ctl file!\n";
+print CTL "{ LanguageModel\\$project.probdef }\nLanguageModel\\$project.arpa general {\n";
+foreach $class (sort keys %classes) {
+ ($classname,$dirn,$suffix) = fileparse($class,qr/\.[^.]*/);
+ print CTL "$classname\n";
+}
+print CTL "}\n";
+close(CTL);
+
+#
More information about the TeamTalk-developers mailing list