[RavenclawDev 242] [19] Tools: Moved remotely

Thu Apr 5 10:29:08 EDT 2007

An HTML attachment was scrubbed...
URL: http://mailman.srv.cs.cmu.edu/pipermail/ravenclaw-developers/attachments/20070405/16845b02/attachment.html
-------------- next part --------------
Copied: Tools/MakeLM/makelm.pl (from rev 18, Tools/makelm.pl)
===================================================================

--- Tools/MakeLM/makelm.pl	                        (rev 0)
+++ Tools/MakeLM/makelm.pl	2007-04-05 14:29:08 UTC (rev 19)
@@ -0,0 +1,277 @@
+#!/usr/bin/perl
+
+use LWP::UserAgent;
+use HTTP::Request::Common;
+use File::Spec;
+use File::Copy;
+use File::stat;
+$ENV{'LC_COLLATE'} = 'C';
+$ENV{'LC_ALL'} = 'C';
+use locale;
+
+my $project = 'zap2';  # base name used for the grammar, language model and dictionary files
+my $SAMPSIZE = scalar @ARGV? shift: 30000;  # sample count: first command-line argument if given, else 30000
+
+$GRAMMARDIR = File::Spec->catdir(File::Spec->updir(), 'Grammar');  # Grammar directory one level up
+$GRAMMARFILE = File::Spec->catfile($GRAMMARDIR, $project.'.gra');  # grammar read by getcorpus
+$FLATGRAMMARFILE = File::Spec->catfile($GRAMMARDIR, $project.'flat.gra');  # flattened grammar written by getcorpus
+$CORPUS = 'tempfile';  # corpus file written by getcorpus
+$BASEDIC = File::Spec->catfile($GRAMMARDIR, 'base.dic');  # input to getvocab
+$VOCAB = 'vocab';  # vocabulary file written by getvocab, read by getdict
+$CCS = 'temp.ccs';  # file handed to idngram2lm through -context
+$PHOENIX = File::Spec->catfile($GRAMMARDIR, 'compile.exe');  # only referenced by the commented-out code below
+$TEXT2IDNGRAM = File::Spec->catfile('CMU-Cam_Toolkit_v2', 'bin', 'text2idngram');
+$IDNGRAM2LM = File::Spec->catfile('CMU-Cam_Toolkit_v2', 'bin', 'idngram2lm');
+$RANDOMSAMPS = 'generate_random_samples.pl';  # run through a pipe in getcorpus
+$IDNGRAM = $project.'.idngram';  # output of text2idngram, input of idngram2lm
+$LM = File::Spec->catfile(File::Spec->updir(), 'DecoderConfig', 'LanguageModel', 
+                          $project.'LM.arpa');  # target of idngram2lm -arpa
+$DICT = File::Spec->catfile(File::Spec->updir(), 'DecoderConfig', 'Dictionary',
+                            $project.'.dict');  # dictionary written by getdict
+$REDUCED_DICT = $DICT.'.reduced_phoneset';  # second dictionary written by getdict
+
+#get language model
+#copy('../Grammar/ZapTask.gra', '../Grammar/Zap.gra');
+#copy('../Grammar/ZapTask.forms', '../Grammar/forms');
+#chdir '../Grammar';
+#system('mk_nets2.pl Zap.gra');
+#chdir '../MakeLM';
+#system("$PHOENIX -g $GRAMMARDIR -f ..\\Grammar\\Zap");
+&say('compile', 'compiling grammar...');
+chdir($GRAMMARDIR);  # return value is not checked
+system('cmp.bat');  # exit status is ignored, unlike the two system() calls further down
+&say('compile', 'compiling language model...');
+chdir('../MakeLM');  # hard-coded directory name; presumably where this script is started from - TODO confirm
+&say('compile', 'generating corpus...');
+&getcorpus($CORPUS);
+&say('compile', 'getting vocabulary...');
+&getvocab($BASEDIC, $VOCAB, $CCS);
+&say('compile', 'computing ngrams...');
+my $cmd = "$TEXT2IDNGRAM -vocab $VOCAB -temp . -write_ascii < $CORPUS > $IDNGRAM";  # corpus on stdin, n-gram output redirected to $IDNGRAM
+&say('compile', $cmd);
+die "text2idngram failed$/" if system($cmd);  # any non-zero return from system() aborts the script
+&say('compile', 'computing language model...');
+$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -good_turing -disc_ranges 0 0 0 -ascii_input";
+#$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -absolute -ascii_input";
+#$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -ascii_input";
+&say('compile', "$cmd$/");  # say() appends $/ itself, so the separator is printed twice here
+die "idngram2lm failed$/" if system($cmd);
+
+#get dictionary
+&say('compile', 'compiling dictionary...');
+&getdict($DICT, $REDUCED_DICT, $VOCAB);
+&say('compile', "done\n");  # say() also appends $/ after the explicit "\n"
+
+sub getcorpus {  # Write the flattened grammar, then write the sampler's output to the given corpus file.
+    my $corpusfile = shift;  # path of the corpus file to create
+    #flatten grammar
+    open(ZAPGRA, $GRAMMARFILE) || die "Can't open grammar file$/";
+    open(ZAPFLAT, ">$FLATGRAMMARFILE") || die "Can't open grammar flat file$/";
+    print ZAPFLAT &flat(<ZAPGRA>);  # <ZAPGRA> in list context hands every grammar line to flat()
+    close ZAPGRA;
+    close ZAPFLAT;
+
+    open(RANDOM, "$RANDOMSAMPS -n $SAMPSIZE -d $GRAMMARDIR|") || 
+	die "Cannot execute $RANDOMSAMPS$/";  # the trailing '|' makes RANDOM read the command's output
+    open(CORPUS, ">$corpusfile") || die "Can't open $corpusfile$/";
+    binmode CORPUS;
+    while (<RANDOM>) {
+	$_ = uc($_);  # the corpus is written in upper case
+	chomp;
+	s/<\/?S> //g;  # drop any "<S> " or "</S> " already present in the sample
+	print CORPUS "<s> $_ </s>\n";  # every sample becomes one line wrapped in <s> ... </s>
+    }
+    close CORPUS;  # RANDOM is never explicitly closed
+}
+
+sub getvocab {  # Build the vocabulary file from the base dictionary and write the one-line context file.
+    my $basefile = shift;  # dictionary file to read
+    my $vocab = shift;  # vocabulary file to write
+    my $ccs = shift;  # context file to write
+    open(VOCAB, ">$vocab") || die "Can't open $vocab$/";
+    binmode VOCAB;
+    open(BASE, "<$basefile") || die "Can't open $basefile$/";
+    my @base = map { /(.*) .*/? uc("$1\n"): () } <BASE>;  # keep the text before the last space, upper-cased; lines without a space are skipped
+    print VOCAB grep !/<\/?S>/, sort(@base);  # sorted entries, minus any that contain <S> or </S>
+    print VOCAB "<s>\n";  # the two markers are appended after the sorted entries
+    print VOCAB "</s>\n";
+    close VOCAB;  # BASE is never explicitly closed
+    open(CCS, ">$ccs") || die "Can't open $ccs$/";
+    binmode CCS;
+    print CCS "<s>\n";  # the context file contains only this line
+    close CCS;
+}
+
+sub getdict {  # Write the dictionary and its reduced variant from the lines shorten() returns for the vocabulary.
+    my $dict = shift;  # dictionary file to write
+    my $reduced = shift;  # reduced dictionary file to write
+    my $vocab = shift;  # vocabulary file to read
+    my @long;  # vocabulary lines with '=' replaced by '_'
+    open(DICT, ">$dict") || die "Can't open $dict$/";
+    binmode DICT;
+    open(REDUCED, ">$reduced") || die "Can't open $reduced$/";
+    binmode REDUCED;
+    open(VOCAB, "<$vocab") || die "Can't open $vocab$/";
+    for (<VOCAB>) {  # list context: the whole file is read before the loop starts
+	s/=/_/g;
+	push @long, $_;
+    }
+    foreach (&shorten(@long)) {
+	s/_/=/g;  # turns every '_' into '=', not only the ones substituted above
+	print DICT "$_\n";
+	s/\bIX\b/AX/g;  # only the REDUCED copy gets stand-alone IX replaced by AX
+	print REDUCED "$_\n";
+    }
+    close DICT;
+    close REDUCED;  # VOCAB is never explicitly closed
+}
+
+sub shorten {  # Fetch dictionary lines for the given vocabulary lines; tokens of 35+ characters are looked up part by part.
+    open (OUTFILE, ">outfile.tmp");  # neither open is checked for failure
+    open (LONGFILE, ">longfile.tmp");
+
+    my $counter = 0;  # never used in this sub
+    my $not_done = 1;  # flag: current long token is not yet in @listlongwords
+    my @listlongwords;  # long tokens, each recorded once
+
+    foreach (@_) {
+	my @blah = split / /, $_;  # tokens separated by single spaces
+	for my $i (@blah) {
+	    my $len = length $i;
+	    if ($len >= 35) {
+		for my $m (@listlongwords) {
+		    if ($m =~ /$i/) {  # $i is interpolated as a regex without escaping
+			$not_done = 0;
+		    }
+		}
+		
+		if ($not_done) {
+		    push(@listlongwords, $i);
+		}
+		
+		$not_done = 1;  # reset the flag for the next token
+		my @words = split /_/, $i;  # the '_'-separated parts go to longfile.tmp
+		for my $j (@words) {
+		    print LONGFILE "$j ";
+		}
+		print LONGFILE "\n";
+	    } else {
+		print OUTFILE "$i ";  # shorter tokens go to outfile.tmp unchanged
+	    }
+	}
+    }
+    
+    close(OUTFILE);
+    close(LONGFILE);
+
+    my @diclines = 
+	grep !(/CONTENT-TYPE/ || /TEXT\/PLAIN/), &getdic('outfile.tmp');  # drop fetched lines that mention CONTENT-TYPE or TEXT/PLAIN
+    unlink 'outfile.tmp';
+    my @longlines = stat('longfile.tmp')->size? 
+	grep !(/CONTENT-TYPE/ || /TEXT\/PLAIN/), &getdic('longfile.tmp'): ();  # getdic is skipped when longfile.tmp is empty
+    unlink 'longfile.tmp';
+
+    my $not_used = 1;  # flag: no line has been taken yet for the current part
+    
+    for my $i (@listlongwords) {
+	chomp $i;
+	my $lotsaphones = "$i ";  # output line starts with the full long token
+	my @word_parts = split /_/, $i;
+	for my $word (@word_parts) {
+	    $word =~ s/(\W)/\\$1/;  # no /g: only the first non-word character is escaped
+	    #warn "the word to look up is $word\n";
+	    for my $j (@longlines) {
+		if ($not_used) {  # only the first matching line is used for each part
+		    if ($j =~ /^$word\t/) {  # line must start with the part followed by a tab
+			my @pronounciation = split /\t/, $j;
+			chomp $pronounciation[1];
+			$lotsaphones .= "$pronounciation[1] ";  # append the second tab-separated field
+			$not_used = 0;
+		    }
+		}
+	    }
+	    $not_used = 1;
+	}
+	$lotsaphones .= "\n";
+	push @diclines, $lotsaphones;  # assembled long-token lines follow the fetched ones
+    }	
+    return @diclines;
+}
+
+sub getdic {  # POST the named file to the lmtool CGI script, then fetch the dictionary it links to and return its lines.
+    if ($#_ == -1) {  # called without arguments
+	die "Need the corpus location as an argument\n";
+    }
+
+    my $ua = new LWP::UserAgent;
+    my $res = $ua->request(POST 'http://fife.speech.cs.cmu.edu/cgi-bin/tools/lmtool.2.pl', 
+			   Content_Type => 'form-data', 
+			   Content => [formtype => 'simple', 
+				       corpus => [$_[0]],  # array ref: HTTP::Request::Common treats the value as a file to upload
+				       #handdict => undef,
+				       #extrawords => undef,
+				       #phoneset => '40',
+				       #bracket => 'Yes',
+				       #model => 'Bigram',
+				       #class => 'nil',
+				       #discount => '0.5',
+				       submit => 'COMPILE KNOWLEDGE BASE']);
+
+    my $result;  # body of the response to the POST
+    if ($res->is_success) {
+	$result = $res->content;
+    } else {
+	die "Couldn't execute the perl script, probably error in the form$/";
+    }
+    
+    if ($result =~ /\!-- DIC.*ct\/\/(.*)\">/) {  # capture what follows "ct//" after a "!-- DIC" marker, up to the closing quote
+	
+	my $blah = "http://fife.speech.cs.cmu.edu/tools/product//$1";  # URL of the generated dictionary
+	$res = $ua->request(GET $blah);
+    
+	if ($res->is_success) {
+	    return split(/\n/, $res->content);  # returned lines carry no trailing newline
+	} else {
+	    die "Can't find dictionary file$/";
+	}
+    } else {
+	die "Couldn't parse the result: $result$/";  # the whole response body goes into the message
+    }
+}
+
+sub say {  # Print the first argument, ": ", the second argument, then $/.
+    print shift, ": ", shift, $/;  # each shift takes the next element of @_
+}
+
+sub flat {  # Return the given lines with each parenthesised line expanded into variants with and without its '*'-prefixed tokens.
+    my @unflat = @_;  # copy, so the substitution below does not modify the caller's lines
+    my @result;
+    for (@unflat) {
+	if (!s/^\s*\((.*)\)/$1/) {  # lines without a leading "( ... )" are passed through unchanged
+	    push @result, $_;
+	} else {
+	    my @stack;  # token lists still to be processed
+	    my %flathash;  # keys are the generated token strings, so duplicates collapse
+	    push(@stack, [split]);  # whitespace-split tokens of the unwrapped line
+	    while (my $buffref = shift @stack) {
+		my $i = 0;  # index of the current token in @buff
+		my @buff = @$buffref;
+		my $flat;
+		for (@buff) {
+		    if (/^\*(.*)/) {
+			$flat .= "$1 ";  # keep the token without its '*'
+			push(@stack, [ @buff[0..$i-1], @buff[$i+1..$#buff] ]);  # and queue a variant with this token removed
+		    } else {
+			$flat .= "$_ ";
+		    }
+		    $i++;
+		}
+		$flathash{$flat} = 1;
+	    }
+	    foreach (keys %flathash) {  # keys come back in no particular order
+		push @result, "\t( $_)\n";
+	    }
+	}
+    }
+    @result;  # last expression is the return value
+}

Deleted: Tools/makelm.pl
===================================================================
--- Tools/makelm.pl	2007-04-05 14:28:59 UTC (rev 18)
+++ Tools/makelm.pl	2007-04-05 14:29:08 UTC (rev 19)
@@ -1,277 +0,0 @@
-#!/usr/bin/perl
-
-use LWP::UserAgent;
-use HTTP::Request::Common;
-use File::Spec;
-use File::Copy;
-use File::stat;
-$ENV{'LC_COLLATE'} = 'C';
-$ENV{'LC_ALL'} = 'C';
-use locale;
-
-my $project = 'zap2';
-my $SAMPSIZE = scalar @ARGV? shift: 30000;
-
-$GRAMMARDIR = File::Spec->catdir(File::Spec->updir(), 'Grammar');
-$GRAMMARFILE = File::Spec->catfile($GRAMMARDIR, $project.'.gra');
-$FLATGRAMMARFILE = File::Spec->catfile($GRAMMARDIR, $project.'flat.gra');
-$CORPUS = 'tempfile';
-$BASEDIC = File::Spec->catfile($GRAMMARDIR, 'base.dic');
-$VOCAB = 'vocab';
-$CCS = 'temp.ccs';
-$PHOENIX = File::Spec->catfile($GRAMMARDIR, 'compile.exe');
-$TEXT2IDNGRAM = File::Spec->catfile('CMU-Cam_Toolkit_v2', 'bin', 'text2idngram');
-$IDNGRAM2LM = File::Spec->catfile('CMU-Cam_Toolkit_v2', 'bin', 'idngram2lm');
-$RANDOMSAMPS = 'generate_random_samples.pl';
-$IDNGRAM = $project.'.idngram';
-$LM = File::Spec->catfile(File::Spec->updir(), 'DecoderConfig', 'LanguageModel', 
-                          $project.'LM.arpa');
-$DICT = File::Spec->catfile(File::Spec->updir(), 'DecoderConfig', 'Dictionary',
-                            $project.'.dict');
-$REDUCED_DICT = $DICT.'.reduced_phoneset';
-
-#get language model
-#copy('../Grammar/ZapTask.gra', '../Grammar/Zap.gra');
-#copy('../Grammar/ZapTask.forms', '../Grammar/forms');
-#chdir '../Grammar';
-#system('mk_nets2.pl Zap.gra');
-#chdir '../MakeLM';
-#system("$PHOENIX -g $GRAMMARDIR -f ..\\Grammar\\Zap");
-&say('compile', 'compiling grammar...');
-chdir($GRAMMARDIR);
-system('cmp.bat');
-&say('compile', 'compiling language model...');
-chdir('../MakeLM');
-&say('compile', 'generating corpus...');
-&getcorpus($CORPUS);
-&say('compile', 'getting vocabulary...');
-&getvocab($BASEDIC, $VOCAB, $CCS);
-&say('compile', 'computing ngrams...');
-my $cmd = "$TEXT2IDNGRAM -vocab $VOCAB -temp . -write_ascii < $CORPUS > $IDNGRAM";
-&say('compile', $cmd);
-die "text2idngram failed$/" if system($cmd);
-&say('compile', 'computing language model...');
-$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -good_turing -disc_ranges 0 0 0 -ascii_input";
-#$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -absolute -ascii_input";
-#$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -ascii_input";
-&say('compile', "$cmd$/");
-die "idngram2lm failed$/" if system($cmd);
-
-#get dictionary
-&say('compile', 'compiling dictionary...');
-&getdict($DICT, $REDUCED_DICT, $VOCAB);
-&say('compile', "done\n");
-
-sub getcorpus {
-    my $corpusfile = shift;
-    #flatten grammar
-    open(ZAPGRA, $GRAMMARFILE) || die "Can't open grammar file$/";
-    open(ZAPFLAT, ">$FLATGRAMMARFILE") || die "Can't open grammar flat file$/";
-    print ZAPFLAT &flat(<ZAPGRA>);
-    close ZAPGRA;
-    close ZAPFLAT;
-
-    open(RANDOM, "$RANDOMSAMPS -n $SAMPSIZE -d $GRAMMARDIR|") || 
-	die "Cannot execute $RANDOMSAMPS$/";
-    open(CORPUS, ">$corpusfile") || die "Can't open $corpusfile$/";
-    binmode CORPUS;
-    while (<RANDOM>) {
-	$_ = uc($_);
-	chomp;
-	s/<\/?S> //g;
-	print CORPUS "<s> $_ </s>\n";
-    }
-    close CORPUS;
-}
-
-sub getvocab {
-    my $basefile = shift;
-    my $vocab = shift;
-    my $ccs = shift;
-    open(VOCAB, ">$vocab") || die "Can't open $vocab$/";
-    binmode VOCAB;
-    open(BASE, "<$basefile") || die "Can't open $basefile$/";
-    my @base = map { /(.*) .*/? uc("$1\n"): () } <BASE>;
-    print VOCAB grep !/<\/?S>/, sort(@base);
-    print VOCAB "<s>\n";
-    print VOCAB "</s>\n";
-    close VOCAB;
-    open(CCS, ">$ccs") || die "Can't open $ccs$/";
-    binmode CCS;
-    print CCS "<s>\n";
-    close CCS;
-}
-
-sub getdict {
-    my $dict = shift;
-    my $reduced = shift;
-    my $vocab = shift;
-    my @long;
-    open(DICT, ">$dict") || die "Can't open $dict$/";
-    binmode DICT;
-    open(REDUCED, ">$reduced") || die "Can't open $reduced$/";
-    binmode REDUCED;
-    open(VOCAB, "<$vocab") || die "Can't open $vocab$/";
-    for (<VOCAB>) {
-	s/=/_/g;
-	push @long, $_;
-    }
-    foreach (&shorten(@long)) {
-	s/_/=/g;
-	print DICT "$_\n";
-	s/\bIX\b/AX/g;
-	print REDUCED "$_\n";
-    }
-    close DICT;
-    close REDUCED;
-}
-
-sub shorten {
-    open (OUTFILE, ">outfile.tmp");
-    open (LONGFILE, ">longfile.tmp");
-
-    my $counter = 0;
-    my $not_done = 1;
-    my @listlongwords;
-
-    foreach (@_) {
-	my @blah = split / /, $_;
-	for my $i (@blah) {
-	    my $len = length $i;
-	    if ($len >= 35) {
-		for my $m (@listlongwords) {
-		    if ($m =~ /$i/) {
-			$not_done = 0;
-		    }
-		}
-		
-		if ($not_done) {
-		    push(@listlongwords, $i);
-		}
-		
-		$not_done = 1;
-		my @words = split /_/, $i;
-		for my $j (@words) {
-		    print LONGFILE "$j ";
-		}
-		print LONGFILE "\n";
-	    } else {
-		print OUTFILE "$i ";
-	    }
-	}
-    }
-    
-    close(OUTFILE);
-    close(LONGFILE);
-
-    my @diclines = 
-	grep !(/CONTENT-TYPE/ || /TEXT\/PLAIN/), &getdic('outfile.tmp');
-    unlink 'outfile.tmp';
-    my @longlines = stat('longfile.tmp')->size? 
-	grep !(/CONTENT-TYPE/ || /TEXT\/PLAIN/), &getdic('longfile.tmp'): ();
-    unlink 'longfile.tmp';
-
-    my $not_used = 1;
-    
-    for my $i (@listlongwords) {
-	chomp $i;
-	my $lotsaphones = "$i ";
-	my @word_parts = split /_/, $i;
-	for my $word (@word_parts) {
-	    $word =~ s/(\W)/\\$1/;
-	    #warn "the word to look up is $word\n";
-	    for my $j (@longlines) {
-		if ($not_used) {
-		    if ($j =~ /^$word\t/) {
-			my @pronounciation = split /\t/, $j;
-			chomp $pronounciation[1];
-			$lotsaphones .= "$pronounciation[1] ";
-			$not_used = 0;
-		    }
-		}
-	    }
-	    $not_used = 1;
-	}
-	$lotsaphones .= "\n";
-	push @diclines, $lotsaphones;
-    }	
-    return @diclines;
-}
-
-sub getdic {
-    if ($#_ == -1) {
-	die "Need the corpus location as an argument\n";
-    }
-
-    my $ua = new LWP::UserAgent;
-    my $res = $ua->request(POST 'http://fife.speech.cs.cmu.edu/cgi-bin/tools/lmtool.2.pl', 
-			   Content_Type => 'form-data', 
-			   Content => [formtype => 'simple', 
-				       corpus => [$_[0]], 
-				       #handdict => undef,
-				       #extrawords => undef,
-				       #phoneset => '40',
-				       #bracket => 'Yes',
-				       #model => 'Bigram',
-				       #class => 'nil',
-				       #discount => '0.5',
-				       submit => 'COMPILE KNOWLEDGE BASE']);
-
-    my $result;
-    if ($res->is_success) {
-	$result = $res->content;
-    } else {
-	die "Couldn't execute the perl script, probably error in the form$/";
-    }
-    
-    if ($result =~ /\!-- DIC.*ct\/\/(.*)\">/) {
-	
-	my $blah = "http://fife.speech.cs.cmu.edu/tools/product//$1";
-	$res = $ua->request(GET $blah);
-    
-	if ($res->is_success) {
-	    return split(/\n/, $res->content);
-	} else {
-	    die "Can't find dictionary file$/";
-	}
-    } else {
-	die "Couldn't parse the result: $result$/";
-    }
-}
-
-sub say {
-    print shift, ": ", shift, $/;
-}
-
-sub flat {
-    my @unflat = @_;
-    my @result;
-    for (@unflat) {
-	if (!s/^\s*\((.*)\)/$1/) {
-	    push @result, $_;
-	} else {
-	    my @stack;
-	    my %flathash;
-	    push(@stack, [split]);
-	    while (my $buffref = shift @stack) {
-		my $i = 0;
-		my @buff = @$buffref;
-		my $flat;
-		for (@buff) {
-		    if (/^\*(.*)/) {
-			$flat .= "$1 ";
-			push(@stack, [ @buff[0..$i-1], @buff[$i+1..$#buff] ]);
-		    } else {
-			$flat .= "$_ ";
-		    }
-		    $i++;
-		}
-		$flathash{$flat} = 1;
-	    }
-	    foreach (keys %flathash) {
-		push @result, "\t( $_)\n";
-	    }
-	}
-    }
-    @result;
-}