[RavenclawDev 242] [19] Tools: Moved remotely
tk@edam.speech.cs.cmu.edu
tk at edam.speech.cs.cmu.edu
Thu Apr 5 10:29:08 EDT 2007
An HTML attachment was scrubbed...
URL: http://mailman.srv.cs.cmu.edu/pipermail/ravenclaw-developers/attachments/20070405/16845b02/attachment.html
-------------- next part --------------
Copied: Tools/MakeLM/makelm.pl (from rev 18, Tools/makelm.pl)
===================================================================
--- Tools/MakeLM/makelm.pl (rev 0)
+++ Tools/MakeLM/makelm.pl 2007-04-05 14:29:08 UTC (rev 19)
@@ -0,0 +1,277 @@
+#!/usr/bin/perl
+
+use LWP::UserAgent;
+use HTTP::Request::Common;
+use File::Spec;
+use File::Copy;
+use File::stat;
+$ENV{'LC_COLLATE'} = 'C';
+$ENV{'LC_ALL'} = 'C';
+use locale;
+
+my $project = 'zap2';
+my $SAMPSIZE = scalar @ARGV? shift: 30000;
+
+$GRAMMARDIR = File::Spec->catdir(File::Spec->updir(), 'Grammar');
+$GRAMMARFILE = File::Spec->catfile($GRAMMARDIR, $project.'.gra');
+$FLATGRAMMARFILE = File::Spec->catfile($GRAMMARDIR, $project.'flat.gra');
+$CORPUS = 'tempfile';
+$BASEDIC = File::Spec->catfile($GRAMMARDIR, 'base.dic');
+$VOCAB = 'vocab';
+$CCS = 'temp.ccs';
+$PHOENIX = File::Spec->catfile($GRAMMARDIR, 'compile.exe');
+$TEXT2IDNGRAM = File::Spec->catfile('CMU-Cam_Toolkit_v2', 'bin', 'text2idngram');
+$IDNGRAM2LM = File::Spec->catfile('CMU-Cam_Toolkit_v2', 'bin', 'idngram2lm');
+$RANDOMSAMPS = 'generate_random_samples.pl';
+$IDNGRAM = $project.'.idngram';
+$LM = File::Spec->catfile(File::Spec->updir(), 'DecoderConfig', 'LanguageModel',
+ $project.'LM.arpa');
+$DICT = File::Spec->catfile(File::Spec->updir(), 'DecoderConfig', 'Dictionary',
+ $project.'.dict');
+$REDUCED_DICT = $DICT.'.reduced_phoneset';
+
+#get language model
+#copy('../Grammar/ZapTask.gra', '../Grammar/Zap.gra');
+#copy('../Grammar/ZapTask.forms', '../Grammar/forms');
+#chdir '../Grammar';
+#system('mk_nets2.pl Zap.gra');
+#chdir '../MakeLM';
+#system("$PHOENIX -g $GRAMMARDIR -f ..\\Grammar\\Zap");
+&say('compile', 'compiling grammar...');
+chdir($GRAMMARDIR);
+system('cmp.bat');
+&say('compile', 'compiling language model...');
+chdir('../MakeLM');
+&say('compile', 'generating corpus...');
+&getcorpus($CORPUS);
+&say('compile', 'getting vocabulary...');
+&getvocab($BASEDIC, $VOCAB, $CCS);
+&say('compile', 'computing ngrams...');
+my $cmd = "$TEXT2IDNGRAM -vocab $VOCAB -temp . -write_ascii < $CORPUS > $IDNGRAM";
+&say('compile', $cmd);
+die "text2idngram failed$/" if system($cmd);
+&say('compile', 'computing language model...');
+$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -good_turing -disc_ranges 0 0 0 -ascii_input";
+#$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -absolute -ascii_input";
+#$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -ascii_input";
+&say('compile', "$cmd$/");
+die "idngram2lm failed$/" if system($cmd);
+
+#get dictionary
+&say('compile', 'compiling dictionary...');
+&getdict($DICT, $REDUCED_DICT, $VOCAB);
+&say('compile', "done\n");
+
+sub getcorpus {
+ my $corpusfile = shift;
+ #flatten grammar
+ open(ZAPGRA, $GRAMMARFILE) || die "Can't open grammar file$/";
+ open(ZAPFLAT, ">$FLATGRAMMARFILE") || die "Can't open grammar flat file$/";
+ print ZAPFLAT &flat(<ZAPGRA>);
+ close ZAPGRA;
+ close ZAPFLAT;
+
+ open(RANDOM, "$RANDOMSAMPS -n $SAMPSIZE -d $GRAMMARDIR|") ||
+ die "Cannot execute $RANDOMSAMPS$/";
+ open(CORPUS, ">$corpusfile") || die "Can't open $corpusfile$/";
+ binmode CORPUS;
+ while (<RANDOM>) {
+ $_ = uc($_);
+ chomp;
+ s/<\/?S> //g;
+ print CORPUS "<s> $_ </s>\n";
+ }
+ close CORPUS;
+}
+
+sub getvocab {
+ my $basefile = shift;
+ my $vocab = shift;
+ my $ccs = shift;
+ open(VOCAB, ">$vocab") || die "Can't open $vocab$/";
+ binmode VOCAB;
+ open(BASE, "<$basefile") || die "Can't open $basefile$/";
+ my @base = map { /(.*) .*/? uc("$1\n"): () } <BASE>;
+ print VOCAB grep !/<\/?S>/, sort(@base);
+ print VOCAB "<s>\n";
+ print VOCAB "</s>\n";
+ close VOCAB;
+ open(CCS, ">$ccs") || die "Can't open $ccs$/";
+ binmode CCS;
+ print CCS "<s>\n";
+ close CCS;
+}
+
+sub getdict {
+ my $dict = shift;
+ my $reduced = shift;
+ my $vocab = shift;
+ my @long;
+ open(DICT, ">$dict") || die "Can't open $dict$/";
+ binmode DICT;
+ open(REDUCED, ">$reduced") || die "Can't open $reduced$/";
+ binmode REDUCED;
+ open(VOCAB, "<$vocab") || die "Can't open $vocab$/";
+ for (<VOCAB>) {
+ s/=/_/g;
+ push @long, $_;
+ }
+ foreach (&shorten(@long)) {
+ s/_/=/g;
+ print DICT "$_\n";
+ s/\bIX\b/AX/g;
+ print REDUCED "$_\n";
+ }
+ close DICT;
+ close REDUCED;
+}
+
+sub shorten {
+ open (OUTFILE, ">outfile.tmp");
+ open (LONGFILE, ">longfile.tmp");
+
+ my $counter = 0;
+ my $not_done = 1;
+ my @listlongwords;
+
+ foreach (@_) {
+ my @blah = split / /, $_;
+ for my $i (@blah) {
+ my $len = length $i;
+ if ($len >= 35) {
+ for my $m (@listlongwords) {
+ if ($m =~ /$i/) {
+ $not_done = 0;
+ }
+ }
+
+ if ($not_done) {
+ push(@listlongwords, $i);
+ }
+
+ $not_done = 1;
+ my @words = split /_/, $i;
+ for my $j (@words) {
+ print LONGFILE "$j ";
+ }
+ print LONGFILE "\n";
+ } else {
+ print OUTFILE "$i ";
+ }
+ }
+ }
+
+ close(OUTFILE);
+ close(LONGFILE);
+
+ my @diclines =
+ grep !(/CONTENT-TYPE/ || /TEXT\/PLAIN/), &getdic('outfile.tmp');
+ unlink 'outfile.tmp';
+ my @longlines = stat('longfile.tmp')->size?
+ grep !(/CONTENT-TYPE/ || /TEXT\/PLAIN/), &getdic('longfile.tmp'): ();
+ unlink 'longfile.tmp';
+
+ my $not_used = 1;
+
+ for my $i (@listlongwords) {
+ chomp $i;
+ my $lotsaphones = "$i ";
+ my @word_parts = split /_/, $i;
+ for my $word (@word_parts) {
+ $word =~ s/(\W)/\\$1/;
+ #warn "the word to look up is $word\n";
+ for my $j (@longlines) {
+ if ($not_used) {
+ if ($j =~ /^$word\t/) {
+ my @pronounciation = split /\t/, $j;
+ chomp $pronounciation[1];
+ $lotsaphones .= "$pronounciation[1] ";
+ $not_used = 0;
+ }
+ }
+ }
+ $not_used = 1;
+ }
+ $lotsaphones .= "\n";
+ push @diclines, $lotsaphones;
+ }
+ return @diclines;
+}
+
+sub getdic {
+ if ($#_ == -1) {
+ die "Need the corpus location as an argument\n";
+ }
+
+ my $ua = new LWP::UserAgent;
+ my $res = $ua->request(POST 'http://fife.speech.cs.cmu.edu/cgi-bin/tools/lmtool.2.pl',
+ Content_Type => 'form-data',
+ Content => [formtype => 'simple',
+ corpus => [$_[0]],
+ #handdict => undef,
+ #extrawords => undef,
+ #phoneset => '40',
+ #bracket => 'Yes',
+ #model => 'Bigram',
+ #class => 'nil',
+ #discount => '0.5',
+ submit => 'COMPILE KNOWLEDGE BASE']);
+
+ my $result;
+ if ($res->is_success) {
+ $result = $res->content;
+ } else {
+ die "Couldn't execute the perl script, probably error in the form$/";
+ }
+
+ if ($result =~ /\!-- DIC.*ct\/\/(.*)\">/) {
+
+ my $blah = "http://fife.speech.cs.cmu.edu/tools/product//$1";
+ $res = $ua->request(GET $blah);
+
+ if ($res->is_success) {
+ return split(/\n/, $res->content);
+ } else {
+ die "Can't find dictionary file$/";
+ }
+ } else {
+ die "Couldn't parse the result: $result$/";
+ }
+}
+
+sub say {
+ print shift, ": ", shift, $/;
+}
+
+sub flat {
+ my @unflat = @_;
+ my @result;
+ for (@unflat) {
+ if (!s/^\s*\((.*)\)/$1/) {
+ push @result, $_;
+ } else {
+ my @stack;
+ my %flathash;
+ push(@stack, [split]);
+ while (my $buffref = shift @stack) {
+ my $i = 0;
+ my @buff = @$buffref;
+ my $flat;
+ for (@buff) {
+ if (/^\*(.*)/) {
+ $flat .= "$1 ";
+ push(@stack, [ @buff[0..$i-1], @buff[$i+1..$#buff] ]);
+ } else {
+ $flat .= "$_ ";
+ }
+ $i++;
+ }
+ $flathash{$flat} = 1;
+ }
+ foreach (keys %flathash) {
+ push @result, "\t( $_)\n";
+ }
+ }
+ }
+ @result;
+}
Deleted: Tools/makelm.pl
===================================================================
--- Tools/makelm.pl 2007-04-05 14:28:59 UTC (rev 18)
+++ Tools/makelm.pl 2007-04-05 14:29:08 UTC (rev 19)
@@ -1,277 +0,0 @@
-#!/usr/bin/perl
-
-use LWP::UserAgent;
-use HTTP::Request::Common;
-use File::Spec;
-use File::Copy;
-use File::stat;
-$ENV{'LC_COLLATE'} = 'C';
-$ENV{'LC_ALL'} = 'C';
-use locale;
-
-my $project = 'zap2';
-my $SAMPSIZE = scalar @ARGV? shift: 30000;
-
-$GRAMMARDIR = File::Spec->catdir(File::Spec->updir(), 'Grammar');
-$GRAMMARFILE = File::Spec->catfile($GRAMMARDIR, $project.'.gra');
-$FLATGRAMMARFILE = File::Spec->catfile($GRAMMARDIR, $project.'flat.gra');
-$CORPUS = 'tempfile';
-$BASEDIC = File::Spec->catfile($GRAMMARDIR, 'base.dic');
-$VOCAB = 'vocab';
-$CCS = 'temp.ccs';
-$PHOENIX = File::Spec->catfile($GRAMMARDIR, 'compile.exe');
-$TEXT2IDNGRAM = File::Spec->catfile('CMU-Cam_Toolkit_v2', 'bin', 'text2idngram');
-$IDNGRAM2LM = File::Spec->catfile('CMU-Cam_Toolkit_v2', 'bin', 'idngram2lm');
-$RANDOMSAMPS = 'generate_random_samples.pl';
-$IDNGRAM = $project.'.idngram';
-$LM = File::Spec->catfile(File::Spec->updir(), 'DecoderConfig', 'LanguageModel',
- $project.'LM.arpa');
-$DICT = File::Spec->catfile(File::Spec->updir(), 'DecoderConfig', 'Dictionary',
- $project.'.dict');
-$REDUCED_DICT = $DICT.'.reduced_phoneset';
-
-#get language model
-#copy('../Grammar/ZapTask.gra', '../Grammar/Zap.gra');
-#copy('../Grammar/ZapTask.forms', '../Grammar/forms');
-#chdir '../Grammar';
-#system('mk_nets2.pl Zap.gra');
-#chdir '../MakeLM';
-#system("$PHOENIX -g $GRAMMARDIR -f ..\\Grammar\\Zap");
-&say('compile', 'compiling grammar...');
-chdir($GRAMMARDIR);
-system('cmp.bat');
-&say('compile', 'compiling language model...');
-chdir('../MakeLM');
-&say('compile', 'generating corpus...');
-&getcorpus($CORPUS);
-&say('compile', 'getting vocabulary...');
-&getvocab($BASEDIC, $VOCAB, $CCS);
-&say('compile', 'computing ngrams...');
-my $cmd = "$TEXT2IDNGRAM -vocab $VOCAB -temp . -write_ascii < $CORPUS > $IDNGRAM";
-&say('compile', $cmd);
-die "text2idngram failed$/" if system($cmd);
-&say('compile', 'computing language model...');
-$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -good_turing -disc_ranges 0 0 0 -ascii_input";
-#$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -absolute -ascii_input";
-#$cmd = "$IDNGRAM2LM -idngram $IDNGRAM -vocab $VOCAB -arpa $LM -context $CCS -vocab_type 0 -ascii_input";
-&say('compile', "$cmd$/");
-die "idngram2lm failed$/" if system($cmd);
-
-#get dictionary
-&say('compile', 'compiling dictionary...');
-&getdict($DICT, $REDUCED_DICT, $VOCAB);
-&say('compile', "done\n");
-
-sub getcorpus {
- my $corpusfile = shift;
- #flatten grammar
- open(ZAPGRA, $GRAMMARFILE) || die "Can't open grammar file$/";
- open(ZAPFLAT, ">$FLATGRAMMARFILE") || die "Can't open grammar flat file$/";
- print ZAPFLAT &flat(<ZAPGRA>);
- close ZAPGRA;
- close ZAPFLAT;
-
- open(RANDOM, "$RANDOMSAMPS -n $SAMPSIZE -d $GRAMMARDIR|") ||
- die "Cannot execute $RANDOMSAMPS$/";
- open(CORPUS, ">$corpusfile") || die "Can't open $corpusfile$/";
- binmode CORPUS;
- while (<RANDOM>) {
- $_ = uc($_);
- chomp;
- s/<\/?S> //g;
- print CORPUS "<s> $_ </s>\n";
- }
- close CORPUS;
-}
-
-sub getvocab {
- my $basefile = shift;
- my $vocab = shift;
- my $ccs = shift;
- open(VOCAB, ">$vocab") || die "Can't open $vocab$/";
- binmode VOCAB;
- open(BASE, "<$basefile") || die "Can't open $basefile$/";
- my @base = map { /(.*) .*/? uc("$1\n"): () } <BASE>;
- print VOCAB grep !/<\/?S>/, sort(@base);
- print VOCAB "<s>\n";
- print VOCAB "</s>\n";
- close VOCAB;
- open(CCS, ">$ccs") || die "Can't open $ccs$/";
- binmode CCS;
- print CCS "<s>\n";
- close CCS;
-}
-
-sub getdict {
- my $dict = shift;
- my $reduced = shift;
- my $vocab = shift;
- my @long;
- open(DICT, ">$dict") || die "Can't open $dict$/";
- binmode DICT;
- open(REDUCED, ">$reduced") || die "Can't open $reduced$/";
- binmode REDUCED;
- open(VOCAB, "<$vocab") || die "Can't open $vocab$/";
- for (<VOCAB>) {
- s/=/_/g;
- push @long, $_;
- }
- foreach (&shorten(@long)) {
- s/_/=/g;
- print DICT "$_\n";
- s/\bIX\b/AX/g;
- print REDUCED "$_\n";
- }
- close DICT;
- close REDUCED;
-}
-
-sub shorten {
- open (OUTFILE, ">outfile.tmp");
- open (LONGFILE, ">longfile.tmp");
-
- my $counter = 0;
- my $not_done = 1;
- my @listlongwords;
-
- foreach (@_) {
- my @blah = split / /, $_;
- for my $i (@blah) {
- my $len = length $i;
- if ($len >= 35) {
- for my $m (@listlongwords) {
- if ($m =~ /$i/) {
- $not_done = 0;
- }
- }
-
- if ($not_done) {
- push(@listlongwords, $i);
- }
-
- $not_done = 1;
- my @words = split /_/, $i;
- for my $j (@words) {
- print LONGFILE "$j ";
- }
- print LONGFILE "\n";
- } else {
- print OUTFILE "$i ";
- }
- }
- }
-
- close(OUTFILE);
- close(LONGFILE);
-
- my @diclines =
- grep !(/CONTENT-TYPE/ || /TEXT\/PLAIN/), &getdic('outfile.tmp');
- unlink 'outfile.tmp';
- my @longlines = stat('longfile.tmp')->size?
- grep !(/CONTENT-TYPE/ || /TEXT\/PLAIN/), &getdic('longfile.tmp'): ();
- unlink 'longfile.tmp';
-
- my $not_used = 1;
-
- for my $i (@listlongwords) {
- chomp $i;
- my $lotsaphones = "$i ";
- my @word_parts = split /_/, $i;
- for my $word (@word_parts) {
- $word =~ s/(\W)/\\$1/;
- #warn "the word to look up is $word\n";
- for my $j (@longlines) {
- if ($not_used) {
- if ($j =~ /^$word\t/) {
- my @pronounciation = split /\t/, $j;
- chomp $pronounciation[1];
- $lotsaphones .= "$pronounciation[1] ";
- $not_used = 0;
- }
- }
- }
- $not_used = 1;
- }
- $lotsaphones .= "\n";
- push @diclines, $lotsaphones;
- }
- return @diclines;
-}
-
-sub getdic {
- if ($#_ == -1) {
- die "Need the corpus location as an argument\n";
- }
-
- my $ua = new LWP::UserAgent;
- my $res = $ua->request(POST 'http://fife.speech.cs.cmu.edu/cgi-bin/tools/lmtool.2.pl',
- Content_Type => 'form-data',
- Content => [formtype => 'simple',
- corpus => [$_[0]],
- #handdict => undef,
- #extrawords => undef,
- #phoneset => '40',
- #bracket => 'Yes',
- #model => 'Bigram',
- #class => 'nil',
- #discount => '0.5',
- submit => 'COMPILE KNOWLEDGE BASE']);
-
- my $result;
- if ($res->is_success) {
- $result = $res->content;
- } else {
- die "Couldn't execute the perl script, probably error in the form$/";
- }
-
- if ($result =~ /\!-- DIC.*ct\/\/(.*)\">/) {
-
- my $blah = "http://fife.speech.cs.cmu.edu/tools/product//$1";
- $res = $ua->request(GET $blah);
-
- if ($res->is_success) {
- return split(/\n/, $res->content);
- } else {
- die "Can't find dictionary file$/";
- }
- } else {
- die "Couldn't parse the result: $result$/";
- }
-}
-
-sub say {
- print shift, ": ", shift, $/;
-}
-
-sub flat {
- my @unflat = @_;
- my @result;
- for (@unflat) {
- if (!s/^\s*\((.*)\)/$1/) {
- push @result, $_;
- } else {
- my @stack;
- my %flathash;
- push(@stack, [split]);
- while (my $buffref = shift @stack) {
- my $i = 0;
- my @buff = @$buffref;
- my $flat;
- for (@buff) {
- if (/^\*(.*)/) {
- $flat .= "$1 ";
- push(@stack, [ @buff[0..$i-1], @buff[$i+1..$#buff] ]);
- } else {
- $flat .= "$_ ";
- }
- $i++;
- }
- $flathash{$flat} = 1;
- }
- foreach (keys %flathash) {
- push @result, "\t( $_)\n";
- }
- }
- }
- @result;
-}
More information about the Ravenclaw-developers
mailing list