#!/usr/local/bin/perl
#
# Scores text read from STDIN against a bigram count model and prints the
# accumulated natural-log probability.
#
# Usage: <this script> BASENAME < text
#
#   BASENAME.uni  one unigram per line:  COUNT WORD
#   BASENAME.bi   one bigram per line:   COUNT WORD1 WORD2
#
# For every pair of adjacent words (w1, w2) on an input line the script adds
# log( count(w1 w2) / count(w1) ) to a running total. Logs are summed rather
# than multiplying the raw ratios because the individual values are small.
# Word pairs never span two input lines.

use strict;
use warnings;

# Returns the given count, or 1 when it is missing, zero or otherwise not
# positive. This is the script's crude compensation for unseen words/bigrams
# and also guarantees log() is never handed a non-positive value.
sub count_or_one {
    my ($count) = @_;
    return ( $count && $count > 0 ) ? $count : 1;
}

die "Usage: $0 <model-basename> < text\n" unless @ARGV;
my $base = $ARGV[0];

# Open the unigram and bigram files.
open( my $uni_fh, '<', "$base.uni" )
    or die "Can't open unigram file for $base\n";
open( my $bi_fh, '<', "$base.bi" )
    or die "Can't open bigram file for $base\n";
print "Opened files for reading...\n";

# Read the unigram counts: word => count.
my %uniHash;
while ( my $line = <$uni_fh> ) {
    chomp $line;
    # split ' ' ignores leading whitespace and runs of blanks/tabs.
    my ( $count, $word ) = split ' ', $line;
    next unless defined $word;    # blank or malformed line
    $uniHash{$word} = $count;
}
close $uni_fh;
print "Read unigram file, starting bigram file...\n";

# Read the bigram counts: "word1 word2" => count.
# The key is joined with a space so that distinct pairs such as ("ab", "c")
# and ("a", "bc") cannot collide; words themselves never contain whitespace
# because they come from a whitespace split.
my %biHash;
while ( my $line = <$bi_fh> ) {
    chomp $line;
    my ( $count, $first, $second ) = split ' ', $line;
    next unless defined $second;    # blank or malformed line
    $biHash{"$first $second"} = $count;
}
close $bi_fh;

# Now read lines from STDIN, split them into words, and accumulate the
# log probability.
my $cumulVal = 0;
print "beginning probability calculation...\n";
while ( my $line = <STDIN> ) {
    chomp $line;

    # Lower-case A-Z and turn the listed punctuation characters into spaces
    # (tr repeats the final replacement character for the rest of the list).
    $line =~ tr/A-Z,.$%&?!;:()'"\/-/a-z /;

    # split ' ' drops the empty tokens that removed punctuation leaves behind.
    my @words = split ' ', $line;

    # Walk every adjacent pair; the range is empty for lines with < 2 words.
    for my $i ( 0 .. $#words - 1 ) {
        my $current = $words[$i];
        my $next    = $words[ $i + 1 ];

        my $denom = count_or_one( $uniHash{$current} );
        my $numer = count_or_one( $biHash{"$current $next"} );

        $cumulVal += log( $numer / $denom );
    }
}

print "The accumulated probability log calculated is " , $cumulVal , "\n";
print "\n";