#! /pkg/gnu/bin/gawk -f
#
# Word-frequency mail sorter.
#
# Training:  every file matching class.* in the current directory is read;
#            per-class word counts are collected and each "From " line seen
#            is remembered so the same message can be skipped later.
# Sorting:   the file "ronbox" is read message by message (a message starts
#            at a line matching /^From /).  Each message is scored against
#            every class and appended either to readme.<class> or to
#            readme.unclassified.
# Finally:   every line of the file "sourceme" (if readable) is executed as
#            a shell command.
#
# Globals shared between BEGIN and the helper functions:
#   classes[], words[], wordcount[class,word], totwordcount[class],
#   clashcount[word], trainingheader[line], score[class], store[n],
#   numlines, lastfromline, lastsubjectline, skiptonext, THRESH.

BEGIN {
    # THRESH keeps any non-zero value it already has when BEGIN starts;
    # otherwise the default of .25 is used.
    if (!THRESH) THRESH = .25
    INCLUDETRAIN = 1

    # for now, we are not removing from ronbox and we are rewriting all readme.*
    system("rm -f readme.*")

    ######################## TRAINING ########################
    com = "ls -1 class.*"
    # "> 0" matters: getline returns -1 on error, which a bare truth test
    # would treat as "keep looping".
    while ((com | getline thisclass) > 0) {
        # The file name is data, not a format string.
        printf "\t\tREADING %s", thisclass
        classes[thisclass] = 1
        ofname = "readme."thisclass
        nexemplars = 0
        while ((getline < thisclass) > 0) {
            if ($0 ~ /^From /) {
                trainingheader[$0] = 1
                nexemplars++
            }
            # Training messages are copied into the class's readme file too.
            if (INCLUDETRAIN) print $0 >> ofname
            for (i = 1; i <= NF; i++) {
                wordcount[thisclass,$i]++
                totwordcount[thisclass]++
                words[$i] = 1
            }
        }
        close(thisclass)
        close(ofname)
        print " "nexemplars
    }
    close(com)

    # clashcount[w] = number of classes whose training text contains w.
    # Entries of wordcount are only ever created by "++" above, so a
    # membership test is equivalent to testing for a non-zero count and
    # does not create empty elements.
    for (w in words)
        for (c in classes)
            if ((c,w) in wordcount) clashcount[w]++

    ######################## END TRAINING ########################

    while ((getline < "ronbox") > 0) {
        if ($0 ~ /^Subject: /) {
            lastsubjectline = substr($0,1,50)
            gsub(/[^a-zA-Z: ]/,"",lastsubjectline)
        }

        if ($0 ~ /^From /) {
            # A new message starts: eject the one collected so far.
            classify_pending()

            delete score; numwords = 0
            delete store; numlines = 0
            lastfromline = $2" "$3" "$4" "$5
            # Do not carry the previous message's subject over to a message
            # that has none.
            lastsubjectline = ""

            # Messages whose "From " line also occurred in a training file
            # are skipped entirely.
            skiptonext = ($0 in trainingheader) ? 1 : 0

            # progress marker on every tenth message
            if (++testmsg % 10 == 0) printf "10"
        }

        if (skiptonext) continue

        store[++numlines] = $0

        # Only the beginning of a message is scored.  numlines has already
        # been incremented, so stored lines 1..99 contribute.
        if (numlines < 100) {
            linenorm = sqrt(NF)
            for (i = 1; i <= NF; i++) {
                word = $i
                # Words unknown to training, or present in 4+ classes,
                # contribute nothing.
                if ((word in clashcount) && clashcount[word] < 4) {
                    # favors long words on the square root
                    # (the word's length, capped by maxlength())
                    weight = sqrt(maxlength(word))
                    for (c in classes)
                        if ((c,word) in wordcount)
                            # favors words unique to class on inverse square
                            # favors words freq in class
                            # favors words on short lines on square root
                            score[c] += wordcount[c,word]*weight/den(linenorm*clashcount[word]*clashcount[word]*totwordcount[c])
                }
                numwords++
            }
        }
    }
    close("ronbox")

    # The final message has no following "From " line to trigger its
    # ejection, so classify it explicitly.
    classify_pending()

    print ""

    # Run each line of "sourceme" through the shell.  A missing file makes
    # getline return -1 and the loop is simply skipped.
    while ((getline com < "sourceme") > 0)
        system(com)
    close("sourceme")
}

# Classify the message currently held in store[]/score[] and append it to
# readme.<best class> or readme.unclassified.  Does nothing when no message
# has been started yet or when the pending message is a training message.
#
# The three highest scores are obtained through an external sort
# (should use asort, but the external sort keeps this portable).
function classify_pending(    c, cmd, res, tt, bestscore, bestclass, b2estscore, b2estclass, b3estscore, b3estclass, diff) {
    if (!lastfromline || skiptonext) return

    system("rm -f rmail.temp")
    cmd = "sort -nr | head -3 > rmail.temp"
    for (c in score) print mymin(0+score[c])" "c | cmd
    close(cmd)

    # NOTE: when fewer than three classes scored, the failing getline leaves
    # res untouched, so the lower ranks repeat the last line actually read.
    # With no scores at all every rank is empty and diff evaluates to 0.
    res = ""
    getline res < "rmail.temp"
    split(res,tt," "); bestscore = tt[1]; bestclass = tt[2]
    getline res < "rmail.temp"
    split(res,tt," "); b2estscore = tt[1]; b2estclass = tt[2]
    getline res < "rmail.temp"
    split(res,tt," "); b3estscore = tt[1]; b3estclass = tt[2]
    close("rmail.temp")

    # Relative lead of the best class over the third-ranked one.
    diff = (bestscore - b3estscore)/den(b3estscore)

    if (diff > THRESH) {
        append_message("readme."bestclass)
    } else {
        print "\n\t\tMessage from "lastfromline
        print "\t\t"lastsubjectline
        print "??? "bestclass,bestscore" ??? "b2estclass,b2estscore" ??? "b3estclass,b3estscore"\t"int(100*diff)"%\n"
        append_message("readme.unclassified")
    }
}

# Append the stored message lines to fname.  One element past the last
# stored line is printed as well, which emits an empty line after the
# message (presumably as a separator between messages).
function append_message(fname,    i) {
    if (numlines)
        for (i = 1; i <= numlines+1; i++) print store[i] >> fname
    close(fname)
}

# Safe denominator: x as a number, or 1 when x is zero or non-numeric.
function den(x) {
    if (x+0) return x+0
    return 1
}

# Length of x after collapsing each run of digits to a single "#",
# capped at 10.
function maxlength(x) {
    gsub(/[0-9]+/,"#",x)
    if (length(x) < 10) return length(x)
    return 10
}

# Clamp values below .001 to 0; larger values pass through unchanged.
# (Presumably this keeps tiny scores from being printed in exponent
# notation, which "sort -n" would not order correctly.)
function mymin(x) {
    if (x < .001) return 0
    return x
}