#!/usr/bin/ruby -Ke
WDCNT_VERSION = "2000-01-23"

=begin
= wdcnt -- word counter for English/Japanese text file.

== SYNOPSIS
((%wdcnt [-r] [-p|-z] [-e] ((|files|)) ...%))

((%wdcnt [-r] [-p|-z] [-e] < ((|file|))%))

((%wdcnt -v%))

== DESCRIPTION
(('wdcnt')) counts and reports English or Japanese words in files or
standard input.  (('wdcnt')) ignores punctuation, digits, quote signs
and HTML tags.  The output is sorted in the order of the occurrence
frequency and can be plotted directly by (('gnuplot(1)')) as follows.

  gnuplot> set log xy
  gnuplot> plot "< wdcnt file"

== OPTIONS
: -p
   Reports probability instead of number of occurrences.
   Frequencies are normalized so that they sum to 1.0.

: -z
   Reports relative frequency instead of number of occurrences.
   1.0 for the most occurring word.

: -r
   Puts rank to the head of each line.

: -e
   Does not use KAKASI.  This option is NOT useful to Japanese
   documents.

: -v, -h
   Prints usage and version then exit.

== HISTORY
For English document, a traditional one-liner is known:

  % tr -s '\040' '\012' files ... | sort -n | uniq -c | sort -n -r

== SEE ALSO
(('Ruby/KAKASI')) ((<URL:http://www.ruby-lang.org/en/raa.html#Ruby%2FKAKASI>)),
(('ruby(1)')) ((<URL:http://www.ruby-lang.org/>)),
(('kakasi(1)')) ((<URL:http://kakasi.namazu.org/>)),
(('gnuplot(1)')), (('tr(1)')), (('sort(1)')), (('uniq(1)'))

== BUGS
Word separation is not accurate.

== AUTHOR
Gotoken ((<URL:mailto:gotoken@notwork.org>))
=end

# "getopts" is no longer shipped with Ruby; optparse provides ARGV.getopts
# as its drop-in substitute.
require "optparse"

ME = File.basename($0)

# Parse the single-letter flags.  The result is a Hash keyed by the flag
# letter ("e", "p", ...) whose values are true/false; recognised flags are
# removed from ARGV so that only file names remain.
begin
  opts = ARGV.getopts("epzvhr")
rescue OptionParser::ParseError => e
  abort "#{ME}: #{e.message}"
end

# -v / -h ask for the usage text; -p and -z are mutually exclusive, so
# giving both also just prints the usage text.
if opts["v"] || opts["h"] || (opts["p"] && opts["z"])
  puts "#{ME} - version #{WDCNT_VERSION}"
  puts <<EOS
USAGE:
  #{ME} [-r] [-p|-z] [-e] files ...
  #{ME} [-r] [-p|-z] [-e] < file
  #{ME} -v
OPTIONS:
  -p: reports as probability.
  -z: reports relative frequency (1.0 for most word).
  -r: puts rank to the head of each line.
  -e: does not use KAKASI.
  -v: prints this message.
EOS
  exit
end

if opts["e"]
  # Stand-in used when KAKASI is disabled: ignores the option string and
  # returns the text unchanged.
  def kakasi(opt, src)
    src
  end
else
  require "kakasi"
  include Kakasi
end

# Remember the file names for the report header before ARGF reads them.
files = ARGV.dup

# delete html-tag or symbol
txt = kakasi("-oeuc -Ea", $<.read || "")
# The first and third patterns used the obsolete /p regexp option.  They
# contain neither "." nor anchors, so no option is needed.
txt = txt.gsub(/<[^>]*>/, " ")
# Drop a dot-prefixed token at the start of a line together with the
# whitespace after it (presumably roff-style requests -- TODO confirm).
txt = txt.gsub(/^\.\S+\s+/, "")
txt = txt.gsub(/[\/<>\s\n\r\d%\(\),:;~"`]+/, " ")
# NOTE(review): the (?!\w) lookahead is always satisfied because the next
# character must be one of - ' . ; a lookbehind may have been intended.
# Left as is to keep the matching behaviour.
txt = txt.gsub(/(?!\w)[-'\.]\W/, " ")

words = kakasi("-ieuc -w", txt).split
if words.empty?
  STDERR.puts "#{ME}: nodata??"
  exit
end

# counting words
counts = Hash.new(0)
words.each { |w| counts[w] += 1 }
total = words.size

# [count, word] pairs, highest count first; ties are ordered by word,
# descending, because the pairs are compared as arrays.
wc = counts.map { |word, count| [count, word] }.sort.reverse
top = wc.first.first.to_f

# report
puts "# creator: #{ME} - version #{WDCNT_VERSION}"
puts "# date : #{Time.now}"
puts "# source : #{files.join(',')}"
puts "# comment: KAKASI was not used." if opts["e"]
puts "# summary: #{wc.size} different words in #{total} words."

wc.each_with_index do |(count, word), i|
  # With -r every line is prefixed by its 1-based rank.
  prefix, lead = opts["r"] ? ["%5d ", [i + 1]] : ["", []]
  if opts["p"]
    puts "#{prefix}%.5f # %s" % (lead + [count / total.to_f, word])
  elsif opts["z"]
    puts "#{prefix}%.5f # %s" % (lead + [count / top, word])
  else
    puts "#{prefix}%4d # %s" % (lead + [count, word])
  end
end