-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathwdcnt
executable file
·141 lines (109 loc) · 2.96 KB
/
wdcnt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#! /usr/bin/env ruby -Ke
# Release identifier (a date string) printed in the usage banner and in the
# "# creator:" line of the report header.
WDCNT_VERSION = "2000-01-23"
=begin
= wdcnt -- word counter for English/Japanese text file.
== SYNOPSIS
((%wdcnt [-r] [-p|-z] [-e] ((|files|)) ...%))
((%wdcnt [-r] [-p|-z] [-e] < ((|file|))%))
((%wdcnt -v%))
== DESCRIPTION
(('wdcnt')) counts and reports English or Japanese words in files or
standard input. (('wdcnt')) ignores punctuation, digits, quote signs
or HTML tags. The output is sorted in the order of the occurrence
frequency and can be plotted directly by (('gnuplot(1)')) as follows.
gnuplot> set log xy
gnuplot> plot "< wdcnt file"
== OPTIONS
: -p
Reports probability instead of number of occurrences.
Each count is divided by the total number of words, so the values sum to 1.0.
: -z
Reports relative frequency instead of number of occurrences.
1.0 for the most occurring word.
: -r
Puts rank to the head of each line.
: -e
Does not use KAKASI. This option is NOT useful for Japanese documents.
: -v, -h
Prints usage and version then exit.
== HISTORY
For English document, a traditional one-liner is known:
% tr -s '\040' '\012' files ... | sort -n | uniq -c | sort -n -r
== SEE ALSO
(('Ruby/KAKASI')) ((<URL:http://www.ruby-lang.org/en/raa.html#Ruby%2FKAKASI>)),
(('ruby(1)')) ((<URL:http://www.ruby-lang.org/>)),
(('kakasi(1)')) ((<URL:http://kakasi.namazu.org/>)),
(('gnuplot(1)')), (('tr(1)')), (('sort(1)')), (('uniq(1)'))
== BUGS
Word separation is not accurate.
== AUTHOR
Gotoken ((<URL:mailto:[email protected]>))
=end
# The old "getopts" library is no longer shipped with Ruby, so the switches are
# parsed with the standard OptionParser instead.  The results are still
# published through the $OPT_x globals because the rest of the script reads them.
require "optparse"

# Name this script was invoked as; used as a prefix in messages and the report.
ME = File.basename($0)

# Parses the leading single-letter switches (-e -p -z -v -h -r) out of +argv+.
#
# Recognised switches are removed from +argv+; parsing stops at the first
# non-option argument so the remaining entries (file names) are left in place.
# Switches may be clustered (e.g. "-rz").
#
# Returns a Hash mapping each switch letter that was given to +true+.
# On an unrecognised switch the error is written to STDERR and "h" is set in
# the result so that the caller prints the usage text.
def wdcnt_parse_flags(argv)
  flags = {}
  parser = OptionParser.new
  %w[e p z v h r].each do |name|
    # Explicit definitions take precedence over OptionParser's built-in
    # help/version handling for -h and -v.
    parser.on("-#{name}") { flags[name] = true }
  end
  begin
    parser.order!(argv)
  rescue OptionParser::ParseError => err
    STDERR.puts "#{ME}: #{err.message}"
    flags["h"] = true
  end
  flags
end

wdcnt_flags = wdcnt_parse_flags(ARGV)
$OPT_e = wdcnt_flags["e"]
$OPT_p = wdcnt_flags["p"]
$OPT_z = wdcnt_flags["z"]
$OPT_v = wdcnt_flags["v"]
$OPT_h = wdcnt_flags["h"]
$OPT_r = wdcnt_flags["r"]

# -p and -z are mutually exclusive; asking for both is answered with the usage
# text, exactly like -v / -h.
if $OPT_v || $OPT_h || ($OPT_p && $OPT_z)
  puts "#{ME} - version #{WDCNT_VERSION}"
  puts <<EOS
USAGE:
#{ME} [-r] [-p|-z] [-e] files ...
#{ME} [-r] [-p|-z] [-e] < file
#{ME} -v
OPTIONS:
-p: reports as probability.
-z: reports relative frequency (1.0 for most word).
-r: puts rank to the head of each line.
-e: does not use KAKASI.
-v, -h: prints this message.
EOS
  exit
end
# Choose the implementation behind the kakasi(opt, src) calls made later in
# the script.
if $OPT_e
  # -e was given: install a stand-in that ignores the option string and hands
  # the text back unchanged, so the KAKASI extension is never loaded.
  def kakasi(opt, src)
    src
  end
else
  # Load the Ruby/KAKASI extension and mix its module into the top level
  # (presumably this is what provides kakasi(opt, src) -- confirm against the
  # extension's documentation).
  require "kakasi"
  include Kakasi
end
# Keep a copy of the file names for the report header; ARGV itself is consumed
# as $< (ARGF) reads through the files.
files = ARGV.dup

# Read all input (the named files, or standard input when none are given) and
# strip everything that should not be counted as a word.
#
# NOTE: the regexps below originally carried the obsolete /p option, which
# current Ruby rejects as an unknown regexp option.  /m is used instead; these
# patterns contain no ".", so it does not change what they match.
#
# The kakasi option strings are passed through untouched; presumably the first
# call normalises EUC symbols to ASCII and the second (-w) inserts word
# separators -- see kakasi(1) to confirm.
txt = kakasi("-oeuc -Ea", $<.read || "")
# HTML tags become a single space.
txt.gsub!(/<[^>]*>/m, " ")
# Drop a leading ".xxx " token at the start of a line.
txt.gsub!(/^\.\S+\s+/, "")
# Runs of whitespace, digits and the listed punctuation collapse to one space.
txt.gsub!(/[\/<>\s\n\r\d%\(\),:;~"`]+/m, " ") # "
# A hyphen, apostrophe or period followed by a non-word character becomes a
# space (so trailing punctuation is removed but e.g. "don't" is kept).
txt.gsub!(/(?!\w)[-'\.]\W/, " ") # '
# Sorted so that equal words are adjacent for the counting pass that follows.
# Non-destructive sort: the split result is a throwaway temporary.
words = kakasi("-ieuc -w", txt).split.sort
# counting words
#
# `words` is sorted, so each distinct word forms one contiguous run.  Walk the
# list once, recording [run length, word] in `wc` and the running total in `n`.
first_word = words.shift
unless first_word
  # Nothing left after normalisation: report and stop.
  STDERR.puts "#{ME}: nodata??"
  exit
end

wc = []
n = 0.0 # kept as a Float so later divisions by n are floating-point
run_word = first_word
run_len = 1
words.each do |word|
  if word == run_word
    run_len += 1
    next
  end
  # The run ended: store it and start a new one with the current word.
  wc.push [run_len, run_word]
  n += run_len
  run_word = word
  run_len = 1
end
# Flush the final run, which the loop never closes.
wc.push [run_len, run_word]
n += run_len
# report
#
# Header: comment lines (prefixed with "#") describing how the data was made.
header = [
  "# creator: #{ME} - version #{WDCNT_VERSION}",
  "# date : #{Time.now}",
  "# source : #{files.join(',')}"
]
header.push "# comment: KAKASI was not used." if $OPT_e
header.push "# summary: #{wc.size} different words in #{n.to_i} words."
puts header

# Body: one line per distinct word, most frequent first.  Entries are
# [count, word] pairs, so sorting orders by count and then by word.
wc.sort!
wc.reverse!
wc.each_with_index do |pair, idx|
  count, word = pair
  # With -r every line starts with its 1-based rank.
  rank_fmt, rank_args = $OPT_r ? ["%5d ", [idx + 1]] : ["", []]
  line =
    if $OPT_p
      # Probability: share of the total word count.
      "#{rank_fmt}%.5f # %s" % (rank_args + [count / n, word])
    elsif $OPT_z
      # Relative frequency: scaled by the count of the top-ranked word.
      "#{rank_fmt}%.5f # %s" % (rank_args + [count / wc[0][0].to_f, word])
    else
      # Raw number of occurrences.
      "#{rank_fmt}%4d # %s" % (rank_args + pair)
    end
  puts line
end