forked from github-linguist/linguist
-
Notifications
You must be signed in to change notification settings - Fork 0
/
classifier.rb
203 lines (179 loc) · 6.59 KB
/
classifier.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
require 'linguist/tokenizer'
module Linguist
  # Language bayesian classifier.
  #
  # All scores are natural-log probabilities: per-token and per-language terms
  # are added together, and a higher (less negative) score is more probable.
  #
  # The classifier database ("db") is a plain Hash maintained by .train!:
  #
  #   db['tokens_total']    - Integer count of every token seen.
  #   db['languages_total'] - Integer count of .train! calls.
  #   db['tokens']          - Hash of language => { token => count }.
  #   db['language_tokens'] - Hash of language => total token count.
  #   db['languages']       - Hash of language => number of .train! calls.
  #
  # Set LINGUIST_DEBUG=1 to print the per-language score breakdown while
  # classifying, or LINGUIST_DEBUG=2 to additionally print the per-token
  # table. See also #debug_dump_all_tokens, below.
  class Classifier
    # Only the leading slice data[0...CLASSIFIER_CONSIDER_BYTES] of a blob
    # is handed to the classifier.
    CLASSIFIER_CONSIDER_BYTES = 50 * 1024

    # Public: Use the classifier to detect language of the blob.
    #
    # blob               - An object that quacks like a blob (responds to #data).
    # possible_languages - Array of Language objects
    #
    # Examples
    #
    #   Classifier.call(FileBlob.new("path/to/file"), [
    #     Language["Ruby"], Language["Python"]
    #   ])
    #
    # Returns an Array of Language objects, most probable first. Returns an
    # empty Array when the blob has no data.
    def self.call(blob, possible_languages)
      language_names = possible_languages.map(&:name)

      # #classify already treats nil tokens as "nothing to classify", so only
      # slice when there is something to slice.
      data = blob.data
      data &&= data[0...CLASSIFIER_CONSIDER_BYTES]

      classify(Samples.cache, data, language_names).map do |name, _|
        Language[name] # Return the actual Language objects
      end
    end

    # Public: Train classifier that data is a certain language.
    #
    # db       - Hash classifier database object (mutated in place)
    # language - String language of data
    # data     - String contents of file, or an Array of tokens
    #
    # Examples
    #
    #   Classifier.train!(db, 'Ruby', "def hello; end")
    #
    # Returns nothing.
    def self.train!(db, language, data)
      tokens = data
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)

      counts = Hash.new(0)
      tokens.each { |tok| counts[tok] += 1 }

      db['tokens_total'] ||= 0
      db['languages_total'] ||= 0
      db['tokens'] ||= {}
      db['language_tokens'] ||= {}
      db['languages'] ||= {}

      # The per-language entries are created lazily inside the loop, so a
      # sample without tokens adds no 'tokens'/'language_tokens' entry.
      counts.each do |token, count|
        db['tokens'][language] ||= {}
        db['tokens'][language][token] ||= 0
        db['tokens'][language][token] += count
        db['language_tokens'][language] ||= 0
        db['language_tokens'][language] += count
        db['tokens_total'] += count
      end

      db['languages'][language] ||= 0
      db['languages'][language] += 1
      db['languages_total'] += 1

      nil
    end

    # Public: Guess language of data.
    #
    # db        - Hash of classifier tokens database.
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to
    #             (default: every language present in db['languages']).
    #
    # Examples
    #
    #   Classifier.classify(db, "def hello; end")
    #   # => [['Ruby', -12.3], ['Python', -15.8], ...]
    #
    # Returns sorted Array of result pairs. Each pair contains the
    # String language name and a Float log-probability score.
    def self.classify(db, tokens, languages = nil)
      languages ||= db['languages'].keys
      new(db).classify(tokens, languages)
    end

    # Internal: Initialize a Classifier.
    #
    # db - Hash classifier database as built by .train!. The constructor
    #      accepts an empty Hash, but #classify needs the 'tokens' and
    #      'languages' tables to be present.
    def initialize(db = {})
      @tokens_total    = db['tokens_total']
      @languages_total = db['languages_total']
      @tokens          = db['tokens']
      @language_tokens = db['language_tokens']
      @languages       = db['languages']
      # Log-probability assigned to any token a language has never seen.
      @unknown_logprob = Math.log(1 / db['tokens_total'].to_f)
    end

    # Internal: Guess language of data
    #
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Returns sorted Array of result pairs, highest score first. Each pair
    # contains the String language name and a Float log-probability score.
    # Returns an empty Array when tokens is nil or languages is empty.
    def classify(tokens, languages)
      return [] if tokens.nil? || languages.empty?

      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
      debug_dump_all_tokens(tokens, languages) if verbosity >= 2

      counts = Hash.new(0)
      tokens.each { |tok| counts[tok] += 1 }

      scores = {}
      languages.each do |language|
        scores[language] = tokens_probability(counts, language) + language_probability(language)
        debug_dump_probabilities(counts, language, scores[language]) if verbosity >= 1
      end

      # Sorting a Hash already yields [name, score] pairs.
      scores.sort { |a, b| b[1] <=> a[1] }
    end

    # Internal: Log-probability of a set of tokens occurring in a
    # language - log P(D | C)
    #
    # counts   - Hash of String token => Integer occurrences in the data.
    # language - Language to check.
    #
    # Returns the summed log-probability (Integer 0 when counts is empty).
    def tokens_probability(counts, language)
      counts.inject(0) do |sum, (token, count)|
        sum + count * token_probability(token, language)
      end
    end

    # Internal: Log-probability of token in language occurring - log P(F | C)
    #
    # token    - String token.
    # language - Language to check.
    #
    # Returns Float. Tokens the language has never seen, and languages with
    # no token table at all, get the shared "unknown token" log-probability.
    def token_probability(token, language)
      language_counts = @tokens[language]
      count = language_counts && language_counts[token]
      if count.nil? || count == 0
        # This is usually the most common case, so we cache the result.
        @unknown_logprob
      else
        Math.log(count.to_f / @language_tokens[language].to_f)
      end
    end

    # Internal: Log-probability of a language occurring - log P(C)
    #
    # language - Language to check.
    #
    # Returns Float. A language absent from the database counts as zero
    # samples, which yields -Infinity when the database has any samples.
    def language_probability(language)
      Math.log(@languages[language].to_f / @languages_total.to_f)
    end

    private

    # Internal: Debug level read once from the LINGUIST_DEBUG environment
    # variable (0 when unset).
    def verbosity
      @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
    end

    # Internal: Print one line with the token term, the language term and
    # the total score for a language.
    def debug_dump_probabilities(counts, language, score)
      printf("%10s = %10.3f + %7.3f = %10.3f\n",
             language, tokens_probability(counts, language), language_probability(language), score)
    end

    # Internal: show a table of probabilities for each <token,language> pair.
    #
    # The number in each table entry is the number of "points" that each
    # token contributes toward the belief that the file under test is a
    # particular language. Points are additive.
    #
    # Points are the number of times a token appears in the file, times
    # how much more likely (log of probability ratio) that token is to
    # appear in one language vs. the least-likely language. Dashes
    # indicate the least-likely language (and zero points) for each token.
    # Tokens that score identically in every language are omitted.
    def debug_dump_all_tokens(tokens, languages)
      maxlen = tokens.map { |tok| tok.size }.max

      # Header: pad to the width of the token column plus the %5d count
      # column so the language names line up with the 10-wide cells below.
      printf "%#{maxlen}s", ""
      puts sprintf("%5s", "#") + languages.map { |lang| sprintf("%10s", lang) }.join

      token_map = Hash.new(0)
      tokens.each { |tok| token_map[tok] += 1 }

      token_map.sort.each do |tok, count|
        arr = languages.map { |lang| [lang, token_probability(tok, lang)] }
        # A token that is equally likely everywhere carries no information.
        next if arr.all? { |_, logprob| logprob == arr[0][1] }

        min = arr.map { |_, logprob| logprob }.min
        printf "%#{maxlen}s%5d", tok, count
        puts arr.map { |_, logprob|
          logprob == min ? sprintf("%10s", "-") : sprintf("%10.3f", count * (logprob - min))
        }.join
      end
    end
  end
end