Skip to content

Commit

Permalink
Refactor LSI class to improve GSL integration and code clarity
Browse files Browse the repository at this point in the history
- Add class-level GSL availability flag
- Update GSL availability check and error handling
- Replace global $GSL variable with class method
- Improve code readability and consistency
  • Loading branch information
cardmagic committed Jul 31, 2024
1 parent bb971a2 commit dc6632c
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 12 deletions.
29 changes: 20 additions & 9 deletions lib/classifier/lsi.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,27 @@
# Copyright:: Copyright (c) 2005 David Fayram II
# License:: LGPL

module Classifier
  class LSI
    # Whether the GSL bindings were loaded. Starts out false and is meant to
    # be set through LSI.gsl_available= by the code that requires the library.
    @gsl_available = false

    class << self
      # Reports whether GSL is available.
      #
      # The flag is always read from Classifier::LSI itself, so subclasses
      # return the same value as the base class. A plain attr_accessor would
      # read the subclass's own, unset class-level instance variable and
      # return nil, which makes `self.class.gsl_available` disagree with
      # `Classifier::LSI.gsl_available` for instances of a subclass.
      #
      # Returns the value last assigned (false until assigned).
      def gsl_available
        LSI.instance_variable_get(:@gsl_available)
      end

      # Sets the GSL availability flag.
      #
      # The value is stored on Classifier::LSI even when called on a
      # subclass, keeping a single source of truth.
      #
      # value - truthy when the GSL-backed implementation should be used.
      def gsl_available=(value)
        LSI.instance_variable_set(:@gsl_available, value)
      end
    end
  end
end

# Choose the vector backend at load time: use the GSL bindings when the gsl gem
# can be found and required, otherwise fall back to classifier/extensions/vector.
# NOTE(review): this listing sets both $GSL and Classifier::LSI.gsl_available and
# warns twice in the rescue branch — it looks like the pre- and post-change lines
# of the diff rendered together; confirm which lines are live.
begin
# to test the native vector class, try `rake test NATIVE_VECTOR=true`
raise LoadError if ENV['NATIVE_VECTOR'] == 'true'
# Raise before attempting `require` when no installed gem named 'gsl' is found.
raise LoadError unless Gem::Specification.find_all_by_name('gsl').any?

require 'gsl' # requires https://github.com/SciRuby/rb-gsl/
require 'classifier/extensions/vector_serialize'
$GSL = true
Classifier::LSI.gsl_available = true
rescue LoadError
# Reached for any LoadError above: forced via NATIVE_VECTOR, gem not found,
# or a failed require.
warn 'Notice: for 10x faster LSI support, please install https://github.com/SciRuby/rb-gsl/'
$GSL = false
warn 'Notice: for 10x faster LSI support in the classifier gem, please install the gsl gem'
Classifier::LSI.gsl_available = false
require 'classifier/extensions/vector'
end

Expand All @@ -29,7 +40,7 @@ class LSI

# Create a fresh index.
# If you want to call #build_index manually, use
# Classifier::LSI.new :auto_rebuild => false
# Classifier::LSI.new auto_rebuild: false
#
def initialize(options = {})
@auto_rebuild = true unless options[:auto_rebuild] == false
Expand Down Expand Up @@ -118,7 +129,7 @@ def build_index(cutoff = 0.75)
doc_list = @items.values
tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }

if $GSL
if self.class.gsl_available
tdm = GSL::Matrix.alloc(*tda).trans
ntdm = build_reduced_matrix(tdm, cutoff)

Expand Down Expand Up @@ -152,7 +163,7 @@ def highest_relative_content(max_chunks = 10)
return [] if needs_rebuild?

avg_density = {}
@items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x, y| x + y[1] } }
@items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |i, j| i + j[1] } }

avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks - 1].map
end
Expand All @@ -175,7 +186,7 @@ def proximity_array_for_content(doc, &block)
content_node = node_for_content(doc, &block)
result =
@items.keys.collect do |item|
val = if $GSL
val = if self.class.gsl_available
content_node.search_vector * @items[item].search_vector.col
else
(Matrix[content_node.search_vector] * @items[item].search_vector)[0]
Expand All @@ -196,7 +207,7 @@ def proximity_norms_for_content(doc, &block)
content_node = node_for_content(doc, &block)
result =
@items.keys.collect do |item|
val = if $GSL
val = if self.class.gsl_available
content_node.search_norm * @items[item].search_norm.col
else
(Matrix[content_node.search_norm] * @items[item].search_norm)[0]
Expand Down Expand Up @@ -314,7 +325,7 @@ def build_reduced_matrix(matrix, cutoff = 0.75)
s[ord] = 0.0 if s[ord] < s_cutoff
end
# Reconstruct the term document matrix, only with reduced rank
u * ($GSL ? GSL::Matrix : ::Matrix).diag(s) * v.trans
u * (self.class.gsl_available ? GSL::Matrix : ::Matrix).diag(s) * v.trans
end

def node_for_content(item, &block)
Expand Down
6 changes: 3 additions & 3 deletions lib/classifier/lsi/content_node.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def search_norm
# Creates the raw vector out of word_hash using word_list as the
# key for mapping the vector space.
def raw_vector_with(word_list)
vec = if $GSL
vec = if Classifier::LSI.gsl_available
GSL::Vector.alloc(word_list.size)
else
Array.new(word_list.size, 0)
Expand All @@ -44,7 +44,7 @@ def raw_vector_with(word_list)
end

# Perform the scaling transform
total_words = $GSL ? vec.sum : vec.sum_with_identity
total_words = Classifier::LSI.gsl_available ? vec.sum : vec.sum_with_identity
total_unique_words = vec.count { |word| word != 0 }

# Perform first-order association transform if this vector has more
Expand All @@ -63,7 +63,7 @@ def raw_vector_with(word_list)
vec = vec.collect { |val| Math.log(val + 1) / -weighted_total }
end

if $GSL
if Classifier::LSI.gsl_available
@raw_norm = vec.normalize
@raw_vector = vec
else
Expand Down

0 comments on commit dc6632c

Please sign in to comment.