forked from slashdotdash/jekyll-lunr-js-search
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindexer.rb
92 lines (68 loc) · 2.76 KB
/
indexer.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
require 'rubygems'
require 'json'
module Jekyll
  # Generator plugin that builds a lunr.js-compatible JSON search index
  # (search.json) in the site's destination directory at build time.
  class Indexer < Generator
    # Read plugin settings from the site config under 'lunr_search',
    # filling in defaults for any missing key.
    #
    # config - the site configuration Hash (may omit 'lunr_search' entirely).
    def initialize(config = {})
      super(config)

      lunr_config = {
        'excludes' => [],
        'strip_index_html' => false,
        'min_length' => 3,
        'stopwords' => 'stopwords.txt'
      }.merge!(config['lunr_search'] || {}) # tolerate a missing 'lunr_search' section

      # Array of regex source strings; any item whose URL matches one is skipped.
      @excludes = lunr_config['excludes']

      # if web host supports index.html as default doc, then optionally exclude it from the url
      @strip_index_html = lunr_config['strip_index_html']

      # stop word exclusion configuration
      @min_length = lunr_config['min_length']     # drop words shorter than this
      @stopwords_file = lunr_config['stopwords']  # newline-separated stopword list
    end

    # Index all pages except pages matching any value in config['lunr_excludes']
    # or with data['exclude_from_search'] set in their front matter.
    # The main content from each page is extracted and saved to disk as json.
    def generate(site)
      puts 'Running the search indexer...'

      # gather pages and posts
      items = pages_to_index(site)
      content_renderer = PageRenderer.new(site)

      index = []
      items.each do |item|
        entry = SearchEntry.create(item, content_renderer)
        entry.strip_index_suffix_from_url! if @strip_index_html
        # File.exist? — File.exists? was deprecated and removed in Ruby 3.2.
        entry.strip_stopwords!(stopwords, @min_length) if File.exist?(@stopwords_file)

        index << {
          :title => entry.title,
          :url => entry.url,
          :date => entry.date,
          :categories => entry.categories,
          :body => entry.body
        }
        puts "Indexed #{entry.title} (#{entry.url})"
      end

      json = JSON.generate({ :entries => index })

      # Create destination directory if it doesn't exist yet. Otherwise, we cannot write our file there.
      Dir.mkdir(site.dest) unless File.directory?(site.dest)

      # File I/O: create search.json file and write out the JSON
      filename = 'search.json'
      File.open(File.join(site.dest, filename), "w") do |file|
        file.write(json)
      end

      # Keep the search.json file from being cleaned by Jekyll
      site.static_files << Jekyll::SearchIndexFile.new(site, site.dest, "/", filename)
    end

    private

    # Lazily load and memoize the stopwords file, one stripped word per line.
    def stopwords
      @stopwords ||= IO.readlines(@stopwords_file).map(&:strip)
    end

    # Collect the site's pages and posts that are eligible for indexing:
    # only items rendered to .html, not matching any exclude pattern, and
    # not opted out via 'exclude_from_search' front matter.
    def pages_to_index(site)
      items = []

      # duplicate pages/posts so rendering for the index can't mutate the originals
      site.pages.each { |page| items << page.dup }
      site.posts.each { |post| items << post.dup }

      # only process files that will be converted to .html and only non excluded files
      items.select! { |i| i.output_ext == '.html' && @excludes.none? { |s| i.url =~ Regexp.new(s) } }
      items.reject! { |i| i.data['exclude_from_search'] }

      items
    end
  end
end