From 5f94a4874d06b19c40c3be4cdc2463b029af20ec Mon Sep 17 00:00:00 2001 From: cdxker Date: Tue, 10 Dec 2024 17:40:42 -0800 Subject: [PATCH 1/3] Create PagefindIndex for rust lib to be more sdk like --- pagefind/src/lib.rs | 1 + pagefind/src/service/api.rs | 177 ++++++++++++++++++++++++++++++ pagefind/src/service/mod.rs | 115 +++++++------------ pagefind/src/service/responses.rs | 13 ++- 4 files changed, 227 insertions(+), 79 deletions(-) create mode 100644 pagefind/src/service/api.rs diff --git a/pagefind/src/lib.rs b/pagefind/src/lib.rs index 370a96cf..cd60dfb4 100644 --- a/pagefind/src/lib.rs +++ b/pagefind/src/lib.rs @@ -6,6 +6,7 @@ use hashbrown::HashMap; use index::PagefindIndexes; pub use options::{PagefindInboundConfig, SearchOptions}; use output::SyntheticFile; +pub use service::api; use wax::{Glob, WalkEntry}; use crate::index::build_indexes; diff --git a/pagefind/src/service/api.rs b/pagefind/src/service/api.rs new file mode 100644 index 00000000..fb44bd5a --- /dev/null +++ b/pagefind/src/service/api.rs @@ -0,0 +1,177 @@ +use hashbrown::HashMap; +use std::path::PathBuf; + +use crate::{ + fossick::{parser::DomParserResult, Fossicker}, + PagefindInboundConfig, SearchOptions, SearchState, +}; +use base64::{engine::general_purpose, Engine as _}; + +use super::{IndexedFileResponse, SyntheticFileResponse}; + +pub struct PagefindIndex { + search_index: SearchState, +} + +impl PagefindIndex { + /// Create a new PagefindIndex instance. + /// + /// # Arguments + /// * `config` - An optional PagefindServiceConfig to apply to the service. + /// + /// # Returns + /// An optional PagefindIndex instance. If the search options are invalid, it + /// will return None. + pub fn new(config: PagefindInboundConfig) -> Option { + match SearchOptions::load(config) { + Ok(opts) => Some(Self { + search_index: SearchState::new(opts), + }), + Err(_) => None, + } + } + + /// Add a file into this search index. + /// Either a filepath or a URL must be provided. + /// + /// # Arguments + /// * `file_path` - The path to the file to add. + /// * `url` - The URL to the file to add. + /// * `file_contents` - The contents of the file to add. + /// + /// # Returns + /// Either the PageFragmentData of the file added or an error message, if it fails to add the + /// file. + pub async fn add_file( + &mut self, + file_path: Option, + url: Option, + file_contents: String, + ) -> Result { + if file_path.is_none() && url.is_none() { + return Err("Either file_path or url must be provided".into()); + } + + let file = Fossicker::new_synthetic(file_path.map(PathBuf::from), url, file_contents); + let data = self.search_index.fossick_one(file).await; + + match data { + Ok(data) => Ok(IndexedFileResponse { + page_word_count: data.fragment.data.word_count as u32, + page_url: data.fragment.data.url, + page_meta: data.fragment.data.meta + }), + Err(_) => Err("Failed to add file".to_string()), + } + } + + /// Add a record to the search index. + /// This is a more manual way to add a record to the search index, allowing for more control + /// over the data. This is useful for adding records that are not files. + /// + /// # Arguments + /// * `url` - The URL of the record. + /// * `content` - The content of the record. + /// * `language` - The language of the record. + /// * `meta` - Optional metadata to add to the record. + /// * `filters` - Optional filters to apply to the record. + /// * `sort` - Optional sorting to apply to the record. 
+ pub async fn add_record( + &mut self, + url: String, + content: String, + language: String, + meta: Option>, + filters: Option>>, + sort: Option>, + ) -> Result { + let data = DomParserResult { + digest: content, + filters: filters.unwrap_or_default(), + sort: sort.unwrap_or_default(), + meta: meta.unwrap_or_default(), + anchor_content: HashMap::new(), + has_custom_body: false, + force_inclusion: true, + has_html_element: true, + has_old_bundle_reference: false, + language: self + .search_index + .options + .force_language + .clone() + .unwrap_or(language), + }; + let file = Fossicker::new_with_data(url, data); + let data = self.search_index.fossick_one(file).await; + + match data { + Ok(data) => Ok(IndexedFileResponse { + page_word_count: data.fragment.data.word_count as u32, + page_url: data.fragment.data.url, + page_meta: data.fragment.data.meta + }), + Err(_) => Err("Failed to add file".to_string()), + } + } + + /// Add a directory to the search index with a glob pattern. + /// + /// # Arguments + /// * `path` - The path to the directory to index. + /// * `glob` - A glob pattern to match files in the directory. If not provided, the default glob pattern will be used. + /// + /// # Returns + /// Either the number of pages indexed or an error message, if it fails to index the directory. + pub async fn add_dir(&mut self, path: String, glob: Option) -> Result { + let defaults: PagefindInboundConfig = + serde_json::from_str("{}").expect("All fields have serde defaults"); + let glob = glob.unwrap_or(defaults.glob); + + let data = self + .search_index + .fossick_many(PathBuf::from(path), glob) + .await; + match data { + Ok(page_count) => Ok(page_count), + Err(_) => Err("Failed to index directory".to_string()), + } + } + + /// Build the search index for this instance and hold it in memory. + pub async fn build_indexes(&mut self) { + self.search_index.build_indexes().await; + } + + /// Build the search index for this instance and write the files to disk. + /// + /// # Arguments + /// * `output_path` - The path to write the files to. If not provided, the default output path will be used. + pub async fn write_files(&mut self, output_path: Option) -> String { + self.search_index.build_indexes().await; + let resolved_output_path = self + .search_index + .write_files(output_path.map(Into::into)) + .await; + + resolved_output_path.to_string_lossy().into() + } + + /// Build the search index for this instance and return the files as a list of + /// SyntheticFileResponse. + /// + /// # Returns + /// A list of SyntheticFileResponse containing the path and content of each file. 
+ pub async fn get_files(&mut self) -> Vec { + self.search_index.build_indexes().await; + self.search_index + .get_files() + .await + .into_iter() + .map(|file| SyntheticFileResponse { + path: file.filename.to_string_lossy().into(), + content: general_purpose::STANDARD.encode(file.contents), + }) + .collect() + } +} diff --git a/pagefind/src/service/mod.rs b/pagefind/src/service/mod.rs index 56e9b23a..06eb830e 100644 --- a/pagefind/src/service/mod.rs +++ b/pagefind/src/service/mod.rs @@ -1,21 +1,17 @@ -use std::{ - io::{BufRead, Write}, - path::PathBuf, -}; +use std::io::{BufRead, Write}; +pub use api::PagefindIndex; use base64::{engine::general_purpose, Engine as _}; -use hashbrown::HashMap; use rust_patch::Patch; use tokio::sync::mpsc; -use crate::{ - fossick::{parser::DomParserResult, Fossicker}, - PagefindInboundConfig, SearchOptions, SearchState, -}; +pub mod api; use requests::*; use responses::*; +use crate::PagefindInboundConfig; + mod requests; mod responses; @@ -37,19 +33,18 @@ pub async fn run_service() { std::process::exit(0); } - let Ok(decoded) = general_purpose::STANDARD - .decode(buf) else { - parse_error_outgoing_tx - .send(ServiceResponse { - message_id: None, - payload: ResponseAction::Error { - original_message: None, - message: "Unparseable message, not valid base64".into() - }, - }) - .expect("Channel is open"); - return; - }; + let Ok(decoded) = general_purpose::STANDARD.decode(buf) else { + parse_error_outgoing_tx + .send(ServiceResponse { + message_id: None, + payload: ResponseAction::Error { + original_message: None, + message: "Unparseable message, not valid base64".into(), + }, + }) + .expect("Channel is open"); + return; + }; match serde_json::from_slice::(&decoded) { Ok(msg) => { @@ -118,10 +113,10 @@ pub async fn run_service() { }; fn get_index<'a>( - indexes: &'a mut Vec>, + indexes: &'a mut Vec>, index_id: u32, err: impl FnOnce(&str), - ) -> Option<&'a mut SearchState> { + ) -> Option<&'a mut api::PagefindIndex> { match indexes.get_mut(index_id as usize) { Some(Some(index)) => Some(index), Some(None) => { @@ -138,22 +133,22 @@ pub async fn run_service() { match msg.payload { RequestAction::NewIndex { config } => { let index_id = indexes.len(); - let mut service_options: PagefindInboundConfig = serde_json::from_str("{}").expect("All fields have serde defaults"); + service_options.service = true; if let Some(config) = config { service_options = config.apply(service_options); } - match SearchOptions::load(service_options) { - Ok(opts) => { - indexes.insert(index_id, Some(SearchState::new(opts))); + match PagefindIndex::new(service_options) { + Some(index) => { + indexes.insert(index_id, Some(index)); send(ResponseAction::NewIndex { index_id: index_id as u32, }); } - Err(_) => { + None => { err("Invalid config supplied"); } } @@ -165,22 +160,14 @@ pub async fn run_service() { file_contents, } => { if let Some(index) = get_index(&mut indexes, index_id, err) { - if file_path.is_none() && url.is_none() { - return err( - "Either a source path to the file, or an explicit URL must be provided", - ); - } - - let file = - Fossicker::new_synthetic(file_path.map(PathBuf::from), url, file_contents); - let data = index.fossick_one(file).await; - match data { + let page_fragment = index.add_file(file_path, url, file_contents).await; + match page_fragment { Ok(data) => send(ResponseAction::IndexedFile { - page_word_count: data.fragment.data.word_count as u32, - page_url: data.fragment.data.url.clone(), - page_meta: data.fragment.data.meta.clone(), + page_word_count: 
data.page_word_count, + page_url: data.page_url.clone(), + page_meta: data.page_meta.clone(), }), - Err(_) => err("Failed to add file"), + Err(message) => err(&message), } } } @@ -194,25 +181,14 @@ pub async fn run_service() { sort, } => { if let Some(index) = get_index(&mut indexes, index_id, err) { - let data = DomParserResult { - digest: content, - filters: filters.unwrap_or_default(), - sort: sort.unwrap_or_default(), - meta: meta.unwrap_or_default(), - anchor_content: HashMap::new(), - has_custom_body: false, - force_inclusion: true, - has_html_element: true, - has_old_bundle_reference: false, - language: index.options.force_language.clone().unwrap_or(language), - }; - let file = Fossicker::new_with_data(url, data); - let data = index.fossick_one(file).await; + let data = index + .add_record(url, content, language, meta, filters, sort) + .await; match data { Ok(data) => send(ResponseAction::IndexedFile { - page_word_count: data.fragment.data.word_count as u32, - page_url: data.fragment.data.url.clone(), - page_meta: data.fragment.data.meta.clone(), + page_word_count: data.page_word_count, + page_url: data.page_url.clone(), + page_meta: data.page_meta.clone(), }), Err(_) => err("Failed to add file"), } @@ -224,12 +200,7 @@ pub async fn run_service() { glob, } => { if let Some(index) = get_index(&mut indexes, index_id, err) { - let defaults: PagefindInboundConfig = - serde_json::from_str("{}").expect("All fields have serde defaults"); - let glob = glob.unwrap_or_else(|| defaults.glob); - - let data = index.fossick_many(PathBuf::from(path), glob).await; - match data { + match index.add_dir(path, glob).await { Ok(page_count) => send(ResponseAction::IndexedDir { page_count: page_count as u32, }), @@ -251,7 +222,7 @@ pub async fn run_service() { index.build_indexes().await; let resolved_output_path = index.write_files(output_path.map(Into::into)).await; send(ResponseAction::WriteFiles { - output_path: resolved_output_path.to_string_lossy().into(), + output_path: resolved_output_path, }); } } @@ -259,15 +230,7 @@ pub async fn run_service() { if let Some(index) = get_index(&mut indexes, index_id, err) { index.build_indexes().await; let files = index.get_files().await; - send(ResponseAction::GetFiles { - files: files - .into_iter() - .map(|file| SyntheticFileResponse { - path: file.filename.to_string_lossy().into(), - content: general_purpose::STANDARD.encode(file.contents), - }) - .collect(), - }); + send(ResponseAction::GetFiles { files }); } } RequestAction::DeleteIndex { index_id } => match indexes.get_mut(index_id as usize) { diff --git a/pagefind/src/service/responses.rs b/pagefind/src/service/responses.rs index 14e0875a..4ba2cf4f 100644 --- a/pagefind/src/service/responses.rs +++ b/pagefind/src/service/responses.rs @@ -36,7 +36,14 @@ pub(super) enum ResponseAction { } #[derive(Debug, Deserialize, Serialize)] -pub(super) struct SyntheticFileResponse { - pub(super) path: String, - pub(super) content: String, +pub struct IndexedFileResponse { + pub page_word_count: u32, + pub page_url: String, + pub page_meta: HashMap, +} + +#[derive(Debug, Deserialize, Serialize)] +pub struct SyntheticFileResponse { + pub path: String, + pub content: String, } From 1fdbac3dccad627aa0d041f207b3b2d4b001cc38 Mon Sep 17 00:00:00 2001 From: Liam Bigelow <40188355+bglw@users.noreply.github.com> Date: Mon, 16 Dec 2024 12:15:09 +1300 Subject: [PATCH 2/3] Refine Pagefind library interface for long-term maintenance --- Cargo.lock | 33 +++- pagefind/Cargo.toml | 6 +- pagefind/src/fossick/mod.rs | 15 +- 
pagefind/src/index/mod.rs | 12 +- pagefind/src/lib.rs | 57 ++++--- pagefind/src/logging.rs | 2 + pagefind/src/main.rs | 152 +------------------ pagefind/src/options.rs | 107 +++++++------ pagefind/src/runner.rs | 164 ++++++++++++++++++++ pagefind/src/service/api.rs | 242 ++++++++++++++++++++---------- pagefind/src/service/mod.rs | 70 +++++---- pagefind/src/service/responses.rs | 7 - 12 files changed, 514 insertions(+), 353 deletions(-) create mode 100644 pagefind/src/runner.rs diff --git a/Cargo.lock b/Cargo.lock index f790390f..5514db7b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -769,18 +769,18 @@ dependencies = [ [[package]] name = "derive_builder" -version = "0.20.0" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" dependencies = [ "derive_builder_macro", ] [[package]] name = "derive_builder_core" -version = "0.20.0" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" dependencies = [ "darling", "proc-macro2", @@ -790,9 +790,9 @@ dependencies = [ [[package]] name = "derive_builder_macro" -version = "0.20.0" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", "syn 2.0.55", @@ -2036,6 +2036,7 @@ dependencies = [ "sha-1", "tokio", "twelf", + "typed-builder", "unicode-segmentation", "wax", ] @@ -2945,6 +2946,26 @@ dependencies = [ "toml", ] +[[package]] +name = "typed-builder" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e14ed59dc8b7b26cacb2a92bad2e8b1f098806063898ab42a3bd121d7d45e75" +dependencies = [ + "typed-builder-macro", +] + +[[package]] +name = "typed-builder-macro" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "560b82d656506509d43abe30e0ba64c56b1953ab3d4fe7ba5902747a7a3cedd5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.55", +] + [[package]] name = "typenum" version = "1.17.0" diff --git a/pagefind/Cargo.toml b/pagefind/Cargo.toml index f638fa9f..49e98c74 100644 --- a/pagefind/Cargo.toml +++ b/pagefind/Cargo.toml @@ -49,7 +49,10 @@ pagefind_stem = { version = "0.2.0", features = [ "yiddish", ] } convert_case = "0.6.0" -charabia = { version = "0.8.8", optional = true, default-features = false, features = ["chinese", "japanese"] } +charabia = { version = "0.8.8", optional = true, default-features = false, features = [ + "chinese", + "japanese", +] } unicode-segmentation = "1.10.1" emojis = "0.6.1" hashbrown = { version = "0.13.1", features = ["serde"] } @@ -81,6 +84,7 @@ actix-files = "0.6" lexical-core = "0.8.5" path-slash = "0.2" rust-patch = "0.1.3" +typed-builder = "0.20.0" [features] diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs index 0c346a0d..c23456d3 100644 --- a/pagefind/src/fossick/mod.rs +++ b/pagefind/src/fossick/mod.rs @@ -1,3 +1,4 @@ +use anyhow::{bail, Result}; use async_compression::tokio::bufread::GzipDecoder; #[cfg(feature = "extended")] use charabia::Segment; @@ -64,16 +65,6 @@ pub struct Fossicker { } impl Fossicker { 
- pub fn new(file_path: PathBuf) -> Self { - Self { - file_path: Some(file_path), - root_path: None, - page_url: None, - synthetic_content: None, - data: None, - } - } - pub fn new_relative_to(file_path: PathBuf, root_path: PathBuf) -> Self { Self { file_path: Some(file_path), @@ -459,7 +450,7 @@ impl Fossicker { } } - pub async fn fossick(mut self, options: &SearchOptions) -> Result { + pub async fn fossick(mut self, options: &SearchOptions) -> Result { if (self.file_path.is_some() || self.synthetic_content.is_some()) && self.data.is_none() { self.fossick_html(options).await; }; @@ -480,7 +471,7 @@ impl Fossicker { options .logger .error("Tried to index file with no specified URL or file path, ignoring."); - return Err(()); + bail!("Tried to index file with no specified URL or file path, ignoring."); }; Ok(FossickedData { diff --git a/pagefind/src/index/mod.rs b/pagefind/src/index/mod.rs index 9676dbb5..a301c7df 100644 --- a/pagefind/src/index/mod.rs +++ b/pagefind/src/index/mod.rs @@ -6,6 +6,7 @@ use crate::{ utils::full_hash, SearchOptions, }; +use anyhow::{bail, Result}; use index_filter::{FilterIndex, PackedValue}; use index_metadata::{MetaChunk, MetaIndex, MetaPage}; use index_words::{PackedPage, PackedWord, WordIndex}; @@ -44,7 +45,7 @@ pub async fn build_indexes( mut pages: Vec, language: String, options: &SearchOptions, -) -> PagefindIndexes { +) -> Result { let mut meta = MetaIndex { version: options.version.into(), pages: Vec::new(), @@ -265,7 +266,10 @@ pub async fn build_indexes( language, u32::MAX )); - std::process::exit(1); + bail!( + "Language {language} has too many documents to index, must be < {}", + u32::MAX + ); } // TODO: Parameterize these chunk sizes via options @@ -306,7 +310,7 @@ pub async fn build_indexes( &full_hash(&meta_index)[0..=(language.len() + 7)] ); - PagefindIndexes { + Ok(PagefindIndexes { word_indexes, filter_indexes, sorts, @@ -317,7 +321,7 @@ pub async fn build_indexes( .collect(), language, word_count, - } + }) } fn chunk_index(word_map: HashMap, chunk_size: usize) -> Vec> { diff --git a/pagefind/src/lib.rs b/pagefind/src/lib.rs index cd60dfb4..25af98d6 100644 --- a/pagefind/src/lib.rs +++ b/pagefind/src/lib.rs @@ -1,10 +1,11 @@ use std::{cmp::Ordering, path::PathBuf}; -pub use fossick::{FossickedData, Fossicker}; +use anyhow::{bail, Result}; +use fossick::{FossickedData, Fossicker}; use futures::future::join_all; use hashbrown::HashMap; use index::PagefindIndexes; -pub use options::{PagefindInboundConfig, SearchOptions}; +use options::{PagefindInboundConfig, SearchOptions}; use output::SyntheticFile; pub use service::api; use wax::{Glob, WalkEntry}; @@ -16,16 +17,17 @@ mod fragments; mod index; #[macro_use] mod logging; -mod options; +pub mod options; mod output; -pub mod serve; -pub mod service; +pub mod runner; +mod serve; +mod service; mod utils; -pub struct SearchState { - pub options: SearchOptions, - pub fossicked_pages: Vec, - pub built_indexes: Vec, +struct SearchState { + options: SearchOptions, + fossicked_pages: Vec, + built_indexes: Vec, } impl SearchState { @@ -37,28 +39,31 @@ impl SearchState { } } - pub async fn walk_for_files(&mut self, dir: PathBuf, glob: String) -> Vec { + pub async fn walk_for_files(&mut self, dir: PathBuf, glob: String) -> Result> { let log = &self.options.logger; log.status("[Walking source directory]"); if let Ok(glob) = Glob::new(&glob) { - glob.walk(&dir) + Ok(glob + .walk(&dir) .filter_map(Result::ok) .map(WalkEntry::into_path) .map(|file_path| Fossicker::new_relative_to(file_path, dir.clone())) - 
.collect() + .collect()) } else { log.error(format!( "Error: Provided glob \"{}\" did not parse as a valid glob.", self.options.glob )); - // TODO: Bubble this error back to the Node API if applicable - std::process::exit(1); + bail!( + "Error: Provided glob \"{}\" did not parse as a valid glob.", + self.options.glob + ); } } - pub async fn fossick_many(&mut self, dir: PathBuf, glob: String) -> Result { - let files = self.walk_for_files(dir.clone(), glob).await; + pub async fn fossick_many(&mut self, dir: PathBuf, glob: String) -> Result { + let files = self.walk_for_files(dir.clone(), glob).await?; let log = &self.options.logger; log.info(format!( @@ -81,23 +86,23 @@ impl SearchState { Ok(self.fossicked_pages.len() - existing_page_count) } - pub async fn fossick_one(&mut self, file: Fossicker) -> Result { + pub async fn fossick_one(&mut self, file: Fossicker) -> Result { let result = file.fossick(&self.options).await; - if let Ok(result) = result.clone() { + if let Some(result) = result.as_ref().ok() { let existing = self .fossicked_pages .iter() .position(|page| page.url == result.url); if let Some(existing) = existing { - *self.fossicked_pages.get_mut(existing).unwrap() = result; + *self.fossicked_pages.get_mut(existing).unwrap() = result.clone(); } else { - self.fossicked_pages.push(result); + self.fossicked_pages.push(result.clone()); } } result } - pub async fn build_indexes(&mut self) { + pub async fn build_indexes(&mut self) -> Result<()> { let log = &self.options.logger; let used_custom_body = self.fossicked_pages.iter().any(|page| page.has_custom_body); @@ -211,7 +216,8 @@ impl SearchState { .into_iter() .map(|(language, pages)| async { build_indexes(pages, language, &self.options).await }) .collect(); - self.built_indexes = join_all(indexes).await; + let built_indexes = join_all(indexes).await; + self.built_indexes = built_indexes.into_iter().flat_map(|i| i.ok()).collect(); let stats = self.built_indexes.iter().fold((0, 0, 0, 0), |mut stats, index| { log.v_info(format!( @@ -267,8 +273,13 @@ impl SearchState { Most likely, the directory passed to Pagefind was empty \ or did not contain any html files.", ); - std::process::exit(1); + bail!( + "Error: Pagefind wasn't able to build an index. \n\ + Most likely, the directory passed to Pagefind was empty \ + or did not contain any html files." + ); } + Ok(()) } pub async fn write_files(&self, custom_outdir: Option) -> PathBuf { diff --git a/pagefind/src/logging.rs b/pagefind/src/logging.rs index ab027733..670b2d24 100644 --- a/pagefind/src/logging.rs +++ b/pagefind/src/logging.rs @@ -14,6 +14,7 @@ pub enum LogLevel { Verbose, } +#[allow(dead_code)] #[derive(Debug, Clone)] pub enum LogStyle { Info, @@ -55,6 +56,7 @@ lazy_static! 
{ static ref SUCCESS: Style = Style::new().green(); } +#[allow(dead_code)] impl Logger { pub fn new(log_level: LogLevel, use_terminal: bool, logfile: Option) -> Self { if let Some(filename) = &logfile { diff --git a/pagefind/src/main.rs b/pagefind/src/main.rs index 614dd65e..13af26a3 100644 --- a/pagefind/src/main.rs +++ b/pagefind/src/main.rs @@ -1,153 +1,11 @@ -use pagefind::service::run_service; -use pagefind::{PagefindInboundConfig, SearchOptions, SearchState}; -use std::path::PathBuf; -use std::time::Instant; -use twelf::reexports::clap::CommandFactory; -use twelf::Layer; - -const CONFIGS: &[&str] = &[ - "pagefind.json", - "pagefind.yml", - "pagefind.yaml", - "pagefind.toml", -]; +use pagefind::runner::run_indexer; #[tokio::main] async fn main() { - let start = Instant::now(); - - let matches = PagefindInboundConfig::command() - // .ignore_errors(true) - .get_matches(); - - let mut config_layers = vec![]; - - let configs: Vec<&str> = CONFIGS - .iter() - .filter(|c| std::path::Path::new(c).exists()) - .cloned() - .collect(); - if configs.len() > 1 { - eprintln!( - "Found multiple possible config files: [{}]", - configs.join(", ") - ); - eprintln!("Pagefind only supports loading one configuration file format, please ensure only one file exists."); - std::process::exit(1); - } - - for config in configs { - let layer_fn = if config.ends_with("json") { - Layer::Json - } else if config.ends_with("toml") { - Layer::Toml - } else if config.ends_with("yaml") || config.ends_with("yml") { - Layer::Yaml - } else { - eprintln!("Unknown config file format {}", config); - std::process::exit(1); - }; - config_layers.push(layer_fn(config.into())); - } - - config_layers.push(Layer::Env(Some("PAGEFIND_".to_string()))); - config_layers.push(Layer::Clap(matches)); - - match PagefindInboundConfig::with_layers(&config_layers) { - Ok(config) => { - if let Ok(options) = SearchOptions::load(config.clone()) { - if config.service { - run_service().await; - } else { - let mut runner = SearchState::new(options.clone()); - let logger = runner.options.logger.clone(); - - runner.log_start(); - // TODO: Error handling - _ = runner - .fossick_many(options.site_source.clone(), options.glob) - .await; - - let use_old_bundle = options.config_warnings.unconfigured_bundle_output - && runner - .fossicked_pages - .iter() - .filter(|p| p.has_old_bundle_reference) - .next() - .is_some(); - if use_old_bundle { - logger.warn( - "!! 
Found references to a /_pagefind/ resource, running in pre-1.0 compatibility mode.", - ); - } - - runner.build_indexes().await; - _ = &runner.write_files(None).await; - - if use_old_bundle { - let old_bundle_location = options.site_source.join("_pagefind"); - _ = &runner.write_files(Some(old_bundle_location)).await; - } - - let duration = start.elapsed(); - - logger.status(&format!( - "Finished in {}.{:03} seconds", - duration.as_secs(), - duration.subsec_millis() - )); - - let warnings = options.config_warnings.get_strings(); - if !warnings.is_empty() { - logger.warn(&format!("{} configuration warning(s):", warnings.len())); - - for warning in options.config_warnings.get_strings() { - logger.warn(warning); - } - } - - if use_old_bundle { - logger.warn(&format!( - "\n\nWarning: Running in pre-1.0 compatibility mode.\n\ - Pagefind 1.0 changes the default output directory from /_pagefind/ to /pagefind/\n\ - but references to the /_pagefind/ URL were found on your site, and the output directory is unconfigured.\n\ - To preserve your setup, the search files have been written twice, to both /_pagefind/ and /pagefind/\n\n\ - To remove this warning, either update your script and style references to the new `/pagefind/` URL\n\ - or run Pagefind with `--output-subdir _pagefind` to ensure pre-1.0 behaviour" - )); - } - - if config.serve { - pagefind::serve::serve_dir(PathBuf::from(options.site_source)).await; - } - } - } - } - Err(e) => { - eprintln!("Error loading Pagefind config:"); - match e { - twelf::Error::Io(e) => { - eprintln!("{}", e); - } - twelf::Error::Envy(e) => { - eprintln!("{}", e); - } - twelf::Error::Json(e) => { - eprintln!("{}", e); - } - twelf::Error::Toml(e) => { - eprintln!("{}", e); - } - twelf::Error::Yaml(e) => { - eprintln!("{}", e); - } - twelf::Error::Deserialize(e) => { - eprintln!("{}", e); - } - _ => { - eprintln!("Unknown Error"); - } - } + match run_indexer().await { + Ok(_) => { /* success */ } + Err(msg) => { + eprintln!("{msg}"); std::process::exit(1); } } diff --git a/pagefind/src/options.rs b/pagefind/src/options.rs index b6826324..e1f889e0 100644 --- a/pagefind/src/options.rs +++ b/pagefind/src/options.rs @@ -1,9 +1,12 @@ +//! Configuration that can be supplied to the `api` module when using Pagefind as a service. 
+ use anyhow::{bail, Result}; use clap::Parser; use rust_patch::Patch; use serde::{Deserialize, Serialize}; use std::{env, path::PathBuf}; use twelf::config; +use typed_builder::TypedBuilder; use crate::logging::{LogLevel, Logger}; @@ -19,16 +22,16 @@ use crate::logging::{LogLevel, Logger}; #[config] #[derive(Parser, Debug, Clone)] #[clap(author, version, about, long_about = None)] -pub struct PagefindInboundConfig { +pub(crate) struct PagefindInboundConfig { #[clap(long, help = "DEPRECATED: Use the `site` option instead")] #[clap(required = false, hide = true)] #[serde(default)] // This is actually required, but we validate that later - pub source: String, + pub(crate) source: String, #[clap(long, short, help = "The location of your built static website")] #[clap(required = false)] #[serde(default)] // This is actually required, but we validate that later - pub site: String, + pub(crate) site: String, #[clap( long, @@ -36,21 +39,21 @@ pub struct PagefindInboundConfig { help = "DEPRECATED: Use `output_subdir` or `output_path` instead" )] #[clap(required = false, hide = true)] - pub bundle_dir: Option, + pub(crate) bundle_dir: Option, #[clap( long, help = "Where to output the search bundle, relative to the processed site" )] #[clap(required = false)] - pub output_subdir: Option, + pub(crate) output_subdir: Option, #[clap( long, help = "Where to output the search bundle, relative to the working directory of the command" )] #[clap(required = false)] - pub output_path: Option, + pub(crate) output_path: Option, #[clap( long, @@ -58,7 +61,7 @@ pub struct PagefindInboundConfig { )] #[clap(required = false)] #[serde(default = "defaults::default_root_selector")] - pub root_selector: String, + pub(crate) root_selector: String, #[clap( long, @@ -66,7 +69,7 @@ pub struct PagefindInboundConfig { )] #[clap(required = false)] #[serde(default)] - pub exclude_selectors: Vec, + pub(crate) exclude_selectors: Vec, #[clap( long, @@ -74,14 +77,14 @@ pub struct PagefindInboundConfig { )] #[clap(required = false)] #[serde(default = "defaults::default_glob")] - pub glob: String, + pub(crate) glob: String, #[clap( long, help = "Ignore any detected languages and index the whole site as a single language. Expects an ISO 639-1 code." 
)] #[clap(required = false)] - pub force_language: Option, + pub(crate) force_language: Option, #[clap( long, @@ -89,7 +92,7 @@ pub struct PagefindInboundConfig { )] #[clap(required = false)] #[serde(default = "defaults::default_false")] - pub serve: bool, + pub(crate) serve: bool, #[clap( long, @@ -98,7 +101,7 @@ pub struct PagefindInboundConfig { )] #[clap(required = false)] #[serde(default = "defaults::default_false")] - pub verbose: bool, + pub(crate) verbose: bool, #[clap( long, @@ -107,7 +110,7 @@ pub struct PagefindInboundConfig { )] #[clap(required = false)] #[serde(default)] - pub logfile: Option, + pub(crate) logfile: Option, #[clap( long, @@ -116,28 +119,46 @@ pub struct PagefindInboundConfig { )] #[clap(required = false)] #[serde(default = "defaults::default_false")] - pub keep_index_url: bool, + pub(crate) keep_index_url: bool, #[clap(long)] #[clap(required = false, hide = true)] #[serde(default = "defaults::default_false")] - pub service: bool, + pub(crate) service: bool, } -#[derive(Debug, Deserialize, Serialize, Patch)] +#[derive(Debug, Deserialize, Serialize, Patch, TypedBuilder)] #[patch = "PagefindInboundConfig"] +#[builder( + doc, + field_defaults(default, setter(strip_option)), + builder_method( + vis = "pub", + doc = "Create a builder for building `PagefindServiceConfig` for the api." + ), + builder_type(vis = "pub"), + build_method(vis = "pub") +)] /// Fields that can be set via the Pagefind service. -/// In other words, the subset of the above fields that make sense to set globally, -/// excluding those that are set when each individual method is called. +/// In other words, the subset of the Pagefind configuration that makes sense to set globally, +/// excluding fields that are irrelevant or set when each individual method is called. +/// +/// Must be constructed through the `PagefindServiceConfigBuilder` interface. pub struct PagefindServiceConfig { - pub root_selector: Option, - pub exclude_selectors: Option>, + /// The element Pagefind should treat as the root of the document. + pub(crate) root_selector: Option, + /// Custom selectors that Pagefind should ignore when indexing. + pub(crate) exclude_selectors: Option>, #[patch(as_option)] - pub force_language: Option, - pub verbose: Option, + /// Ignore any detected languages and index the whole site as a single language. Expects an ISO 639-1 code. + pub(crate) force_language: Option, + /// Print verbose logging while indexing the site. Does not impact the web-facing search. + pub(crate) verbose: Option, #[patch(as_option)] - pub logfile: Option, - pub keep_index_url: Option, + /// Path to a logfile to write to. Will replace the file on each run + pub(crate) logfile: Option, + /// Keep \"index.html\" at the end of search result paths. Defaults to false, stripping \"index.html\". 
+ pub(crate) keep_index_url: Option, } mod defaults { @@ -157,30 +178,30 @@ mod defaults { // The configuration object used internally #[derive(Debug, Clone)] -pub struct SearchOptions { - pub working_directory: PathBuf, - pub site_source: PathBuf, - pub bundle_output: PathBuf, - pub root_selector: String, - pub exclude_selectors: Vec, - pub glob: String, - pub force_language: Option, - pub version: &'static str, - pub logger: Logger, - pub keep_index_url: bool, - pub running_as_service: bool, - pub config_warnings: ConfigWarnings, +pub(crate) struct SearchOptions { + pub(crate) working_directory: PathBuf, + pub(crate) site_source: PathBuf, + pub(crate) bundle_output: PathBuf, + pub(crate) root_selector: String, + pub(crate) exclude_selectors: Vec, + pub(crate) glob: String, + pub(crate) force_language: Option, + pub(crate) version: &'static str, + pub(crate) logger: Logger, + pub(crate) keep_index_url: bool, + pub(crate) running_as_service: bool, + pub(crate) config_warnings: ConfigWarnings, } #[derive(Debug, Clone)] -pub struct ConfigWarnings { - pub unconfigured_bundle_output: bool, - pub using_deprecated_source: bool, - pub using_deprecated_bundle_dir: bool, +pub(crate) struct ConfigWarnings { + pub(crate) unconfigured_bundle_output: bool, + pub(crate) using_deprecated_source: bool, + pub(crate) using_deprecated_bundle_dir: bool, } impl SearchOptions { - pub fn load(config: PagefindInboundConfig) -> Result { + pub(crate) fn load(config: PagefindInboundConfig) -> Result { if !config.service && config.site.is_empty() && config.source.is_empty() { eprintln!("Required argument site not supplied. Pagefind needs to know the root of your built static site."); eprintln!("Provide a --site flag, a PAGEFIND_SITE environment variable, or a site key in a Pagefind configuration file."); @@ -249,7 +270,7 @@ impl SearchOptions { } impl ConfigWarnings { - pub fn get_strings(&self) -> Vec { + pub(crate) fn get_strings(&self) -> Vec { let mut strings = vec![]; if self.using_deprecated_bundle_dir { strings.push( diff --git a/pagefind/src/runner.rs b/pagefind/src/runner.rs new file mode 100644 index 00000000..429ec954 --- /dev/null +++ b/pagefind/src/runner.rs @@ -0,0 +1,164 @@ +//! The full Pagefind indexer as run by the CLI. + +use crate::options::SearchOptions; +use crate::serve; + +use super::service::run_service; +use super::{PagefindInboundConfig, SearchState}; +use anyhow::{bail, Result}; +use std::path::PathBuf; +use std::time::Instant; +use twelf::reexports::clap::CommandFactory; +use twelf::Layer; + +const CONFIGS: &[&str] = &[ + "pagefind.json", + "pagefind.yml", + "pagefind.yaml", + "pagefind.toml", +]; + +/// Runs the full Pagefind indexing process used by the Pagefind binary. +/// +/// Will log to stdout/stderr. 
+pub async fn run_indexer() -> Result<()> { + let start = Instant::now(); + + let matches = PagefindInboundConfig::command() + // .ignore_errors(true) + .get_matches(); + + let mut config_layers = vec![]; + + let configs: Vec<&str> = CONFIGS + .iter() + .filter(|c| std::path::Path::new(c).exists()) + .cloned() + .collect(); + if configs.len() > 1 { + let found = configs.join(", "); + bail!("\ + Found multiple possible config files: [{found}]\n\ + Pagefind only supports loading one configuration file format, please ensure only one file exists.\ + "); + } + + for config in configs { + let layer_fn = if config.ends_with("json") { + Layer::Json + } else if config.ends_with("toml") { + Layer::Toml + } else if config.ends_with("yaml") || config.ends_with("yml") { + Layer::Yaml + } else { + bail!("Unknown config file format {config}"); + }; + config_layers.push(layer_fn(config.into())); + } + + config_layers.push(Layer::Env(Some("PAGEFIND_".to_string()))); + config_layers.push(Layer::Clap(matches)); + + match PagefindInboundConfig::with_layers(&config_layers) { + Ok(config) => { + let options = match SearchOptions::load(config.clone()) { + Ok(o) => o, + Err(e) => return Err(e), + }; + + if config.service { + run_service().await; + Ok(()) + } else { + let mut runner = SearchState::new(options.clone()); + let logger = runner.options.logger.clone(); + + runner.log_start(); + // TODO: Error handling + _ = runner + .fossick_many(options.site_source.clone(), options.glob) + .await; + + let use_old_bundle = options.config_warnings.unconfigured_bundle_output + && runner + .fossicked_pages + .iter() + .filter(|p| p.has_old_bundle_reference) + .next() + .is_some(); + if use_old_bundle { + logger.warn( + "!! Found references to a /_pagefind/ resource, running in pre-1.0 compatibility mode.", + ); + } + + runner.build_indexes().await?; + _ = &runner.write_files(None).await; + + if use_old_bundle { + let old_bundle_location = options.site_source.join("_pagefind"); + _ = &runner.write_files(Some(old_bundle_location)).await; + } + + let duration = start.elapsed(); + + logger.status(&format!( + "Finished in {}.{:03} seconds", + duration.as_secs(), + duration.subsec_millis() + )); + + let warnings = options.config_warnings.get_strings(); + if !warnings.is_empty() { + logger.warn(&format!("{} configuration warning(s):", warnings.len())); + + for warning in options.config_warnings.get_strings() { + logger.warn(warning); + } + } + + if use_old_bundle { + logger.warn(&format!( + "\n\nWarning: Running in pre-1.0 compatibility mode.\n\ + Pagefind 1.0 changes the default output directory from /_pagefind/ to /pagefind/\n\ + but references to the /_pagefind/ URL were found on your site, and the output directory is unconfigured.\n\ + To preserve your setup, the search files have been written twice, to both /_pagefind/ and /pagefind/\n\n\ + To remove this warning, either update your script and style references to the new `/pagefind/` URL\n\ + or run Pagefind with `--output-subdir _pagefind` to ensure pre-1.0 behaviour" + )); + } + + if config.serve { + serve::serve_dir(PathBuf::from(options.site_source)).await; + } + Ok(()) + } + } + Err(e) => { + let inner_err = match e { + twelf::Error::Io(e) => { + format!("{}", e) + } + twelf::Error::Envy(e) => { + format!("{}", e) + } + twelf::Error::Json(e) => { + format!("{}", e) + } + twelf::Error::Toml(e) => { + format!("{}", e) + } + twelf::Error::Yaml(e) => { + format!("{}", e) + } + twelf::Error::Deserialize(e) => { + format!("{}", e) + } + _ => { + format!("Unknown Error") + } 
+ }; + bail!("Error loading Pagefind config:\n{inner_err}") + } + } +} diff --git a/pagefind/src/service/api.rs b/pagefind/src/service/api.rs index fb44bd5a..5e9605c8 100644 --- a/pagefind/src/service/api.rs +++ b/pagefind/src/service/api.rs @@ -1,82 +1,127 @@ +//! The programmatic service interface for integrating Pagefind into another Rust project. +//! +//! # Examples +//! +//! ``` +//! use pagefind::api::{PagefindIndex}; +//! use pagefind::options::{PagefindServiceConfig}; +//! +//! #[tokio::main] +//! async fn main() { +//! let options = PagefindServiceConfig::builder() +//! .keep_index_url(true) +//! .force_language("en".to_string()) +//! .build(); +//! let mut index = PagefindIndex::new(Some(options)).expect("Options should be valid"); +//! let indexing_response = index +//! .add_html_file( +//! Some("test/index.html".into()), +//! None, +//! "

Test content

".into(), +//! ) +//! .await; +//! +//! if let Ok(file) = indexing_response { +//! println!("Page word count: {}", file.page_word_count); +//! println!("Page URL: {}", file.page_url); +//! } +//! +//! let files_response = index.get_files().await; +//! if let Ok(files) = files_response { +//! println!("Have {} files to write to disk", files.len()); +//! } +//! } +//! ``` + +pub use crate::output::SyntheticFile; +use anyhow::{bail, Result}; use hashbrown::HashMap; +use rust_patch::Patch; use std::path::PathBuf; use crate::{ fossick::{parser::DomParserResult, Fossicker}, + options::PagefindServiceConfig, PagefindInboundConfig, SearchOptions, SearchState, }; -use base64::{engine::general_purpose, Engine as _}; -use super::{IndexedFileResponse, SyntheticFileResponse}; +#[derive(Debug)] +pub struct IndexedFileResponse { + pub page_word_count: u32, + pub page_url: String, + pub page_meta: HashMap, +} pub struct PagefindIndex { search_index: SearchState, } impl PagefindIndex { - /// Create a new PagefindIndex instance. + /// Create a new PagefindIndex instance that files can be added to. /// /// # Arguments - /// * `config` - An optional PagefindServiceConfig to apply to the service. + /// * `config` - An optional PagefindServiceConfig to override default options for the service. /// /// # Returns - /// An optional PagefindIndex instance. If the search options are invalid, it - /// will return None. - pub fn new(config: PagefindInboundConfig) -> Option { - match SearchOptions::load(config) { - Ok(opts) => Some(Self { - search_index: SearchState::new(opts), - }), - Err(_) => None, + /// A PagefindIndex instance if search options are valid, otherwise an Error. + pub fn new(config: Option) -> Result { + let mut service_options: PagefindInboundConfig = + serde_json::from_str("{}").expect("All fields have serde defaults"); + + service_options.service = true; + if let Some(config) = config { + service_options = config.apply(service_options); } + + let options = SearchOptions::load(service_options)?; + Ok(Self { + search_index: SearchState::new(options), + }) } - /// Add a file into this search index. + /// Add an HTML file that isn't on disk into this search index. /// Either a filepath or a URL must be provided. /// /// # Arguments - /// * `file_path` - The path to the file to add. - /// * `url` - The URL to the file to add. - /// * `file_contents` - The contents of the file to add. + /// * `source_path` - The source path of the HTML file if it were to exist on disk. + /// * `url` - An explicit URL to use, instead of having Pagefind compute the URL based on the `source_path`. + /// * `content` - The source HTML content of the file to be parsed. /// /// # Returns - /// Either the PageFragmentData of the file added or an error message, if it fails to add the - /// file. - pub async fn add_file( + /// Metadata about the added file if successful, otherwise an Error. 
+ pub async fn add_html_file( &mut self, - file_path: Option, + source_path: Option, url: Option, - file_contents: String, - ) -> Result { - if file_path.is_none() && url.is_none() { - return Err("Either file_path or url must be provided".into()); + content: String, + ) -> Result { + if source_path.is_none() && url.is_none() { + bail!("Either source_path or url must be provided"); } - let file = Fossicker::new_synthetic(file_path.map(PathBuf::from), url, file_contents); - let data = self.search_index.fossick_one(file).await; + let file = Fossicker::new_synthetic(source_path.map(PathBuf::from), url, content); + let data = self.search_index.fossick_one(file).await?; - match data { - Ok(data) => Ok(IndexedFileResponse { - page_word_count: data.fragment.data.word_count as u32, - page_url: data.fragment.data.url, - page_meta: data.fragment.data.meta - }), - Err(_) => Err("Failed to add file".to_string()), - } + Ok(IndexedFileResponse { + page_word_count: data.fragment.data.word_count as u32, + page_url: data.fragment.data.url, + page_meta: data.fragment.data.meta, + }) } - /// Add a record to the search index. - /// This is a more manual way to add a record to the search index, allowing for more control - /// over the data. This is useful for adding records that are not files. + /// Index a custom record that isn't backed by an HTML file. /// /// # Arguments - /// * `url` - The URL of the record. - /// * `content` - The content of the record. - /// * `language` - The language of the record. - /// * `meta` - Optional metadata to add to the record. - /// * `filters` - Optional filters to apply to the record. - /// * `sort` - Optional sorting to apply to the record. - pub async fn add_record( + /// * `url` - The output URL of this record. Pagefind will not alter this. + /// * `content` - The raw content of this record. + /// * `language` - What language is this record written in. Expects an ISO 639-1 code. + /// * `meta` - The metadata to attach to this record. Supplying a `title` is highly recommended. + /// * `filters` - The filters to attach to this record. + /// * `sort` - The sort keys to attach to this record. + /// + /// # Returns + /// Metadata about the added record if successful, otherwise an Error. + pub async fn add_custom_record( &mut self, url: String, content: String, @@ -84,7 +129,7 @@ impl PagefindIndex { meta: Option>, filters: Option>>, sort: Option>, - ) -> Result { + ) -> Result { let data = DomParserResult { digest: content, filters: filters.unwrap_or_default(), @@ -103,75 +148,110 @@ impl PagefindIndex { .unwrap_or(language), }; let file = Fossicker::new_with_data(url, data); - let data = self.search_index.fossick_one(file).await; - - match data { - Ok(data) => Ok(IndexedFileResponse { - page_word_count: data.fragment.data.word_count as u32, - page_url: data.fragment.data.url, - page_meta: data.fragment.data.meta - }), - Err(_) => Err("Failed to add file".to_string()), - } + let data = self.search_index.fossick_one(file).await?; + + Ok(IndexedFileResponse { + page_word_count: data.fragment.data.word_count as u32, + page_url: data.fragment.data.url, + page_meta: data.fragment.data.meta, + }) } - /// Add a directory to the search index with a glob pattern. + /// Index a directory of HTML files from disk. /// /// # Arguments /// * `path` - The path to the directory to index. /// * `glob` - A glob pattern to match files in the directory. If not provided, the default glob pattern will be used. 
/// /// # Returns - /// Either the number of pages indexed or an error message, if it fails to index the directory. - pub async fn add_dir(&mut self, path: String, glob: Option) -> Result { + /// The number of pages indexed if successful, otherwise an Error. + pub async fn add_directory(&mut self, path: String, glob: Option) -> Result { let defaults: PagefindInboundConfig = serde_json::from_str("{}").expect("All fields have serde defaults"); let glob = glob.unwrap_or(defaults.glob); - let data = self + let page_count = self .search_index .fossick_many(PathBuf::from(path), glob) - .await; - match data { - Ok(page_count) => Ok(page_count), - Err(_) => Err("Failed to index directory".to_string()), - } + .await?; + + Ok(page_count) } /// Build the search index for this instance and hold it in memory. - pub async fn build_indexes(&mut self) { - self.search_index.build_indexes().await; + pub async fn build_indexes(&mut self) -> Result<()> { + self.search_index.build_indexes().await } /// Build the search index for this instance and write the files to disk. /// /// # Arguments /// * `output_path` - The path to write the files to. If not provided, the default output path will be used. - pub async fn write_files(&mut self, output_path: Option) -> String { - self.search_index.build_indexes().await; + /// + /// # Returns + /// The path files were written to if successful, otherwise an Error. + pub async fn write_files(&mut self, output_path: Option) -> Result { + self.search_index.build_indexes().await?; let resolved_output_path = self .search_index .write_files(output_path.map(Into::into)) .await; - resolved_output_path.to_string_lossy().into() + Ok(resolved_output_path.to_string_lossy().into()) } /// Build the search index for this instance and return the files as a list of /// SyntheticFileResponse. /// /// # Returns - /// A list of SyntheticFileResponse containing the path and content of each file. - pub async fn get_files(&mut self) -> Vec { - self.search_index.build_indexes().await; - self.search_index - .get_files() - .await - .into_iter() - .map(|file| SyntheticFileResponse { - path: file.filename.to_string_lossy().into(), - content: general_purpose::STANDARD.encode(file.contents), - }) - .collect() + /// A list of SyntheticFiles containing the path and content of each file. + pub async fn get_files(&mut self) -> Result> { + self.search_index.build_indexes().await?; + Ok(self.search_index.get_files().await) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tokio; + + #[tokio::test] + async fn test_add_file() { + let options = PagefindServiceConfig::builder() + .keep_index_url(true) + .force_language("en".to_string()) + .build(); + let mut index = PagefindIndex::new(Some(options)).unwrap(); + let file_response = index + .add_html_file( + Some("test/index.html".into()), + None, + "

Test content

".into(), + ) + .await; + + let file = file_response.expect("Adding a file should succeed"); + assert_eq!(file.page_word_count, 2); + assert_eq!(file.page_url, "/test/index.html"); + + let files_response = index.get_files().await; + + let files = files_response.expect("Getting files should succeed"); + let filenames: Vec<_> = files.into_iter().map(|f| f.filename).collect(); + assert!(filenames.contains(&PathBuf::from("pagefind.js"))); + assert!(filenames.contains(&PathBuf::from("pagefind-ui.js"))); + assert!(filenames.contains(&PathBuf::from("pagefind-ui.css"))); + assert!(filenames.contains(&PathBuf::from("wasm.en.pagefind"))); + assert!(filenames.contains(&PathBuf::from("pagefind-entry.json"))); + assert!(filenames + .iter() + .any(|f| f.to_string_lossy().ends_with(".pf_meta"))); + assert!(filenames + .iter() + .any(|f| f.to_string_lossy().ends_with(".pf_fragment"))); + assert!(filenames + .iter() + .any(|f| f.to_string_lossy().ends_with(".pf_index"))); } } diff --git a/pagefind/src/service/mod.rs b/pagefind/src/service/mod.rs index 06eb830e..41d772ad 100644 --- a/pagefind/src/service/mod.rs +++ b/pagefind/src/service/mod.rs @@ -2,7 +2,6 @@ use std::io::{BufRead, Write}; pub use api::PagefindIndex; use base64::{engine::general_purpose, Engine as _}; -use rust_patch::Patch; use tokio::sync::mpsc; pub mod api; @@ -10,8 +9,6 @@ pub mod api; use requests::*; use responses::*; -use crate::PagefindInboundConfig; - mod requests; mod responses; @@ -133,22 +130,15 @@ pub async fn run_service() { match msg.payload { RequestAction::NewIndex { config } => { let index_id = indexes.len(); - let mut service_options: PagefindInboundConfig = - serde_json::from_str("{}").expect("All fields have serde defaults"); - - service_options.service = true; - if let Some(config) = config { - service_options = config.apply(service_options); - } - match PagefindIndex::new(service_options) { - Some(index) => { + match PagefindIndex::new(config) { + Ok(index) => { indexes.insert(index_id, Some(index)); send(ResponseAction::NewIndex { index_id: index_id as u32, }); } - None => { + Err(_) => { err("Invalid config supplied"); } } @@ -160,14 +150,14 @@ pub async fn run_service() { file_contents, } => { if let Some(index) = get_index(&mut indexes, index_id, err) { - let page_fragment = index.add_file(file_path, url, file_contents).await; + let page_fragment = index.add_html_file(file_path, url, file_contents).await; match page_fragment { Ok(data) => send(ResponseAction::IndexedFile { page_word_count: data.page_word_count, page_url: data.page_url.clone(), page_meta: data.page_meta.clone(), }), - Err(message) => err(&message), + Err(message) => err(&message.to_string()), } } } @@ -182,7 +172,7 @@ pub async fn run_service() { } => { if let Some(index) = get_index(&mut indexes, index_id, err) { let data = index - .add_record(url, content, language, meta, filters, sort) + .add_custom_record(url, content, language, meta, filters, sort) .await; match data { Ok(data) => send(ResponseAction::IndexedFile { @@ -190,7 +180,7 @@ pub async fn run_service() { page_url: data.page_url.clone(), page_meta: data.page_meta.clone(), }), - Err(_) => err("Failed to add file"), + Err(message) => err(&message.to_string()), } } } @@ -200,18 +190,20 @@ pub async fn run_service() { glob, } => { if let Some(index) = get_index(&mut indexes, index_id, err) { - match index.add_dir(path, glob).await { + match index.add_directory(path, glob).await { Ok(page_count) => send(ResponseAction::IndexedDir { page_count: page_count as u32, }), - Err(_) => err("Failed 
to index directory"), + Err(message) => err(&message.to_string()), } } } RequestAction::BuildIndex { index_id } => { if let Some(index) = get_index(&mut indexes, index_id, err) { - index.build_indexes().await; - send(ResponseAction::BuildIndex {}); + match index.build_indexes().await { + Ok(_) => send(ResponseAction::BuildIndex {}), + Err(e) => err(&e.to_string()), + } } } RequestAction::WriteFiles { @@ -219,18 +211,38 @@ pub async fn run_service() { output_path, } => { if let Some(index) = get_index(&mut indexes, index_id, err) { - index.build_indexes().await; - let resolved_output_path = index.write_files(output_path.map(Into::into)).await; - send(ResponseAction::WriteFiles { - output_path: resolved_output_path, - }); + match index.build_indexes().await { + Ok(_) => match index.write_files(output_path.map(Into::into)).await { + Ok(resolved_output_path) => send(ResponseAction::WriteFiles { + output_path: resolved_output_path, + }), + Err(e) => err(&e.to_string()), + }, + Err(e) => err(&e.to_string()), + } } } RequestAction::GetFiles { index_id } => { if let Some(index) = get_index(&mut indexes, index_id, err) { - index.build_indexes().await; - let files = index.get_files().await; - send(ResponseAction::GetFiles { files }); + match index.build_indexes().await { + Ok(_) => match index.get_files().await { + Ok(files) => { + let response_files = files + .into_iter() + .map(|file| SyntheticFileResponse { + path: file.filename.to_string_lossy().into(), + content: general_purpose::STANDARD.encode(file.contents), + }) + .collect(); + + send(ResponseAction::GetFiles { + files: response_files, + }) + } + Err(e) => err(&e.to_string()), + }, + Err(e) => err(&e.to_string()), + } } } RequestAction::DeleteIndex { index_id } => match indexes.get_mut(index_id as usize) { diff --git a/pagefind/src/service/responses.rs b/pagefind/src/service/responses.rs index 4ba2cf4f..843d3676 100644 --- a/pagefind/src/service/responses.rs +++ b/pagefind/src/service/responses.rs @@ -35,13 +35,6 @@ pub(super) enum ResponseAction { DeleteIndex {}, } -#[derive(Debug, Deserialize, Serialize)] -pub struct IndexedFileResponse { - pub page_word_count: u32, - pub page_url: String, - pub page_meta: HashMap, -} - #[derive(Debug, Deserialize, Serialize)] pub struct SyntheticFileResponse { pub path: String, From 477a142681876938051c5490f810f6c52cdb2f5e Mon Sep 17 00:00:00 2001 From: Liam Bigelow <40188355+bglw@users.noreply.github.com> Date: Mon, 16 Dec 2024 23:01:19 +1300 Subject: [PATCH 3/3] Fix ranking bug affecting compound words --- .../weighting/compound-words-sum-to-a-full-weight.toolproof.yml | 2 +- pagefind_web/src/search.rs | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pagefind/integration_tests/weighting/compound-words-sum-to-a-full-weight.toolproof.yml b/pagefind/integration_tests/weighting/compound-words-sum-to-a-full-weight.toolproof.yml index b74d0864..9454ff7b 100644 --- a/pagefind/integration_tests/weighting/compound-words-sum-to-a-full-weight.toolproof.yml +++ b/pagefind/integration_tests/weighting/compound-words-sum-to-a-full-weight.toolproof.yml @@ -27,4 +27,4 @@ steps: - step: In my browser, I evaluate {js} js: |- let val = await toolproof.querySelector("p"); - toolproof.assert_eq(val.innerHTML, `weight:1/bal:512.14/loc:4`); + toolproof.assert_eq(val.innerHTML, `weight:1/bal:1024.29/loc:4`); diff --git a/pagefind_web/src/search.rs b/pagefind_web/src/search.rs index 50132445..d7005f7f 100644 --- a/pagefind_web/src/search.rs +++ b/pagefind_web/src/search.rs @@ -336,10 +336,12 @@ impl 
SearchIndex { // If the new word is weighted _lower_ than the working word, // we want to use the lower value. (Lowest weight wins) working_word.weight = next_word.weight; + working_word.length_bonus = next_word.length_bonus; } else if next_word.weight == working_word.weight { // If the new word is weighted the same, // we want to combine them to boost matching both halves of a compound word working_word.weight += next_word.weight; + working_word.length_bonus += next_word.length_bonus; } // We don't want to do anything if the new word is weighted higher // (Lowest weight wins)