Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create PagefindIndex struct for rust lib to be more sdk like #751

Merged
merged 3 commits on Dec 16, 2024 (source and target branch names were not captured in this page excerpt)
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 27 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion pagefind/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ pagefind_stem = { version = "0.2.0", features = [
"yiddish",
] }
convert_case = "0.6.0"
charabia = { version = "0.8.8", optional = true, default-features = false, features = ["chinese", "japanese"] }
charabia = { version = "0.8.8", optional = true, default-features = false, features = [
"chinese",
"japanese",
] }
unicode-segmentation = "1.10.1"
emojis = "0.6.1"
hashbrown = { version = "0.13.1", features = ["serde"] }
Expand Down Expand Up @@ -81,6 +84,7 @@ actix-files = "0.6"
lexical-core = "0.8.5"
path-slash = "0.2"
rust-patch = "0.1.3"
typed-builder = "0.20.0"

[features]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,4 @@ steps:
- step: In my browser, I evaluate {js}
js: |-
let val = await toolproof.querySelector("p");
toolproof.assert_eq(val.innerHTML, `weight:1/bal:512.14/loc:4`);
toolproof.assert_eq(val.innerHTML, `weight:1/bal:1024.29/loc:4`);
15 changes: 3 additions & 12 deletions pagefind/src/fossick/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use anyhow::{bail, Result};
use async_compression::tokio::bufread::GzipDecoder;
#[cfg(feature = "extended")]
use charabia::Segment;
Expand Down Expand Up @@ -64,16 +65,6 @@ pub struct Fossicker {
}

impl Fossicker {
pub fn new(file_path: PathBuf) -> Self {
Self {
file_path: Some(file_path),
root_path: None,
page_url: None,
synthetic_content: None,
data: None,
}
}

pub fn new_relative_to(file_path: PathBuf, root_path: PathBuf) -> Self {
Self {
file_path: Some(file_path),
Expand Down Expand Up @@ -459,7 +450,7 @@ impl Fossicker {
}
}

pub async fn fossick(mut self, options: &SearchOptions) -> Result<FossickedData, ()> {
pub async fn fossick(mut self, options: &SearchOptions) -> Result<FossickedData> {
if (self.file_path.is_some() || self.synthetic_content.is_some()) && self.data.is_none() {
self.fossick_html(options).await;
};
Expand All @@ -480,7 +471,7 @@ impl Fossicker {
options
.logger
.error("Tried to index file with no specified URL or file path, ignoring.");
return Err(());
bail!("Tried to index file with no specified URL or file path, ignoring.");
};

Ok(FossickedData {
Expand Down
12 changes: 8 additions & 4 deletions pagefind/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use crate::{
utils::full_hash,
SearchOptions,
};
use anyhow::{bail, Result};
use index_filter::{FilterIndex, PackedValue};
use index_metadata::{MetaChunk, MetaIndex, MetaPage};
use index_words::{PackedPage, PackedWord, WordIndex};
Expand Down Expand Up @@ -44,7 +45,7 @@ pub async fn build_indexes(
mut pages: Vec<FossickedData>,
language: String,
options: &SearchOptions,
) -> PagefindIndexes {
) -> Result<PagefindIndexes> {
let mut meta = MetaIndex {
version: options.version.into(),
pages: Vec::new(),
Expand Down Expand Up @@ -265,7 +266,10 @@ pub async fn build_indexes(
language,
u32::MAX
));
std::process::exit(1);
bail!(
"Language {language} has too many documents to index, must be < {}",
u32::MAX
);
}

// TODO: Parameterize these chunk sizes via options
Expand Down Expand Up @@ -306,7 +310,7 @@ pub async fn build_indexes(
&full_hash(&meta_index)[0..=(language.len() + 7)]
);

PagefindIndexes {
Ok(PagefindIndexes {
word_indexes,
filter_indexes,
sorts,
Expand All @@ -317,7 +321,7 @@ pub async fn build_indexes(
.collect(),
language,
word_count,
}
})
}

fn chunk_index(word_map: HashMap<String, PackedWord>, chunk_size: usize) -> Vec<Vec<PackedWord>> {
Expand Down
58 changes: 35 additions & 23 deletions pagefind/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
use std::{cmp::Ordering, path::PathBuf};

pub use fossick::{FossickedData, Fossicker};
use anyhow::{bail, Result};
use fossick::{FossickedData, Fossicker};
use futures::future::join_all;
use hashbrown::HashMap;
use index::PagefindIndexes;
pub use options::{PagefindInboundConfig, SearchOptions};
use options::{PagefindInboundConfig, SearchOptions};
use output::SyntheticFile;
pub use service::api;
use wax::{Glob, WalkEntry};

use crate::index::build_indexes;
Expand All @@ -15,16 +17,17 @@ mod fragments;
mod index;
#[macro_use]
mod logging;
mod options;
pub mod options;
mod output;
pub mod serve;
pub mod service;
pub mod runner;
mod serve;
mod service;
mod utils;

pub struct SearchState {
pub options: SearchOptions,
pub fossicked_pages: Vec<FossickedData>,
pub built_indexes: Vec<PagefindIndexes>,
struct SearchState {
options: SearchOptions,
fossicked_pages: Vec<FossickedData>,
built_indexes: Vec<PagefindIndexes>,
}

impl SearchState {
Expand All @@ -36,28 +39,31 @@ impl SearchState {
}
}

pub async fn walk_for_files(&mut self, dir: PathBuf, glob: String) -> Vec<Fossicker> {
pub async fn walk_for_files(&mut self, dir: PathBuf, glob: String) -> Result<Vec<Fossicker>> {
let log = &self.options.logger;

log.status("[Walking source directory]");
if let Ok(glob) = Glob::new(&glob) {
glob.walk(&dir)
Ok(glob
.walk(&dir)
.filter_map(Result::ok)
.map(WalkEntry::into_path)
.map(|file_path| Fossicker::new_relative_to(file_path, dir.clone()))
.collect()
.collect())
} else {
log.error(format!(
"Error: Provided glob \"{}\" did not parse as a valid glob.",
self.options.glob
));
// TODO: Bubble this error back to the Node API if applicable
std::process::exit(1);
bail!(
"Error: Provided glob \"{}\" did not parse as a valid glob.",
self.options.glob
);
}
}

pub async fn fossick_many(&mut self, dir: PathBuf, glob: String) -> Result<usize, ()> {
let files = self.walk_for_files(dir.clone(), glob).await;
pub async fn fossick_many(&mut self, dir: PathBuf, glob: String) -> Result<usize> {
let files = self.walk_for_files(dir.clone(), glob).await?;
let log = &self.options.logger;

log.info(format!(
Expand All @@ -80,23 +86,23 @@ impl SearchState {
Ok(self.fossicked_pages.len() - existing_page_count)
}

pub async fn fossick_one(&mut self, file: Fossicker) -> Result<FossickedData, ()> {
pub async fn fossick_one(&mut self, file: Fossicker) -> Result<FossickedData> {
let result = file.fossick(&self.options).await;
if let Ok(result) = result.clone() {
if let Some(result) = result.as_ref().ok() {
let existing = self
.fossicked_pages
.iter()
.position(|page| page.url == result.url);
if let Some(existing) = existing {
*self.fossicked_pages.get_mut(existing).unwrap() = result;
*self.fossicked_pages.get_mut(existing).unwrap() = result.clone();
} else {
self.fossicked_pages.push(result);
self.fossicked_pages.push(result.clone());
}
}
result
}

pub async fn build_indexes(&mut self) {
pub async fn build_indexes(&mut self) -> Result<()> {
let log = &self.options.logger;

let used_custom_body = self.fossicked_pages.iter().any(|page| page.has_custom_body);
Expand Down Expand Up @@ -210,7 +216,8 @@ impl SearchState {
.into_iter()
.map(|(language, pages)| async { build_indexes(pages, language, &self.options).await })
.collect();
self.built_indexes = join_all(indexes).await;
let built_indexes = join_all(indexes).await;
self.built_indexes = built_indexes.into_iter().flat_map(|i| i.ok()).collect();

let stats = self.built_indexes.iter().fold((0, 0, 0, 0), |mut stats, index| {
log.v_info(format!(
Expand Down Expand Up @@ -266,8 +273,13 @@ impl SearchState {
Most likely, the directory passed to Pagefind was empty \
or did not contain any html files.",
);
std::process::exit(1);
bail!(
"Error: Pagefind wasn't able to build an index. \n\
Most likely, the directory passed to Pagefind was empty \
or did not contain any html files."
);
}
Ok(())
}

pub async fn write_files(&self, custom_outdir: Option<PathBuf>) -> PathBuf {
Expand Down
2 changes: 2 additions & 0 deletions pagefind/src/logging.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ pub enum LogLevel {
Verbose,
}

#[allow(dead_code)]
#[derive(Debug, Clone)]
pub enum LogStyle {
Info,
Expand Down Expand Up @@ -55,6 +56,7 @@ lazy_static! {
static ref SUCCESS: Style = Style::new().green();
}

#[allow(dead_code)]
impl Logger {
pub fn new(log_level: LogLevel, use_terminal: bool, logfile: Option<PathBuf>) -> Self {
if let Some(filename) = &logfile {
Expand Down
Loading
Loading