diff --git a/components/suggest/src/db.rs b/components/suggest/src/db.rs index fbeda64c65..abd9ecaa50 100644 --- a/components/suggest/src/db.rs +++ b/components/suggest/src/db.rs @@ -21,7 +21,7 @@ use crate::{ fakespot, geoname::GeonameCache, pocket::{split_keyword, KeywordConfidence}, - provider::SuggestionProvider, + provider::{AmpMatchingStrategy, SuggestionProvider}, query::FtsQuery, rs::{ DownloadedAmoSuggestion, DownloadedAmpSuggestion, DownloadedAmpWikipediaSuggestion, @@ -321,35 +321,67 @@ impl<'a> SuggestDao<'a> { &self, query: &SuggestionQuery, suggestion_type: AmpSuggestionType, + ) -> Result> { + let strategy = query + .provider_constraints + .as_ref() + .and_then(|c| c.amp_alternative_matching.as_ref()); + match strategy { + None => self.fetch_amp_suggestions_using_keywords(query, suggestion_type, true), + Some(AmpMatchingStrategy::NoKeywordExpansion) => { + self.fetch_amp_suggestions_using_keywords(query, suggestion_type, false) + } + Some(AmpMatchingStrategy::Fts) => { + self.fetch_amp_suggestions_using_fts(query, suggestion_type, "full_keywords") + } + Some(AmpMatchingStrategy::FtsAgainstTitle) => { + self.fetch_amp_suggestions_using_fts(query, suggestion_type, "title") + } + } + } + + pub fn fetch_amp_suggestions_using_keywords( + &self, + query: &SuggestionQuery, + suggestion_type: AmpSuggestionType, + allow_keyword_expansion: bool, ) -> Result> { let keyword_lowercased = &query.keyword.to_lowercase(); let provider = match suggestion_type { AmpSuggestionType::Mobile => SuggestionProvider::AmpMobile, AmpSuggestionType::Desktop => SuggestionProvider::Amp, }; + let where_extra = if allow_keyword_expansion { + "" + } else { + "AND INSTR(CONCAT(fk.full_keyword, ' '), k.keyword) != 0" + }; let suggestions = self.conn.query_rows_and_then_cached( - r#" - SELECT - s.id, - k.rank, - s.title, - s.url, - s.provider, - s.score, - fk.full_keyword - FROM - suggestions s - JOIN - keywords k - ON k.suggestion_id = s.id - LEFT JOIN - full_keywords fk - ON k.full_keyword_id = fk.id - WHERE - s.provider = :provider - AND k.keyword = :keyword - AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url) - "#, + &format!( + r#" + SELECT + s.id, + k.rank, + s.title, + s.url, + s.provider, + s.score, + fk.full_keyword + FROM + suggestions s + JOIN + keywords k + ON k.suggestion_id = s.id + LEFT JOIN + full_keywords fk + ON k.full_keyword_id = fk.id + WHERE + s.provider = :provider + AND k.keyword = :keyword + {where_extra} + AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url) + "# + ), named_params! { ":keyword": keyword_lowercased, ":provider": provider @@ -427,6 +459,96 @@ impl<'a> SuggestDao<'a> { Ok(suggestions) } + pub fn fetch_amp_suggestions_using_fts( + &self, + query: &SuggestionQuery, + suggestion_type: AmpSuggestionType, + fts_column: &str, + ) -> Result> { + let fts_query = query.fts_query(); + let match_arg = &fts_query.match_arg; + let provider = match suggestion_type { + AmpSuggestionType::Mobile => SuggestionProvider::AmpMobile, + AmpSuggestionType::Desktop => SuggestionProvider::Amp, + }; + let suggestions = self.conn.query_rows_and_then_cached( + &format!( + r#" + SELECT + s.id, + s.title, + s.url, + s.provider, + s.score + FROM + suggestions s + JOIN + amp_fts fts + ON fts.rowid = s.id + WHERE + s.provider = :provider + AND amp_fts match '{fts_column}: {match_arg}' + AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url) + ORDER BY rank + LIMIT 1 + "# + ), + named_params! { + ":provider": provider + }, + |row| -> Result { + let suggestion_id: i64 = row.get("id")?; + let title = row.get("title")?; + let raw_url: String = row.get("url")?; + let score: f64 = row.get("score")?; + + self.conn.query_row_and_then( + r#" + SELECT + amp.advertiser, + amp.block_id, + amp.iab_category, + amp.impression_url, + amp.click_url, + i.data AS icon, + i.mimetype AS icon_mimetype + FROM + amp_custom_details amp + LEFT JOIN + icons i ON amp.icon_id = i.id + WHERE + amp.suggestion_id = :suggestion_id + "#, + named_params! { + ":suggestion_id": suggestion_id + }, + |row| { + let cooked_url = cook_raw_suggestion_url(&raw_url); + let raw_click_url = row.get::<_, String>("click_url")?; + let cooked_click_url = cook_raw_suggestion_url(&raw_click_url); + + Ok(Suggestion::Amp { + block_id: row.get("block_id")?, + advertiser: row.get("advertiser")?, + iab_category: row.get("iab_category")?, + title, + url: cooked_url, + raw_url, + full_keyword: query.keyword.clone(), + icon: row.get("icon")?, + icon_mimetype: row.get("icon_mimetype")?, + impression_url: row.get("impression_url")?, + click_url: cooked_click_url, + raw_click_url, + score, + }) + }, + ) + }, + )?; + Ok(suggestions) + } + /// Fetches Suggestions of type Wikipedia provider that match the given query pub fn fetch_wikipedia_suggestions(&self, query: &SuggestionQuery) -> Result> { let keyword_lowercased = &query.keyword.to_lowercase(); @@ -954,6 +1076,7 @@ impl<'a> SuggestDao<'a> { let mut amp_insert = AmpInsertStatement::new(self.conn)?; let mut wiki_insert = WikipediaInsertStatement::new(self.conn)?; let mut keyword_insert = KeywordInsertStatement::new(self.conn)?; + let mut fts_insert = AmpFtsInsertStatement::new(self.conn)?; for suggestion in suggestions { self.scope.err_if_interrupted()?; let common_details = suggestion.common_details(); @@ -974,6 +1097,11 @@ impl<'a> SuggestDao<'a> { wiki_insert.execute(suggestion_id, wikipedia)?; } } + fts_insert.execute( + suggestion_id, + &common_details.full_keywords_joined(), + &common_details.title, + )?; let mut full_keyword_inserter = FullKeywordInserter::new(self.conn, suggestion_id); for keyword in common_details.keywords() { let full_keyword_id = match (suggestion, keyword.full_keyword) { @@ -1764,6 +1892,30 @@ impl<'conn> KeywordMetricsInsertStatement<'conn> { } } +pub(crate) struct AmpFtsInsertStatement<'conn>(rusqlite::Statement<'conn>); + +impl<'conn> AmpFtsInsertStatement<'conn> { + pub(crate) fn new(conn: &'conn Connection) -> Result { + Ok(Self(conn.prepare( + "INSERT INTO amp_fts(rowid, full_keywords, title) + VALUES(?, ?, ?) + ", + )?)) + } + + pub(crate) fn execute( + &mut self, + suggestion_id: i64, + full_keywords: &str, + title: &str, + ) -> Result<()> { + self.0 + .execute((suggestion_id, full_keywords, title)) + .with_context("amp fts insert")?; + Ok(()) + } +} + fn provider_config_meta_key(provider: SuggestionProvider) -> String { format!("{}{}", PROVIDER_CONFIG_META_KEY_PREFIX, provider as u8) } diff --git a/components/suggest/src/lib.rs b/components/suggest/src/lib.rs index 689f39a57b..7c64b7c6ca 100644 --- a/components/suggest/src/lib.rs +++ b/components/suggest/src/lib.rs @@ -28,7 +28,7 @@ pub use config::{SuggestGlobalConfig, SuggestProviderConfig}; pub use error::{Error, SuggestApiError}; pub use geoname::{Geoname, GeonameMatch, GeonameType}; pub use metrics::{LabeledTimingSample, SuggestIngestionMetrics}; -pub use provider::{SuggestionProvider, SuggestionProviderConstraints}; +pub use provider::{AmpMatchingStrategy, SuggestionProvider, SuggestionProviderConstraints}; pub use query::{QueryWithMetricsResult, SuggestionQuery}; pub use store::{InterruptKind, SuggestIngestionConstraints, SuggestStore, SuggestStoreBuilder}; pub use suggestion::{raw_suggestion_url_matches, Suggestion}; diff --git a/components/suggest/src/provider.rs b/components/suggest/src/provider.rs index 7a88f39f30..3cd28c40d6 100644 --- a/components/suggest/src/provider.rs +++ b/components/suggest/src/provider.rs @@ -130,4 +130,21 @@ pub struct SuggestionProviderConstraints { /// settings record(s). #[uniffi(default = None)] pub exposure_suggestion_types: Option>, + /// Which experiment branch should we use for the AMP provider while the FTS experiment is + /// ongoing? + /// `0` means the treatment branch. + #[uniffi(default = None)] + pub amp_alternative_matching: Option, +} + +#[derive(Clone, Debug, uniffi::Enum)] +pub enum AmpMatchingStrategy { + /// Disable keywords added via keyword expansion. + /// This eliminates keywords that for terms related to the "real" keywords, for example + /// misspellings like "underarmor" instead of "under armor"'. + NoKeywordExpansion, + /// Use FTS matching against the full keywords, joined together. + Fts, + /// Use FTS matching against the title field + FtsAgainstTitle, } diff --git a/components/suggest/src/query.rs b/components/suggest/src/query.rs index f094f8c53a..3607549a8f 100644 --- a/components/suggest/src/query.rs +++ b/components/suggest/src/query.rs @@ -131,6 +131,7 @@ impl SuggestionQuery { exposure_suggestion_types: Some( suggestion_types.iter().map(|s| s.to_string()).collect(), ), + ..SuggestionProviderConstraints::default() }), ..Self::default() } diff --git a/components/suggest/src/rs.rs b/components/suggest/src/rs.rs index b74042868a..7dcd76b80f 100644 --- a/components/suggest/src/rs.rs +++ b/components/suggest/src/rs.rs @@ -31,7 +31,7 @@ //! the new suggestion in their results, and return `Suggestion::T` variants //! as needed. -use std::fmt; +use std::{collections::HashSet, fmt}; use remote_settings::{Attachment, RemoteSettingsRecord}; use serde::{Deserialize, Deserializer}; @@ -436,6 +436,23 @@ impl DownloadedSuggestionCommonDetails { }, ) } + + /// Get the full keywords as a single string + pub fn full_keywords_joined(&self) -> String { + let parts: HashSet<_> = self + .full_keywords + .iter() + .flat_map(|(s, _)| s.split_whitespace()) + .collect(); + let mut result = String::new(); + for (i, part) in parts.into_iter().enumerate() { + if i != 0 { + result.push(' '); + } + result.push_str(part); + } + result + } } #[derive(Debug, PartialEq, Eq)] diff --git a/components/suggest/src/schema.rs b/components/suggest/src/schema.rs index 8690ae1521..86931c8c9a 100644 --- a/components/suggest/src/schema.rs +++ b/components/suggest/src/schema.rs @@ -23,7 +23,7 @@ use sql_support::{ /// `clear_database()` by adding their names to `conditional_tables`, unless /// they are cleared via a deletion trigger or there's some other good /// reason not to do so. -pub const VERSION: u32 = 31; +pub const VERSION: u32 = 32; /// The current Suggest database schema. pub const SQL: &str = " @@ -143,6 +143,14 @@ CREATE TRIGGER fakespot_ai AFTER INSERT ON fakespot_custom_details BEGIN WHERE id = new.suggestion_id; END; +CREATE VIRTUAL TABLE IF NOT EXISTS amp_fts USING FTS5( + full_keywords, + title, + content='', + contentless_delete=1, + tokenize=\"porter unicode61 remove_diacritics 2 tokenchars '''-'\" +); + -- DELETE/UPDATE triggers are difficult to implement, since the FTS contents are split between the fakespot_custom_details and suggestions tables. -- If you use an AFTER trigger, then the data from the other table has already been deleted. -- BEFORE triggers are discouraged by the SQLite docs. @@ -597,6 +605,23 @@ CREATE INDEX geonames_alternates_geoname_id ON geonames_alternates(geoname_id); )?; Ok(()) } + 31 => { + // Need to clear the database so that the FTS index will get filled. + clear_database(tx)?; + tx.execute_batch( + " +CREATE VIRTUAL TABLE IF NOT EXISTS amp_fts USING FTS5( + full_keywords, + title, + content='', + contentless_delete=1, + tokenize=\"porter unicode61 remove_diacritics 2 tokenchars '''-'\" +); + + ", + )?; + Ok(()) + } _ => Err(open_database::Error::IncompatibleVersion(version)), } } diff --git a/components/suggest/src/store.rs b/components/suggest/src/store.rs index 0a8eb240f6..1806a40060 100644 --- a/components/suggest/src/store.rs +++ b/components/suggest/src/store.rs @@ -912,7 +912,9 @@ pub(crate) mod tests { use std::sync::atomic::{AtomicUsize, Ordering}; - use crate::{suggestion::FtsMatchInfo, testing::*, SuggestionProvider}; + use crate::{ + provider::AmpMatchingStrategy, suggestion::FtsMatchInfo, testing::*, SuggestionProvider, + }; /// In-memory Suggest store for testing pub(crate) struct TestStore { @@ -1147,6 +1149,128 @@ pub(crate) mod tests { Ok(()) } + #[test] + fn amp_no_keyword_expansion() -> anyhow::Result<()> { + before_each(); + + let store = TestStore::new( + MockRemoteSettingsClient::default() + // Setup the keywords such that: + // * There's a `chicken` keyword, which is not a substring of any full + // keywords (i.e. it was the result of keyword expansion). + // * There's a `los pollos ` keyword with an extra space + .with_record( + "data", + "1234", + los_pollos_amp().merge(json!({ + "keywords": ["los", "los pollos", "los pollos ", "los pollos hermanos", "chicken"], + "full_keywords": [("los pollos", 3), ("los pollos hermanos", 2)], + })) + ) + .with_icon(los_pollos_icon()), + ); + store.ingest(SuggestIngestionConstraints::all_providers()); + assert_eq!( + store.fetch_suggestions(SuggestionQuery { + provider_constraints: Some(SuggestionProviderConstraints { + amp_alternative_matching: Some(AmpMatchingStrategy::NoKeywordExpansion), + ..SuggestionProviderConstraints::default() + }), + // Should not match, because `chicken` is not a substring of a full keyword. + // i.e. it was added because of keyword expansion. + ..SuggestionQuery::amp("chicken") + }), + vec![], + ); + assert_eq!( + store.fetch_suggestions(SuggestionQuery { + provider_constraints: Some(SuggestionProviderConstraints { + amp_alternative_matching: Some(AmpMatchingStrategy::NoKeywordExpansion), + ..SuggestionProviderConstraints::default() + }), + // Should match, even though "los pollos " technically is not a substring + // because there's an extra space. The reason these keywords are in the DB is + // because we want to keep showing the current suggestion when the user types + // the space key. + ..SuggestionQuery::amp("los pollos ") + }), + vec![los_pollos_suggestion("los pollos")], + ); + Ok(()) + } + + #[test] + fn amp_fts() -> anyhow::Result<()> { + before_each(); + + let store = TestStore::new( + MockRemoteSettingsClient::default() + // Setup the keywords such that: + // * There's a `chicken` keyword, which is not a substring of any full + // keywords (i.e. it was the result of keyword expansion). + // * There's a `los pollos ` keyword with an extra space + .with_record( + "data", + "1234", + los_pollos_amp().merge(json!({ + "keywords": ["los", "los pollos", "los pollos ", "los pollos hermanos"], + "full_keywords": [("los pollos", 3), ("los pollos hermanos", 1)], + })), + ) + .with_icon(los_pollos_icon()), + ); + store.ingest(SuggestIngestionConstraints::all_providers()); + assert_eq!( + store.fetch_suggestions(SuggestionQuery { + provider_constraints: Some(SuggestionProviderConstraints { + amp_alternative_matching: Some(AmpMatchingStrategy::Fts), + ..SuggestionProviderConstraints::default() + }), + // "Hermanos" should match, even though it's not listed in the keywords, + // because this strategy uses an FTS match against the full keyword list. + ..SuggestionQuery::amp("Hermanos") + }), + vec![los_pollos_suggestion("Hermanos")], + ); + Ok(()) + } + + #[test] + fn amp_fts_against_title() -> anyhow::Result<()> { + before_each(); + + let store = TestStore::new( + MockRemoteSettingsClient::default() + // Setup the keywords such that: + // * There's a `chicken` keyword, which is not a substring of any full + // keywords (i.e. it was the result of keyword expansion). + // * There's a `los pollos ` keyword with an extra space + .with_record( + "data", + "1234", + los_pollos_amp().merge(json!({ + "keywords": ["los", "los pollos", "los pollos ", "los pollos hermanos"], + "full_keywords": [("los pollos", 3), ("los pollos hermanos", 1)], + })), + ) + .with_icon(los_pollos_icon()), + ); + store.ingest(SuggestIngestionConstraints::all_providers()); + assert_eq!( + store.fetch_suggestions(SuggestionQuery { + provider_constraints: Some(SuggestionProviderConstraints { + amp_alternative_matching: Some(AmpMatchingStrategy::FtsAgainstTitle), + ..SuggestionProviderConstraints::default() + }), + // "Albuquerque" should match, even though it's not listed in the keywords, + // because this strategy uses an FTS match against the title + ..SuggestionQuery::amp("Albuquerque") + }), + vec![los_pollos_suggestion("Albuquerque")], + ); + Ok(()) + } + /// Tests ingesting a data attachment containing a single suggestion, /// instead of an array of suggestions. #[test] @@ -2538,6 +2662,7 @@ pub(crate) mod tests { providers: Some(vec![SuggestionProvider::Exposure]), provider_constraints: Some(SuggestionProviderConstraints { exposure_suggestion_types: Some(vec!["aaa".to_string(), "bbb".to_string()]), + ..SuggestionProviderConstraints::default() }), ..SuggestIngestionConstraints::all_providers() }); @@ -2810,6 +2935,7 @@ pub(crate) mod tests { providers: Some(vec![SuggestionProvider::Exposure]), provider_constraints: Some(SuggestionProviderConstraints { exposure_suggestion_types: Some(vec!["aaa".to_string()]), + ..SuggestionProviderConstraints::default() }), ..SuggestIngestionConstraints::all_providers() }); @@ -2847,6 +2973,7 @@ pub(crate) mod tests { providers: Some(vec![SuggestionProvider::Exposure]), provider_constraints: Some(SuggestionProviderConstraints { exposure_suggestion_types: Some(vec!["aaa".to_string()]), + ..SuggestionProviderConstraints::default() }), ..SuggestIngestionConstraints::all_providers() }); @@ -2946,6 +3073,7 @@ pub(crate) mod tests { providers: Some(vec![SuggestionProvider::Exposure]), provider_constraints: Some(SuggestionProviderConstraints { exposure_suggestion_types: Some(vec!["bbb".to_string()]), + ..SuggestionProviderConstraints::default() }), ..SuggestIngestionConstraints::all_providers() }); @@ -2979,6 +3107,7 @@ pub(crate) mod tests { providers: Some(vec![SuggestionProvider::Exposure]), provider_constraints: Some(SuggestionProviderConstraints { exposure_suggestion_types: Some(vec!["aaa".to_string()]), + ..SuggestionProviderConstraints::default() }), ..SuggestIngestionConstraints::all_providers() }); @@ -3029,6 +3158,7 @@ pub(crate) mod tests { providers: Some(vec![SuggestionProvider::Exposure]), provider_constraints: Some(SuggestionProviderConstraints { exposure_suggestion_types: Some(vec!["aaa".to_string()]), + ..SuggestionProviderConstraints::default() }), ..SuggestIngestionConstraints::all_providers() }); @@ -3063,6 +3193,7 @@ pub(crate) mod tests { providers: Some(vec![SuggestionProvider::Exposure]), provider_constraints: Some(SuggestionProviderConstraints { exposure_suggestion_types: Some(vec!["aaa".to_string()]), + ..SuggestionProviderConstraints::default() }), ..SuggestIngestionConstraints::all_providers() }); diff --git a/examples/suggest-cli/src/main.rs b/examples/suggest-cli/src/main.rs index 8565e42dd2..3cf3a27533 100644 --- a/examples/suggest-cli/src/main.rs +++ b/examples/suggest-cli/src/main.rs @@ -9,8 +9,8 @@ use clap::{Parser, Subcommand, ValueEnum}; use remote_settings::RemoteSettingsServer; use suggest::{ - SuggestIngestionConstraints, SuggestStore, SuggestStoreBuilder, SuggestionProvider, - SuggestionQuery, + AmpMatchingStrategy, SuggestIngestionConstraints, SuggestStore, SuggestStoreBuilder, + SuggestionProvider, SuggestionProviderConstraints, SuggestionQuery, }; static DB_PATH: &str = "suggest.db"; @@ -56,9 +56,31 @@ enum Commands { provider: Option, /// Input to search input: String, + #[clap(long, short)] + amp_matching_strategy: Option, }, } +#[derive(Clone, Debug, ValueEnum)] +enum AmpMatchingStrategyArg { + /// Use keyword matching, without keyword expansion + NoKeyword, + /// Use FTS matching + Fts, + /// Use FTS matching against the title + FtsTitle, +} + +impl Into for AmpMatchingStrategyArg { + fn into(self) -> AmpMatchingStrategy { + match self { + Self::NoKeyword => AmpMatchingStrategy::NoKeywordExpansion, + Self::Fts => AmpMatchingStrategy::Fts, + Self::FtsTitle => AmpMatchingStrategy::FtsAgainstTitle, + } + } +} + #[derive(Clone, Debug, ValueEnum)] enum SuggestionProviderArg { Amp, @@ -109,7 +131,15 @@ fn main() -> Result<()> { provider, input, fts_match_info, - } => query(&store, provider, input, fts_match_info, cli.verbose), + amp_matching_strategy, + } => query( + &store, + provider, + input, + fts_match_info, + amp_matching_strategy, + cli.verbose, + ), }; Ok(()) } @@ -185,6 +215,7 @@ fn query( provider: Option, input: String, fts_match_info: bool, + amp_matching_strategy: Option, verbose: bool, ) { let query = SuggestionQuery { @@ -193,6 +224,10 @@ fn query( None => SuggestionProvider::all().to_vec(), }, keyword: input, + provider_constraints: Some(SuggestionProviderConstraints { + amp_alternative_matching: amp_matching_strategy.map(Into::into), + ..SuggestionProviderConstraints::default() + }), ..SuggestionQuery::default() }; let mut results = store