-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[search] Search in downloader by country names.
When a request to search in the downloader arrives, we used to only find features on the world map that match the request and return the mwms that contain these features. This commit mixes in the results of a search performed directly in the country tree (countries.txt) or, to be more precise, in the translations of the country names found there (countries_names.txt). This is not the most efficient implementation, but hopefully it is isolated enough to make improvements easy. It was also useful as an exploration of where our current search APIs are lacking, for example: * The unnecessary std::string<->UniString conversions. * Indexes such as MemSearchIndex pretending to be generic while in fact being tailored to a particular use case. * The difficulty of mixing search results from different sources.
- Loading branch information
Showing
18 changed files
with
314 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../data/countries_names.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
#include "search/countries_names_index.hpp" | ||
|
||
#include "platform/platform.hpp" | ||
|
||
#include "coding/file_reader.hpp" | ||
|
||
#include "base/assert.hpp" | ||
|
||
#include <fstream> | ||
#include <set> | ||
#include <sstream> | ||
|
||
using namespace std; | ||
|
||
namespace search | ||
{ | ||
// Builds a ready-to-query index: the country records are read into
// |m_countries| first, and the search index is then built on top of them.
CountriesNamesIndex::CountriesNamesIndex()
{
  // The order matters: the index is built from the contents of |m_countries|.
  ReadCountryNamesFromFile(m_countries);
  BuildIndexFromTranslations();
}
|
||
void CountriesNamesIndex::CollectMatchingCountries(string const & query, | ||
vector<storage::CountryId> & results) | ||
{ | ||
set<size_t> ids; | ||
auto insertId = [&ids](size_t id, bool /* exactMatch */) { ids.insert(id); }; | ||
|
||
vector<strings::UniString> tokens; | ||
search::NormalizeAndTokenizeString(query, tokens); | ||
search::Delimiters delims; | ||
bool const lastTokenIsPrefix = !query.empty() && !delims(strings::LastUniChar(query)); | ||
for (size_t i = 0; i < tokens.size(); ++i) | ||
{ | ||
auto const & token = tokens[i]; | ||
if (i + 1 == tokens.size() && lastTokenIsPrefix) | ||
Retrieve<strings::PrefixDFAModifier<strings::LevenshteinDFA>>(token, insertId); | ||
else | ||
Retrieve<strings::LevenshteinDFA>(token, insertId); | ||
} | ||
|
||
// todo(@m) Do not bother with tf/idf for now. | ||
results.clear(); | ||
for (auto id : ids) | ||
{ | ||
CHECK_LESS(id, m_countries.size(), ()); | ||
results.emplace_back(m_countries[id].m_countryId); | ||
} | ||
} | ||
|
||
void CountriesNamesIndex::ReadCountryNamesFromFile(vector<Country> & countries) | ||
{ | ||
string contents; | ||
|
||
GetPlatform().GetReader(COUNTRIES_NAMES_FILE)->ReadAsString(contents); | ||
istringstream ifs(contents); | ||
|
||
string line; | ||
countries.clear(); | ||
while (getline(ifs, line)) | ||
{ | ||
if (line.empty()) | ||
continue; | ||
strings::Trim(line); | ||
if (line[0] == '[') | ||
{ | ||
CHECK_EQUAL(line[line.size() - 1], ']', ()); | ||
countries.push_back({}); | ||
countries.back().m_countryId = line.substr(1, line.size() - 2); | ||
continue; | ||
} | ||
auto pos = line.find('='); | ||
if (pos == string::npos) | ||
continue; | ||
// Ignore the language code: the language sets differ for StringUtf8Multilang | ||
// and for the translations used by this class. | ||
auto t = line.substr(pos + 1); | ||
strings::Trim(t); | ||
if (!countries.empty()) | ||
countries.back().m_doc.m_translations.push_back(t); | ||
} | ||
} | ||
|
||
void CountriesNamesIndex::BuildIndexFromTranslations() | ||
{ | ||
for (size_t i = 0; i < m_countries.size(); ++i) | ||
m_index.Add(i, m_countries[i].m_doc); | ||
} | ||
} // namespace search |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#pragma once | ||
|
||
#include "search/base/mem_search_index.hpp" | ||
#include "search/feature_offset_match.hpp" | ||
|
||
#include "storage/storage_defines.hpp" | ||
|
||
#include "indexer/search_string_utils.hpp" | ||
|
||
#include "base/string_utils.hpp" | ||
|
||
#include <cstddef> | ||
#include <string> | ||
#include <utility> | ||
#include <vector> | ||
|
||
namespace search | ||
{ | ||
class CountriesNamesIndex | ||
{ | ||
public: | ||
struct Doc | ||
{ | ||
template <typename Fn> | ||
void ForEachToken(Fn && fn) const | ||
{ | ||
for (auto const & s : m_translations) | ||
fn(StringUtf8Multilang::kDefaultCode, NormalizeAndSimplifyString(s)); | ||
} | ||
|
||
std::vector<std::string> m_translations; | ||
}; | ||
|
||
CountriesNamesIndex(); | ||
|
||
void CollectMatchingCountries(std::string const & query, | ||
std::vector<storage::CountryId> & results); | ||
|
||
private: | ||
struct Country | ||
{ | ||
storage::CountryId m_countryId; | ||
Doc m_doc; | ||
}; | ||
|
||
// todo(@m) Almost the same as in bookmarks/processor.hpp. | ||
template <typename DFA, typename Fn> | ||
void Retrieve(strings::UniString const & s, Fn && fn) const | ||
{ | ||
SearchTrieRequest<DFA> request; | ||
request.m_names.emplace_back(BuildLevenshteinDFA(s)); | ||
request.m_langs.insert(StringUtf8Multilang::kDefaultCode); | ||
|
||
MatchFeaturesInTrie( | ||
request, m_index.GetRootIterator(), [](size_t id) { return true; } /* filter */, | ||
std::forward<Fn>(fn)); | ||
} | ||
|
||
void ReadCountryNamesFromFile(std::vector<Country> & countries); | ||
void BuildIndexFromTranslations(); | ||
|
||
std::vector<Country> m_countries; | ||
search_base::MemSearchIndex<size_t> m_index; | ||
}; | ||
} // namespace search |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.