diff --git a/.gitignore b/.gitignore
index a1c2a23..c5c2590 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,8 @@
+# ~~~ Custom ignorables ~~~
+hh-school-search-test/
+Todos.java
+
+# ~~~ Java .gitignore template ~~~
# Compiled class file
*.class
@@ -21,3 +26,89 @@
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
+
+# ~~~ Maven .gitignore template ~~~
+target/
+pom.xml.tag
+pom.xml.releaseBackup
+pom.xml.versionsBackup
+pom.xml.next
+release.properties
+dependency-reduced-pom.xml
+buildNumber.properties
+.mvn/timing.properties
+# https://github.com/takari/maven-wrapper#usage-without-binary-jar
+.mvn/wrapper/maven-wrapper.jar
+
+# ~~~ JetBrains .gitignore template ~~~
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn. Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
diff --git a/.idea/compiler.xml b/.idea/compiler.xml
new file mode 100644
index 0000000..7bfaa6d
--- /dev/null
+++ b/.idea/compiler.xml
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..d24ea8e
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/README.md b/README.md
index a06ef0a..081b728 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,58 @@
# hh-school-search
+
+## Реализованные возможности
+
+**Основные задания:**
+1. Индексация
+2. Поиск
+
+Возможность явного указания пути к индексу была намеренно исключена ввиду сложной структуры файлов, однако имеется возможность вручную редактировать, копировать, заменять эти файлы с помощью нативных средств ОС,
+а также пересоздать их с помощью специальных команд. Подробнее описано в следующих разделах.
+
+**Дополнительные задания:**
+1. Операторы AND и NOT (NOT реализован в виде бинарного оператора, исключающего из результатов левого операнда результаты правого операнда - как разность множеств).
+2. Фразовые запросы (они же - c ограничением расстояния). Синтаксис: `<< term1 term2 ... >>` (двойные угловые скобки, как подобие кавычек).
+В качестве результата запрос возвращает первый токен удовлетворяющей фразы в документе.
+Поддерживается символ "любого слова": `<< term1 * term3 >>`. Стоп-слова в запросе не поддерживаются, но их положение учитывается в самом документе при поиске.
+Пример: `<< Marina * * Diamond >>` => "Marina and the Diamond", "Marina Developer in Diamond", ...
+3. Применена оптимизация по нахождению пересечения множеств (см. раздел «Также реализовано»).
+4. ENTRY_OR: OR с задаваемым минимальным количеством вхождений. Синтаксис: `N[ ... ]`, где N — минимальное число вхождений.
+Пример из задания: `2[java scala kotlin]`.
+5. *(От себя):* Вложенные подзапросы и приоритет операций (круглые скобки).
+Примеры: `Developer AND (C# OR C++)`, `3[белый (серый NOT (гусь OR утка)) <<из ржаной муки>> черный] AND (хлеб OR булка)`.
+
+**Также реализовано:**
+1. Работа со стоп-словами, знаками препинания, специальными терминами (включающими в себя знаки препинания, как "C++").
+2. Оптимизация: инвертированный индекс по токенам позволяет находить конкретное место в документе, удовлетворяющее запросу.
+3. Оптимизация: инвертированный индекс по документам позволяет быстрее находить пересечение по документам (используется в AND, ENTRY_OR, NOT), не выполняя distinct над токенами.
+4. Оптимизация: операция пересечения (AND) выполняется в порядке увеличения результатов, удовлетворяющих отдельным операндам пересечения. Результат: меньшее число проверок и операций добавления/удаления.
+
+**Примечания:**
+1. OR должен быть явно указан между операндами (сделано так с перспективой лучшего анализа и ранжирования. Например, понимание ".NET Framework" в первую очередь как целого терма).
+2. Фразовый запрос не может содержать подзапросов, только термы или `*` через пробел.
+3. Использование стоп-слов в запросах не запрещается, но поиск ведется по термам, из которых таковые исключены, следовательно для такого операнда не найдется удовлетворяющих токенов, о чем появляется предупреждение при попытке выполнить такой запрос.
+
+## Сборка и запуск
+
+1. Сборка в среде IntelliJ IDEA: `Вкладка 'Maven' > package`
+2. Скопируйте файл `hh-school-search-1.0-SNAPSHOT-jar-with-dependencies.jar` в удобную директорию
+3. Инициализация рабочей директории (файлы ресурсов): `java -jar hh-school-search-1.0-SNAPSHOT-jar-with-dependencies.jar init`
+4. Индексация: `java -jar hh-school-search-1.0-SNAPSHOT-jar-with-dependencies.jar index <путь к файлу>`
+5. Поиск: `java -jar hh-school-search-1.0-SNAPSHOT-jar-with-dependencies.jar search "<запрос>"`
+
+* Инициализация (восстановление по умолчанию) только файлов словарей: `java -jar hh-school-search-1.0-SNAPSHOT-jar-with-dependencies.jar init dict`
+* Инициализация (восстановление по умолчанию) только файлов индекса: `java -jar hh-school-search-1.0-SNAPSHOT-jar-with-dependencies.jar init index`
+
+## Настройка ресурсов
+
+Пакет jar создает и использует файлы словарей и индекса в папке `./res`.
+Вы можете изменить их в любом удобном текстовом редакторе.
+
+В папке репозитория [`src/main/resources`](src/main/resources) расположены файлы, содержащие словарь знаков пунктуации, специальных терминов, стоп-слов и т.п., встраиваемые в пакет jar и генерируемые по умолчанию командой `init`.
+
+Вы можете использовать комментарии в этих файлах.
+Начало комментария = (начало строки или пробел, решетка, не решетка); конец комментария = конец строки (regex: `/(^|\s)#[^#].*$/`).
+
## Дз по поиску
Реализовать свой простой поиск на java, который может работать в 2 режимах:
diff --git a/hh-school-search.iml b/hh-school-search.iml
new file mode 100644
index 0000000..78b2cc5
--- /dev/null
+++ b/hh-school-search.iml
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..c502520
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,75 @@
+
+
+ 4.0.0
+
+ org.example
+ hh-school-search
+ 1.0-SNAPSHOT
+
+
+ 11
+ 11
+ UTF-8
+
+
+
+
+ com.google.code.gson
+ gson
+ 2.8.6
+
+
+ junit
+ junit
+ 4.12
+ compile
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+
+
+
+ true
+ HHSchoolSearch.Main
+
+
+
+
+
+
+
+ maven-assembly-plugin
+ 3.2.0
+
+
+ jar-with-dependencies
+
+
+
+ HHSchoolSearch.Main
+
+
+
+
+
+ make-assembly
+ package
+
+ single
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/main/java/HHSchoolSearch/Commands.java b/src/main/java/HHSchoolSearch/Commands.java
new file mode 100644
index 0000000..eb5849d
--- /dev/null
+++ b/src/main/java/HHSchoolSearch/Commands.java
@@ -0,0 +1,155 @@
+package HHSchoolSearch;
+
+import HHSchoolSearch.Index.Indexer;
+import HHSchoolSearch.Search.Querier;
+import HHSchoolSearch.Utils.Export;
+import HHSchoolSearch.Utils.Files;
+
+import java.util.List;
+
+public class Commands
+{
+ public static void execute(String[] args)
+ {
+ if (args == null || args.length == 0)
+ help(args);
+
+ else switch (args[0])
+ {
+ case "help": help(args); break;
+ case "init": init(args); break;
+ case "index": index(args); break;
+ case "search": search(args); break;
+ default: error(args); break;
+ }
+ }
+
+ private static void error(String[] args)
+ {
+ System.out.println(String.format("Command \"%s\" was not recognized.", args[0]));
+ System.out.println();
+
+ help(null);
+ }
+
+ private static void init(String[] args)
+ {
+ boolean
+ dict = args.length < 2 || args[1].equals("dict"),
+ index = args.length < 2 || args[1].equals("index"),
+ unknown = !(dict || index);
+
+ try
+ {
+ if (dict)
+ {
+ System.out.print("init dict ...");
+
+ for (var path : List.of(
+ Resources.getPathSpecialWords(),
+ Resources.getPathPunctuations(),
+ Resources.getPathStopWords()
+ ))
+ Export.exportResourceFile(path, "." + path);
+
+ System.out.println("OK");
+ }
+
+ if (index)
+ {
+ System.out.print("init index ...");
+
+ for (var path : List.of(
+ Resources.getPathAllDocs(),
+ Resources.getPathAllTerms(),
+ Resources.getPathInvertedDocs(),
+ Resources.getPathInvertedTerms()
+ ))
+ Export.exportResourceFile(path, "." + path);
+
+ System.out.println("OK");
+ }
+
+ if (unknown)
+ {
+ System.err.println(String.format("Unknown directory \"%s\".", args[1]));
+ }
+ }
+ catch (Exception ex)
+ {
+ System.err.println(String.format("FAILED. Message: %s", ex.getMessage()));
+ }
+ }
+
+ private static void index(String[] args)
+ {
+ if (!Resources.initialize())
+ {
+ System.err.println("Failed to load resources. Check validity of the files in \"./res\" or try creating samples by \"init\" command.");
+ }
+ else if (!Resources.loadIndex())
+ {
+ System.err.println("Failed to load index. Check validity of the files in \"./res\" or try creating samples by \"init\" command.");
+ }
+ else
+ {
+ if (args.length < 2)
+ {
+ help(args);
+ return;
+ }
+
+ try
+ {
+ var path = args[1];
+ var lines = Files.readLinesFromFile(path);
+ Indexer.indexLinesAsDocs(path, lines);
+ }
+ catch (Exception ex)
+ {
+ System.err.println(String.format("Failed indexing documents. Message: %s", ex.getMessage()));
+ }
+ finally
+ {
+ Resources.saveIndex();
+ }
+ }
+ }
+
+ private static void search(String[] args)
+ {
+ if (!Resources.initialize())
+ {
+ System.err.println("Failed to load resources. Check validity of the files in \"./res\" or try creating samples by \"init\" command.");
+ }
+ else if (!Resources.loadIndex())
+ {
+ System.err.println("Failed to load index. Check validity of the files in \"./res\" or try creating samples by \"init\" command.");
+ }
+ else
+ {
+ System.out.println();
+
+ if (args.length < 2)
+ {
+ help(args);
+ return;
+ }
+
+ try
+ {
+ var query = args[1];
+ Querier.performQuery(query);
+ }
+ catch (Throwable th)
+ {
+ System.err.println(String.format("Failed completing search. Message: %s", th.getMessage()));
+ }
+ }
+ }
+
+ private static void help(String[] args)
+ {
+ System.out.println("Available params: help, init, init dict, init index, index , search \"\"");
+ }
+}
diff --git a/src/main/java/HHSchoolSearch/Index/AllDocs.java b/src/main/java/HHSchoolSearch/Index/AllDocs.java
new file mode 100644
index 0000000..a5c4d20
--- /dev/null
+++ b/src/main/java/HHSchoolSearch/Index/AllDocs.java
@@ -0,0 +1,64 @@
+package HHSchoolSearch.Index;
+
+import HHSchoolSearch.Model.Document;
+import com.google.gson.annotations.Expose;
+
+import java.util.HashMap;
+
+public class AllDocs
+{
+ @Expose
+ private int lastIssuedId;
+ @Expose
+ private HashMap docsIdMap;
+
+ private static AllDocs singleton;
+ private AllDocs() { }
+
+ public static boolean initialize(AllDocs providedSingleton) throws Exception
+ {
+ if (providedSingleton != null)
+ {
+ if (providedSingleton.docsIdMap == null)
+ throw new Exception("Invalid JSON content in res/index/all_docs.json");
+
+ singleton = providedSingleton;
+ }
+ else
+ {
+ singleton = new AllDocs();
+ singleton.lastIssuedId = -1;
+ singleton.docsIdMap = new HashMap<>();
+ }
+
+ return true;
+ }
+
+ public static void registerDocument(Document doc)
+ {
+ var docId = doc.getDocId();
+ getDocsIdMap().put(docId, doc);
+ }
+
+ public static boolean containsDoc(int docId)
+ {
+ return getDocsIdMap().containsKey(docId);
+ }
+
+ public static Document getDoc(int docId)
+ {
+ return getDocsIdMap().getOrDefault(docId, null);
+ }
+
+ public static int issueId()
+ {
+ var newId = getLastIssuedId() + 1;
+ singleton.lastIssuedId = newId;
+
+ return newId;
+ }
+
+ public static AllDocs getSingleton() { return singleton; }
+ public static int getLastIssuedId() { return singleton.lastIssuedId; }
+ public static HashMap getDocsIdMap() { return singleton.docsIdMap; }
+}
diff --git a/src/main/java/HHSchoolSearch/Index/AllTerms.java b/src/main/java/HHSchoolSearch/Index/AllTerms.java
new file mode 100644
index 0000000..82da22f
--- /dev/null
+++ b/src/main/java/HHSchoolSearch/Index/AllTerms.java
@@ -0,0 +1,97 @@
+package HHSchoolSearch.Index;
+
+import HHSchoolSearch.Model.Term;
+import com.google.gson.annotations.Expose;
+
+import java.util.HashMap;
+
+public class AllTerms
+{
+ @Expose
+ private int lastIssuedId;
+ @Expose
+ private HashMap termsIdMap;
+ @Expose
+ private HashMap termsStrMap;
+
+ private static AllTerms singleton;
+ public static AllTerms getSingleton() { return singleton; }
+
+ private static int getLastIssuedId() { return singleton.lastIssuedId; }
+ private static void setLastIssuedId(int lastIssuedId) { singleton.lastIssuedId = lastIssuedId; }
+
+ public static HashMap getTermsIdMap() { return singleton.termsIdMap; }
+ public static void setTermsIdMap(HashMap termsIdMap) { singleton.termsIdMap = termsIdMap; }
+
+ public static HashMap getTermsStrMap() { return singleton.termsStrMap; }
+ public static void setTermsStrMap(HashMap termsStrMap) { singleton.termsStrMap = termsStrMap; }
+
+ private AllTerms() { }
+
+ public static boolean initialize(AllTerms providedSingleton) throws Exception
+ {
+ if (providedSingleton != null)
+ {
+ if (providedSingleton.termsIdMap == null || providedSingleton.termsStrMap == null)
+ throw new Exception("Invalid JSON content in res/index/all_terms.json");
+
+ singleton = providedSingleton;
+ }
+ else
+ {
+ singleton = new AllTerms();
+
+ setLastIssuedId(-1);
+ setTermsIdMap(new HashMap<>());
+ setTermsStrMap(new HashMap<>());
+ }
+
+ return true;
+ }
+
+ public static void registerTerm(Term term)
+ {
+ getTermsIdMap().put(term.getTermId(), term);
+ getTermsStrMap().put(term.getTermString(), term);
+ }
+
+ public static boolean containsTerm(String termString)
+ {
+ return getTermsStrMap().containsKey(termString);
+ }
+ public static Term findTerm(String termString)
+ {
+ return getTermsStrMap().getOrDefault(termString, null);
+ }
+
+ public static boolean containsTerm(int termId)
+ {
+ return getTermsIdMap().containsKey(termId);
+ }
+ public static Term findTerm(int termId)
+ {
+ return getTermsIdMap().getOrDefault(termId, null);
+ }
+
+ private static int issueId()
+ {
+ var newId = getLastIssuedId() + 1;
+ setLastIssuedId(newId);
+
+ return newId;
+ }
+
+ public static Term provideTerm(String termString)
+ {
+ if (getTermsStrMap().containsKey(termString))
+ return getTermsStrMap().get(termString);
+ else
+ {
+ var termId = issueId();
+ var term = new Term(termId, termString);
+
+ registerTerm(term);
+ return term;
+ }
+ }
+}
diff --git a/src/main/java/HHSchoolSearch/Index/Indexer.java b/src/main/java/HHSchoolSearch/Index/Indexer.java
new file mode 100644
index 0000000..e189201
--- /dev/null
+++ b/src/main/java/HHSchoolSearch/Index/Indexer.java
@@ -0,0 +1,25 @@
+package HHSchoolSearch.Index;
+
+import HHSchoolSearch.Model.Document;
+
+import java.util.List;
+
+public class Indexer
+{
+ public static void indexLinesAsDocs(String fileName, List lines)
+ {
+ for (var docLine : lines)
+ {
+ var docName = String.format("%s[%d]", fileName, lines.indexOf(docLine));
+ // вынести factory или пусть так
+ var docId = AllDocs.issueId();
+ var doc = new Document(docId, docName, docLine);
+
+ Tokenizer.tokenize(doc, true);
+
+ AllDocs.registerDocument(doc);
+ InvertedTerms.registerEntriesInDoc(doc);
+ InvertedDocs.registerEntriesInDoc(doc);
+ }
+ }
+}
diff --git a/src/main/java/HHSchoolSearch/Index/InvertedDocs.java b/src/main/java/HHSchoolSearch/Index/InvertedDocs.java
new file mode 100644
index 0000000..f73a47f
--- /dev/null
+++ b/src/main/java/HHSchoolSearch/Index/InvertedDocs.java
@@ -0,0 +1,80 @@
+package HHSchoolSearch.Index;
+
+import HHSchoolSearch.Model.Document;
+import HHSchoolSearch.Model.Term;
+import HHSchoolSearch.Model.Token;
+import com.google.gson.annotations.Expose;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+
+public class InvertedDocs
+{
+ @Expose
+ private HashMap> entries;
+
+ private static InvertedDocs singleton;
+ private InvertedDocs() { }
+
+ public static boolean initialize(InvertedDocs providedSingleton) throws Exception
+ {
+ if (providedSingleton != null)
+ {
+ if (providedSingleton.entries == null)
+ throw new Exception("Invalid JSON content in res/index/inverted_docs.json");
+
+ singleton = providedSingleton;
+ }
+ else
+ {
+ singleton = new InvertedDocs();
+ setEntries(new HashMap<>());
+ }
+
+ return true;
+ }
+
+ public static HashMap> getEntries() { return singleton.entries; }
+ public static void setEntries(HashMap> entries) { singleton.entries = entries; }
+
+ public static InvertedDocs getSingleton() { return singleton; }
+
+ public static void registerEntriesInDoc(Document doc)
+ {
+ for (var token : doc.getTokens()) registerEntry(token);
+ }
+
+ public static void registerEntry(Token token)
+ {
+ var term = AllTerms.findTerm(token.getTermId());
+ var termId = term.getTermId();
+
+ Set docs;
+ if (getEntries().containsKey(termId))
+ docs = getEntries().get(termId);
+ else
+ {
+ docs = new HashSet<>();
+ getEntries().put(termId, docs);
+ }
+
+ docs.add(token.getDocumentId());
+ }
+
+ public static boolean containsTerm(Term term)
+ {
+ return getEntries().containsKey(term.getTermId());
+ }
+
+ public static Set getTermEntries(Term term)
+ {
+ if (term == null) return new HashSet<>();
+
+ var termId = term.getTermId();
+
+ if (getEntries().containsKey(termId))
+ return getEntries().get(termId);
+ return new HashSet<>();
+ }
+}
diff --git a/src/main/java/HHSchoolSearch/Index/InvertedTerms.java b/src/main/java/HHSchoolSearch/Index/InvertedTerms.java
new file mode 100644
index 0000000..33d7b02
--- /dev/null
+++ b/src/main/java/HHSchoolSearch/Index/InvertedTerms.java
@@ -0,0 +1,80 @@
+package HHSchoolSearch.Index;
+
+import HHSchoolSearch.Model.Document;
+import HHSchoolSearch.Model.Term;
+import HHSchoolSearch.Model.Token;
+import com.google.gson.annotations.Expose;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+
+public class InvertedTerms
+{
+ @Expose
+ private HashMap> entries;
+
+ private static InvertedTerms singleton;
+ private InvertedTerms() { }
+
+ public static boolean initialize(InvertedTerms providedSingleton) throws Exception
+ {
+ if (providedSingleton != null)
+ {
+ if (providedSingleton.entries == null)
+ throw new Exception("Invalid JSON content in res/index/inverted_terms.json");
+
+ singleton = providedSingleton;
+ }
+ else
+ {
+ singleton = new InvertedTerms();
+ setEntries(new HashMap<>());
+ }
+
+ return true;
+ }
+
+ public static HashMap> getEntries() { return singleton.entries; }
+ public static void setEntries(HashMap> entries) { singleton.entries = entries; }
+
+ public static InvertedTerms getSingleton() { return singleton; }
+
+ public static void registerEntriesInDoc(Document doc)
+ {
+ for (var token : doc.getTokens()) registerEntry(token);
+ }
+
+ public static void registerEntry(Token token)
+ {
+ var term = AllTerms.findTerm(token.getTermId());
+ var termId = term.getTermId();
+
+ Set tokens;
+ if (getEntries().containsKey(termId))
+ tokens = getEntries().get(termId);
+ else
+ {
+ tokens = new HashSet<>();
+ getEntries().put(termId, tokens);
+ }
+
+ tokens.add(token);
+ }
+
+ public static boolean containsTerm(Term term)
+ {
+ return getEntries().containsKey(term.getTermId());
+ }
+
+ public static Set getTermEntries(Term term)
+ {
+ if (term == null) return new HashSet<>();
+
+ var termId = term.getTermId();
+
+ if (getEntries().containsKey(termId))
+ return getEntries().get(termId);
+ return new HashSet<>();
+ }
+}
diff --git a/src/main/java/HHSchoolSearch/Index/Serializer.java b/src/main/java/HHSchoolSearch/Index/Serializer.java
new file mode 100644
index 0000000..9d950df
--- /dev/null
+++ b/src/main/java/HHSchoolSearch/Index/Serializer.java
@@ -0,0 +1,72 @@
+package HHSchoolSearch.Index;
+
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+
+public class Serializer
+{
+ private static Gson initGson()
+ {
+ return new GsonBuilder()
+ .excludeFieldsWithoutExposeAnnotation()
+ .create();
+ }
+
+ // Сначала задумывалось, что эти методы будут сильно различаться. Пока оставил.
+
+ public static String jsonAllDocs()
+ {
+ var gson = initGson();
+ var allDocs = AllDocs.getSingleton();
+
+ return gson.toJson(allDocs);
+ }
+
+ public static String jsonAllTerms()
+ {
+ var gson = initGson();
+ var allTerms = AllTerms.getSingleton();
+
+ return gson.toJson(allTerms);
+ }
+
+ public static String jsonInvertedDocs()
+ {
+ var gson = initGson();
+ var invertedDocs = InvertedDocs.getSingleton();
+
+ return gson.toJson(invertedDocs);
+ }
+
+ public static String jsonInvertedTerms()
+ {
+ var gson = initGson();
+ var invertedTerms = InvertedTerms.getSingleton();
+
+ return gson.toJson(invertedTerms);
+ }
+
+ public static AllDocs loadAllDocs(String json)
+ {
+ var gson = initGson();
+ return gson.fromJson(json, AllDocs.class);
+ }
+
+ public static AllTerms loadAllTerms(String json)
+ {
+ var gson = initGson();
+ return gson.fromJson(json, AllTerms.class);
+ }
+
+ public static InvertedDocs loadInvertedDocs(String json)
+ {
+ var gson = initGson();
+ return gson.fromJson(json, InvertedDocs.class);
+ }
+
+ public static InvertedTerms loadInvertedTerms(String json)
+ {
+ var gson = initGson();
+ return gson.fromJson(json, InvertedTerms.class);
+ }
+}
diff --git a/src/main/java/HHSchoolSearch/Index/Tokenizer.java b/src/main/java/HHSchoolSearch/Index/Tokenizer.java
new file mode 100644
index 0000000..bd782cc
--- /dev/null
+++ b/src/main/java/HHSchoolSearch/Index/Tokenizer.java
@@ -0,0 +1,175 @@
+package HHSchoolSearch.Index;
+
+import HHSchoolSearch.Model.Document;
+import HHSchoolSearch.Model.Term;
+import HHSchoolSearch.Model.Token;
+import HHSchoolSearch.Resources;
+import HHSchoolSearch.Utils.Strings;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+public class Tokenizer
+{
+ public static List tokenize(Document document, boolean assign)
+ {
+ // Получение ресурсов
+ var specialsSet = Resources.getSpecialWords();
+ var punctuationsSet = Resources.getPunctuations();
+ var stopWordsSet = Resources.getStopWords();
+
+ // Приведение к единому регистру
+ var rawContent = document.getDocContent().toLowerCase();
+ var specialsLow = specialsSet.stream().map(String::toLowerCase).collect(Collectors.toList());
+ var stopWordsLow = stopWordsSet.stream().map(String::toLowerCase).collect(Collectors.toSet());
+
+ // Шаг 1: выборка специальных терминов
+ var tokensAfterSpecials = tokenizeSpecials(rawContent, specialsLow, 0);
+ // Шаг 2: разбивка остального по словам
+ var tokens = dropPunctuationsAndSpaces(tokensAfterSpecials, punctuationsSet);
+ // Шаг 3: исключение шумовых стоп-слов
+ dropStopWords(tokens, stopWordsLow);
+
+ // Связывание токенов с термами
+ tokenizeTerms(tokens);
+
+ // Связывание токенов с положением в документе
+ var readyTokens = finishTokenizing(tokens, document);
+
+ // Сохранение токенов в объекте документа
+ if (assign) document.putTokens(readyTokens);
+
+ return readyTokens;
+ }
+
+ private static List tokenizeSpecials(String content, List specials, int currentSpecialIndex)
+ {
+ if (content.isBlank())
+ {
+ return new ArrayList<>();
+ }
+
+ if (currentSpecialIndex >= specials.size())
+ {
+ var tokenizedContent = new RawToken(null, content);
+ return List.of(tokenizedContent);
+ }
+
+ var currentSpecial = specials.get(currentSpecialIndex);
+ var currentSpecialRegex = Strings.escapeRegex(currentSpecial);
+
+ var specialTerm = AllTerms.provideTerm(currentSpecial);
+ var specialToken = new RawToken(specialTerm, currentSpecial);
+
+ var parts = content.split(currentSpecialRegex, -1);
+ var resultingList = new ArrayList();
+
+ for (var part : parts)
+ {
+ var innerTokens = tokenizeSpecials(part, specials, currentSpecialIndex + 1);
+
+ resultingList.addAll(innerTokens);
+ resultingList.add(specialToken);
+ }
+ resultingList.remove(resultingList.size() - 1);
+
+ var nonEmptyResultingList = resultingList.stream()
+ .filter(token -> !token.string.isBlank())
+ .collect(Collectors.toList());
+
+ return nonEmptyResultingList;
+ }
+
+ private static List dropPunctuationsAndSpaces(List source, Set punctuations)
+ {
+ // builds /[\sp1p2...pn]+/
+ StringBuilder punctuationRegexBld = new StringBuilder("[\\s");
+ for (var p : punctuations) punctuationRegexBld.append(Strings.escapeRegex(p));
+ punctuationRegexBld.append("]+");
+
+ var punctuationRegex = punctuationRegexBld.toString();
+ var result = new ArrayList();
+
+ for (var token : source)
+ {
+ if (token.isReady()) result.add(token);
+ else
+ {
+ var parts = token.string.split(punctuationRegex);
+ var tokens = Arrays.stream(parts)
+ .map(part -> new RawToken(null, part))
+ .filter(part -> !part.string.isBlank())
+ .collect(Collectors.toList());
+ result.addAll(tokens);
+ }
+ }
+
+ return result;
+ }
+
+ private static void dropStopWords(List source, Set stopWords)
+ {
+ var stopTokens = source.stream()
+ .filter(token ->
+ !token.isReady() &&
+ stopWords.contains(token.string))
+ .collect(Collectors.toList());
+
+ for (var stopToken : stopTokens) stopToken.stop = true;
+ }
+
+ private static void tokenizeTerms(List tokens)
+ {
+ tokens.stream()
+ .filter(RawToken::notReady)
+ .filter(RawToken::notStop)
+ .forEach(token -> token.term = AllTerms.provideTerm(token.string));
+ }
+
+ private static List finishTokenizing(List source, Document document)
+ {
+ // set positions
+ source.stream().sequential()
+ .forEach(token -> token.position = source.indexOf(token));
+
+ // remove stop-words
+ var pureTokens = source.stream().sequential()
+ .filter(RawToken::notStop)
+ .collect(Collectors.toList());
+
+ var result = source.stream().sequential()
+ .filter(RawToken::notStop)
+ .map(rawToken -> new Token(
+ rawToken.position,
+ rawToken.term,
+ document))
+ .collect(Collectors.toList());
+
+ return result;
+ }
+
+ private static class RawToken
+ {
+ public boolean stop;
+ public int position;
+
+ public Term term;
+ public String string;
+
+ public RawToken(Term _term, String _string)
+ {
+ stop = false;
+ term = _term;
+ string = _string;
+ }
+
+ public boolean isStop() { return stop; }
+ public boolean notStop() { return !stop; }
+
+ public boolean isReady() { return term != null; }
+ public boolean notReady() { return term == null; }
+ }
+}
diff --git a/src/main/java/HHSchoolSearch/Main.java b/src/main/java/HHSchoolSearch/Main.java
new file mode 100644
index 0000000..4c1b86a
--- /dev/null
+++ b/src/main/java/HHSchoolSearch/Main.java
@@ -0,0 +1,11 @@
+package HHSchoolSearch;
+
+public class Main
+{
+ public static void main(String[] args)
+ {
+ System.out.println("hh-school-search v1.0\n");
+
+ Commands.execute(args);
+ }
+}
diff --git a/src/main/java/HHSchoolSearch/Model/Document.java b/src/main/java/HHSchoolSearch/Model/Document.java
new file mode 100644
index 0000000..86b1cee
--- /dev/null
+++ b/src/main/java/HHSchoolSearch/Model/Document.java
@@ -0,0 +1,47 @@
+package HHSchoolSearch.Model;
+
+import com.google.gson.annotations.Expose;
+
+import java.util.*;
+
+public class Document
+{
+ @Expose
+ private int docId;
+ @Expose
+ private String docName;
+ @Expose
+ private String docContent;
+
+ @Expose
+ private HashMap docPositionTokensMap;
+
+ public int getDocId() { return docId; }
+ public String getDocName() { return docName; }
+ public String getDocContent() { return docContent; }
+ public Set getTokens() { return new HashSet<>(docPositionTokensMap.values()); }
+
+ public Document(int id, String name, String content)
+ {
+ docId = id;
+ docName = name;
+ docContent = content;
+
+ docPositionTokensMap = new HashMap<>();
+ }
+
+ public void putToken(Token token)
+ {
+ docPositionTokensMap.put(token.getPosition(), token);
+ }
+
+ public void putTokens(Collection tokens)
+ {
+ for (var token : tokens) putToken(token);
+ }
+
+ public Token getTokenAt(int position)
+ {
+ return docPositionTokensMap.getOrDefault(position, null);
+ }
+}
diff --git a/src/main/java/HHSchoolSearch/Model/Term.java b/src/main/java/HHSchoolSearch/Model/Term.java
new file mode 100644
index 0000000..2af13e4
--- /dev/null
+++ b/src/main/java/HHSchoolSearch/Model/Term.java
@@ -0,0 +1,28 @@
+package HHSchoolSearch.Model;
+
+import com.google.gson.annotations.Expose;
+
+public class Term
+{
+ @Expose
+ private int termId;
+ @Expose
+ private String termString;
+
+ public int getTermId() { return termId; }
+ public String getTermString() { return termString; }
+
+ public Term(int id, String string)
+ {
+ termId = id;
+ termString = string;
+ }
+
+ @Override
+ public int hashCode()
+ {
+ return termString != null
+ ? termString.hashCode()
+ : 0;
+ }
+}
diff --git a/src/main/java/HHSchoolSearch/Model/Token.java b/src/main/java/HHSchoolSearch/Model/Token.java
new file mode 100644
index 0000000..2ac1c97
--- /dev/null
+++ b/src/main/java/HHSchoolSearch/Model/Token.java
@@ -0,0 +1,24 @@
+package HHSchoolSearch.Model;
+
+import com.google.gson.annotations.Expose;
+
+public class Token
+{
+ @Expose
+ private int position;
+ @Expose
+ private int termId;
+ @Expose
+ private int documentId;
+
+ public int getPosition() { return position; }
+ public int getTermId() { return termId; }
+ public int getDocumentId() { return documentId; }
+
+ public Token(int _position, Term _term, Document _document)
+ {
+ position = _position;
+ termId = _term.getTermId();
+ documentId = _document.getDocId();
+ }
+}
diff --git a/src/main/java/HHSchoolSearch/Resources.java b/src/main/java/HHSchoolSearch/Resources.java
new file mode 100644
index 0000000..72e3db0
--- /dev/null
+++ b/src/main/java/HHSchoolSearch/Resources.java
@@ -0,0 +1,137 @@
+package HHSchoolSearch;
+
+import HHSchoolSearch.Index.*;
+import HHSchoolSearch.Utils.Files;
+import HHSchoolSearch.Utils.Strings;
+
+import java.util.HashSet;
+import java.util.List;
+
+public class Resources
+{
+ public static String getPathSpecialWords() { return "/res/dict/special_words.txt"; }
+ public static String getPathPunctuations() { return "/res/dict/punctuations.txt"; }
+ public static String getPathStopWords() { return "/res/dict/stop_words.txt"; }
+
+ public static String getPathAllDocs() { return "/res/index/all_docs.json"; }
+ public static String getPathAllTerms() { return "/res/index/all_terms.json"; }
+ public static String getPathInvertedDocs() { return "/res/index/inverted_docs.json"; }
+ public static String getPathInvertedTerms() { return "/res/index/inverted_terms.json"; }
+
+ private static HashSet specialWords;
+ private static HashSet punctuations;
+ private static HashSet stopWords;
+
+ public static HashSet getSpecialWords() { return specialWords; }
+ public static HashSet getPunctuations() { return punctuations; }
+ public static HashSet getStopWords() { return stopWords; }
+
+ public static boolean initialize()
+ {
+ System.out.println("Loading resources...");
+
+ try
+ {
+ loadSpecialWords();
+ loadPunctuations();
+ loadStopWords();
+ }
+ catch (Exception ex)
+ {
+ System.err.print("Couldn't load resources. Error message: ");
+ System.err.println(ex.getMessage());
+ return false;
+ }
+
+ System.out.println("Resources loaded successfully.");
+ return true;
+ }
+
+ private static void loadSpecialWords() throws Exception
+ {
+ var path = getPathSpecialWords();
+ var lines = Files.readLinesFromFile("." + path);
+ var tokens = Strings.splitTokensBySpace(Strings.dropComments(lines));
+
+ specialWords = new HashSet<>(tokens);
+ }
+
+ private static void loadPunctuations() throws Exception
+ {
+ var path = getPathPunctuations();
+ var lines = Files.readLinesFromFile("." + path);
+ var tokens = Strings.splitTokensBySpace(Strings.dropComments(lines));
+
+ punctuations = new HashSet<>(tokens);
+ }
+
+ private static void loadStopWords() throws Exception
+ {
+ var path = getPathStopWords();
+ var lines = Files.readLinesFromFile("." + path);
+ var tokens = Strings.splitTokensBySpace(Strings.dropComments(lines));
+
+ stopWords = new HashSet<>(tokens);
+ }
+
+ public static boolean saveIndex()
+ {
+ System.out.println("Saving index...");
+ try
+ {
+ Files.writeLinesToFile(
+ "." + getPathAllDocs(),
+ List.of(Serializer.jsonAllDocs()));
+ Files.writeLinesToFile(
+ "." + getPathAllTerms(),
+ List.of(Serializer.jsonAllTerms()));
+ Files.writeLinesToFile(
+ "." + getPathInvertedDocs(),
+ List.of(Serializer.jsonInvertedDocs()));
+ Files.writeLinesToFile(
+ "." + getPathInvertedTerms(),
+ List.of(Serializer.jsonInvertedTerms()));
+
+ System.out.println("Index saved successfully.");
+ return true;
+ }
+ catch (Exception ex)
+ {
+ System.err.print("Couldn't save index. Error message: ");
+ System.err.println(ex.getMessage());
+ return false;
+ }
+ }
+
+ public static boolean loadIndex()
+ {
+ System.out.println("Loading index...");
+ try
+ {
+ AllDocs.initialize(
+ Serializer.loadAllDocs(
+ Files.readWholeFile("." + getPathAllDocs())));
+
+ AllTerms.initialize(
+ Serializer.loadAllTerms(
+ Files.readWholeFile("." + getPathAllTerms())));
+
+ InvertedDocs.initialize(
+ Serializer.loadInvertedDocs(
+ Files.readWholeFile("." + getPathInvertedDocs())));
+
+ InvertedTerms.initialize(
+ Serializer.loadInvertedTerms(
+ Files.readWholeFile("." + getPathInvertedTerms())));
+
+ System.out.println("Index loaded successfully.");
+ return true;
+ }
+ catch (Exception ex)
+ {
+ System.err.print("Couldn't load index. Error message: ");
+ System.err.println(ex.getMessage());
+ return false;
+ }
+ }
+}
diff --git a/src/main/java/HHSchoolSearch/Search/Evaluator.java b/src/main/java/HHSchoolSearch/Search/Evaluator.java
new file mode 100644
index 0000000..0663c2e
--- /dev/null
+++ b/src/main/java/HHSchoolSearch/Search/Evaluator.java
@@ -0,0 +1,152 @@
+package HHSchoolSearch.Search;
+
+import HHSchoolSearch.Model.Token;
+import HHSchoolSearch.Resources;
+import HHSchoolSearch.Search.QueryStructure.*;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static HHSchoolSearch.Utils.Exceptions.throwFormat;
+
+public class Evaluator
+{
+ private static Set evaluateLeaf(QTerm leaf)
+ {
+ var termString = leaf.getTermString();
+
+ checkStops(List.of(termString));
+
+ return Searcher.searchTerm(termString);
+ }
+
+ public static Set evaluateNode(QOperand node) throws Exception
+ {
+ if (node instanceof QTerm)
+ {
+ return evaluateLeaf((QTerm)node);
+ }
+ else if (node instanceof QMultOperation)
+ {
+ var mult = (QMultOperation)node;
+ switch (mult.getOper())
+ {
+ case EXACT_PHRASE: return exactPhrase(mult.getOperands());
+ case ENTRY_OR: return entryOR(mult.getOperands());
+ default:
+ throwFormat("Evaluation Error: Unsupported operator '%s'.", mult.getOper().toString());
+ return null;
+ }
+ }
+ else if (node instanceof QBinOperation)
+ {
+ var bin = (QBinOperation)node;
+ switch (bin.getOper())
+ {
+ case AND: return and(bin.getLeft(), bin.getRight());
+ case OR: return or(bin.getLeft(), bin.getRight());
+ case NOT: return not(bin.getLeft(), bin.getRight());
+ default:
+ throwFormat("Evaluation Error: Unsupported operator '%s'.", bin.getOper().toString());
+ return null;
+ }
+ }
+ else
+ {
+ throwFormat("Evaluation Error: Unsupported operation type '%s'.", node.getClass().getName());
+ return null;
+ }
+ }
+
+ private static Set exactPhrase(List extends QOperand> args) throws Exception
+ {
+ var termStrings = args.stream()
+ .map(arg -> ((QTerm)arg).getTermString())
+ .collect(Collectors.toList());
+
+ if (termStrings.stream().allMatch(str -> str.equals("*")))
+ throwFormat("Invalid Arguments: Empty phrase search.");
+
+ checkStops(termStrings);
+
+ return Searcher.searchExactPhrase(termStrings);
+ }
+
+ private static Set entryOR(List extends QOperand> args) throws Exception
+ {
+ var entriesQtyArg = (QTerm)args.get(0);
+ var entriesQty = Integer.parseInt(entriesQtyArg.getTermString());
+
+ var inners = new ArrayList>();
+ for (int i = 1; i < args.size(); i++)
+ {
+ var qOperand = (QOperand)args.get(i);
+ inners.add(evaluateNode(qOperand));
+ }
+
+ if (inners.isEmpty())
+ throwFormat("Invalid Arguments: Empty ENTRY_OR query.");
+
+ return Searcher.appearingAtLeast(inners, entriesQty);
+ }
+
+ private static Set and(QOperand left, QOperand right) throws Exception
+ {
+ if (Stream.of(left, right)
+ .allMatch(arg -> arg instanceof QTerm))
+ {
+ var argsStr = Stream.of(left, right)
+ .map(arg -> (QTerm)arg)
+ .map(QTerm::getTermString)
+ .collect(Collectors.toList());
+
+ return Searcher.intersectSimple(argsStr);
+ }
+ else
+ {
+ var inners = Arrays.asList(
+ evaluateNode(left),
+ evaluateNode(right)
+ );
+
+ return Searcher.intersect(inners);
+ }
+ }
+
+ private static Set or(QOperand left, QOperand right) throws Exception
+ {
+ var inners = Arrays.asList(
+ evaluateNode(left),
+ evaluateNode(right)
+ );
+
+ return Searcher.unite(inners);
+ }
+
+ private static Set not(QOperand left, QOperand right) throws Exception
+ {
+ var leftRes = evaluateNode(left);
+
+ if (right instanceof QTerm)
+ {
+ var rightStr = ((QTerm)right).getTermString();
+ return Searcher.except(leftRes, List.of(rightStr));
+ }
+ else
+ {
+ var rightRes = evaluateNode(right);
+ return Searcher.except(leftRes, rightRes);
+ }
+ }
+
+ private static void checkStops(List terms)
+ {
+ for (var termString : terms)
+ if (Resources.getStopWords().contains(termString))
+ System.out.println(String.format("Warning: the term \"%s\" is a stop-word and cannot be found.", termString));
+ }
+}
diff --git a/src/main/java/HHSchoolSearch/Search/Parser.java b/src/main/java/HHSchoolSearch/Search/Parser.java
new file mode 100644
index 0000000..e40653b
--- /dev/null
+++ b/src/main/java/HHSchoolSearch/Search/Parser.java
@@ -0,0 +1,245 @@
+package HHSchoolSearch.Search;
+
+import HHSchoolSearch.Search.QueryStructure.*;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static HHSchoolSearch.Utils.Exceptions.throwFormat;
+import static HHSchoolSearch.Utils.Lists.dropTokensFromTo;
+import static HHSchoolSearch.Utils.Lists.subList;
+import static HHSchoolSearch.Utils.Lists.indexOfAny;
+
+public class Parser
+{
+ public static QOperand parseQuery(String queryString) throws Exception
+ {
+ var tokens = splitTokens(queryString);
+ return parseDeep(tokens);
+ }
+
+ private static List