From 39fd308f07c4430ca230384e1da1c7acc2e76213 Mon Sep 17 00:00:00 2001 From: "Christian F." Date: Fri, 1 Nov 2024 14:43:23 +0100 Subject: [PATCH] - fix lucene not being able to search special characters - StandardAnalyzer does not preserve whitespace and does not allow special character search --- pom.xml | 14 +++++- .../helpers/LuceneGuiFilmeModelHelper.java | 23 ++++++++- .../gui/tasks/LuceneIndexWorker.java | 22 ++++++++- .../mediathek/tool/LuceneDefaultAnalyzer.java | 48 +++++++++++++++++++ 4 files changed, 102 insertions(+), 5 deletions(-) create mode 100644 src/main/java/mediathek/tool/LuceneDefaultAnalyzer.java diff --git a/pom.xml b/pom.xml index dc9338699..e91b32b61 100755 --- a/pom.xml +++ b/pom.xml @@ -170,7 +170,19 @@ lucene-queryparser ${lucene.version} - + + org.apache.lucene + lucene-analysis-common + ${lucene.version} + + + org.apache.lucene + lucene-queries + ${lucene.version} + + + + org.jfree jfreechart diff --git a/src/main/java/mediathek/gui/tabs/tab_film/helpers/LuceneGuiFilmeModelHelper.java b/src/main/java/mediathek/gui/tabs/tab_film/helpers/LuceneGuiFilmeModelHelper.java index eea4e14d0..abe1f1bc7 100644 --- a/src/main/java/mediathek/gui/tabs/tab_film/helpers/LuceneGuiFilmeModelHelper.java +++ b/src/main/java/mediathek/gui/tabs/tab_film/helpers/LuceneGuiFilmeModelHelper.java @@ -1,3 +1,21 @@ +/* + * Copyright (c) 2024 derreisende77. + * This code was developed as part of the MediathekView project https://github.com/mediathekview/MediathekView + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + package mediathek.gui.tabs.tab_film.helpers; import com.google.common.base.Stopwatch; @@ -11,11 +29,12 @@ import mediathek.javafx.filterpanel.FilterActionPanel; import mediathek.javafx.filterpanel.ZeitraumSpinner; import mediathek.mainwindow.MediathekGui; +import mediathek.tool.LuceneDefaultAnalyzer; import mediathek.tool.SwingErrorDialog; import mediathek.tool.models.TModelFilm; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.DateTools; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.ParseException; @@ -42,7 +61,7 @@ public class LuceneGuiFilmeModelHelper extends GuiModelHelper { PARSER_CONFIG_MAP.put(LuceneIndexKeys.FILM_LENGTH, new PointsConfig(new DecimalFormat(), Integer.class)); } - private final StandardAnalyzer analyzer = new StandardAnalyzer(); + private final Analyzer analyzer = LuceneDefaultAnalyzer.buildAnalyzer(); public LuceneGuiFilmeModelHelper(@NotNull FilterActionPanel filterActionPanel, @NotNull SeenHistoryController historyController, diff --git a/src/main/java/mediathek/gui/tasks/LuceneIndexWorker.java b/src/main/java/mediathek/gui/tasks/LuceneIndexWorker.java index b85f565b8..672575adb 100644 --- a/src/main/java/mediathek/gui/tasks/LuceneIndexWorker.java +++ b/src/main/java/mediathek/gui/tasks/LuceneIndexWorker.java @@ -1,3 +1,21 @@ +/* + * Copyright (c) 2024 derreisende77. 
+ * This code was developed as part of the MediathekView project https://github.com/mediathekview/MediathekView + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + package mediathek.gui.tasks; import com.google.common.base.Stopwatch; @@ -7,11 +25,11 @@ import mediathek.daten.IndexedFilmList; import mediathek.mainwindow.MediathekGui; import mediathek.tool.FileUtils; +import mediathek.tool.LuceneDefaultAnalyzer; import mediathek.tool.SwingErrorDialog; import mediathek.tool.datum.DateUtil; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.*; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; @@ -86,7 +104,7 @@ protected Void doInBackground() { }); //index filmlist after blacklist only - IndexWriterConfig indexWriterConfig = new IndexWriterConfig(new StandardAnalyzer()); + IndexWriterConfig indexWriterConfig = new IndexWriterConfig(LuceneDefaultAnalyzer.buildAnalyzer()); indexWriterConfig.setRAMBufferSizeMB(256d); try (var writer = new IndexWriter(filmListe.getLuceneDirectory(), indexWriterConfig)) { diff --git a/src/main/java/mediathek/tool/LuceneDefaultAnalyzer.java b/src/main/java/mediathek/tool/LuceneDefaultAnalyzer.java new file mode 100644 index 000000000..f5eebffa0 --- /dev/null +++ 
b/src/main/java/mediathek/tool/LuceneDefaultAnalyzer.java
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2024 derreisende77.
+ * This code was developed as part of the MediathekView project https://github.com/mediathekview/MediathekView
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+package mediathek.tool;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+
+import java.io.IOException;
+
+/**
+ * Factory for the Lucene {@link Analyzer} used by MediathekView.
+ * <p>
+ * Both {@code LuceneIndexWorker} (when writing the index) and
+ * {@code LuceneGuiFilmeModelHelper} obtain their analyzer from here, which keeps
+ * the two on the same tokenization rules.
+ */
+public final class LuceneDefaultAnalyzer {
+    private static final Logger logger = LogManager.getLogger();
+
+    private LuceneDefaultAnalyzer() {
+        // static utility class, never instantiated
+    }
+
+    /**
+     * Builds an analyzer that splits text on whitespace and lower-cases every token.
+     * <p>
+     * Unlike {@link StandardAnalyzer}, this keeps special characters inside the tokens
+     * so that they can be searched for.
+     * <p>
+     * If the custom analyzer cannot be built, the error is logged and a
+     * {@link StandardAnalyzer} is returned instead. Besides {@link IOException} this
+     * also covers the {@link IllegalArgumentException} Lucene raises when the
+     * "whitespace" or "lowercase" SPI name cannot be resolved, e.g. when the service
+     * descriptors of lucene-analysis-common are missing from the classpath.
+     *
+     * @return a newly created analyzer on every call, never {@code null}
+     */
+    public static Analyzer buildAnalyzer() {
+        try {
+            return CustomAnalyzer.builder()
+                    .withTokenizer("whitespace")
+                    .addTokenFilter("lowercase")
+                    .build();
+        } catch (IOException | IllegalArgumentException e) {
+            // an unresolvable SPI name surfaces as IllegalArgumentException, not IOException
+            logger.error("Could not build custom analyzer", e);
+            logger.error("Falling back to standard analyzer");
+            return new StandardAnalyzer();
+        }
+    }
+}