Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Index quarkiverse guides #260

Draft
wants to merge 9 commits into
base: main
Choose a base branch
from
8 changes: 7 additions & 1 deletion src/main/java/io/quarkus/search/app/SearchService.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import io.quarkus.search.app.entity.Guide;
import io.quarkus.search.app.entity.Language;
import io.quarkus.search.app.entity.QuarkusVersionAndLanguageRoutingBinder;
import io.quarkus.search.app.quarkusio.QuarkusIO;

import io.quarkus.runtime.LaunchMode;

Expand Down Expand Up @@ -62,6 +63,7 @@ public void init(@Observes Router router) {
public SearchResult<GuideSearchHit> search(@RestQuery @DefaultValue(QuarkusVersions.LATEST) String version,
@RestQuery List<String> categories,
@RestQuery String q,
@RestQuery String origin,
@RestQuery @DefaultValue("en") Language language,
@RestQuery @DefaultValue("highlighted") String highlightCssClass,
@RestQuery @DefaultValue("0") @Min(0) int page,
Expand All @@ -83,6 +85,9 @@ public SearchResult<GuideSearchHit> search(@RestQuery @DefaultValue(QuarkusVersi
if (categories != null && !categories.isEmpty()) {
root.add(f.terms().field("categories").matchingAny(categories));
}
if (origin != null && !origin.isEmpty()) {
root.add(f.match().field("origin").matching(origin));
}

if (q != null && !q.isBlank()) {
root.add(f.bool().must(f.simpleQueryString()
Expand All @@ -101,7 +106,8 @@ public SearchResult<GuideSearchHit> search(@RestQuery @DefaultValue(QuarkusVersi
// we also add phrase flag so that entire phrases could be searched as well, e.g.: "hibernate search"
.flags(SimpleQueryFlag.AND, SimpleQueryFlag.OR, SimpleQueryFlag.PHRASE)
.defaultOperator(BooleanOperator.AND))
.should(f.match().field("origin").matching("quarkus").boost(50.0f))
.should(f.match().field("origin").matching(QuarkusIO.QUARKUS_ORIGIN).constantScore()
.boost(1000.0f))
.should(f.not(f.match().field(language.addSuffix("topics"))
.matching("compatibility", ValueConvert.NO))
.boost(50.0f)));
Expand Down
8 changes: 8 additions & 0 deletions src/main/java/io/quarkus/search/app/entity/I18nData.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ public void set(Language language, T value) {
}
}

/**
 * Stores the same value for every language held by this object
 * (the {@code en}, {@code es}, {@code pt}, {@code cn} and {@code ja} slots).
 *
 * @param value the value to use for all languages
 */
public void set(T value) {
    // One shared value: chain the assignment across all five language fields.
    en = es = pt = cn = ja = value;
}

public T get(Language language) {
return switch (language) {
case ENGLISH -> en;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

import java.util.List;

import io.quarkus.search.app.quarkiverseio.QuarkiverseIO;
import io.quarkus.search.app.quarkusio.QuarkusIO;

import org.hibernate.search.mapper.pojo.bridge.RoutingBridge;
import org.hibernate.search.mapper.pojo.bridge.binding.RoutingBindingContext;
import org.hibernate.search.mapper.pojo.bridge.mapping.programmatic.RoutingBinder;
Expand All @@ -10,14 +13,23 @@

public class QuarkusVersionAndLanguageRoutingBinder implements RoutingBinder {
private static String key(String version, Language language) {
if (language == null) {
return version;
return key(version, language, QuarkusIO.QUARKUS_ORIGIN);
}

private static String key(String version, Language language, String origin) {
StringBuilder key = new StringBuilder();
key.append(origin);
if (version != null) {
key.append("/").append(version);
}
if (language != null) {
key.append("/").append(language.code);
}
return version + "/" + language.code;
return key.toString();
}

public static List<String> searchKeys(String version, Language language) {
return List.of(key(version, language), key(version, null));
return List.of(key(version, language), key(version, null), key(null, null, QuarkiverseIO.QUARKIVERSE_ORIGIN));
}

@Override
Expand All @@ -34,7 +46,11 @@ public static class GuideRoutingBridge implements RoutingBridge<Guide> {
@Override
public void route(DocumentRoutes routes, Object entityIdentifier, Guide entity,
RoutingBridgeRouteContext context) {
routes.addRoute().routingKey(key(entity.quarkusVersion, entity.language));
if (QuarkiverseIO.QUARKIVERSE_ORIGIN.equals(entity.origin)) {
routes.addRoute().routingKey(key(null, null, QuarkiverseIO.QUARKIVERSE_ORIGIN));
} else {
routes.addRoute().routingKey(key(entity.quarkusVersion, entity.language));
}
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

import io.quarkus.search.app.entity.Language;
import io.quarkus.search.app.indexing.FailureCollector;
import io.quarkus.search.app.quarkiverseio.QuarkiverseIO;
import io.quarkus.search.app.quarkiverseio.QuarkiverseIOConfig;
import io.quarkus.search.app.quarkusio.QuarkusIO;
import io.quarkus.search.app.quarkusio.QuarkusIOConfig;
import io.quarkus.search.app.util.CloseableDirectory;
Expand All @@ -41,9 +43,16 @@ public class FetchingService {
@Inject
QuarkusIOConfig quarkusIOConfig;

@Inject
QuarkiverseIOConfig quarkiverseIOConfig;

private final Map<URI, GitCloneDirectory.Details> detailsCache = new ConcurrentHashMap<>();
private final Set<CloseableDirectory> tempDirectories = new ConcurrentHashSet<>();

/**
 * Creates the {@link QuarkiverseIO} source of Quarkiverse guides, configured from the injected
 * {@link QuarkiverseIOConfig}.
 * <p>
 * The returned object is {@link java.io.Closeable}; the indexing code opens it in a
 * try-with-resources block, and other callers are expected to close it as well.
 *
 * @param failureCollector collector that receives the problems encountered while reading the guides
 * @return a new, not yet closed, {@link QuarkiverseIO}
 */
public QuarkiverseIO fetchQuarkiverseIo(FailureCollector failureCollector) {
    var quarkiverseIO = new QuarkiverseIO(quarkiverseIOConfig, failureCollector);
    return quarkiverseIO;
}

public QuarkusIO fetchQuarkusIo(FailureCollector failureCollector) {
CompletableFuture<GitCloneDirectory> main = null;
Map<Language, CompletableFuture<GitCloneDirectory>> localized = new LinkedHashMap<>();
Expand Down
10 changes: 10 additions & 0 deletions src/main/java/io/quarkus/search/app/indexing/IndexableGuides.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package io.quarkus.search.app.indexing;

import java.io.IOException;
import java.util.stream.Stream;

import io.quarkus.search.app.entity.Guide;

/**
 * A source of {@link Guide}s that can be pushed to the search index.
 * <p>
 * Implementations provide their guides as a stream, which lets the indexing code
 * concatenate several sources and index them in a single pass.
 */
public interface IndexableGuides {
/**
 * Provides the guides of this source.
 * <p>
 * NOTE(review): the indexing code opens the returned stream in a try-with-resources block,
 * so implementations may presumably hold resources that are released on close -- confirm.
 *
 * @return a stream of the guides to index
 * @throws IOException if the guides cannot be read
 */
Stream<Guide> guides() throws IOException;
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.LongAdder;
import java.util.stream.Stream;

import jakarta.annotation.PreDestroy;
import jakarta.enterprise.context.ApplicationScoped;
Expand All @@ -18,6 +19,7 @@

import io.quarkus.search.app.ReferenceService;
import io.quarkus.search.app.fetching.FetchingService;
import io.quarkus.search.app.quarkiverseio.QuarkiverseIO;
import io.quarkus.search.app.quarkusio.QuarkusIO;
import io.quarkus.search.app.util.SimpleExecutor;

Expand Down Expand Up @@ -230,8 +232,9 @@ private void createIndexes() {
private void indexAll(FailureCollector failureCollector) {
Log.info("Indexing...");
try (Rollover rollover = Rollover.start(searchMapping)) {
try (QuarkusIO quarkusIO = fetchingService.fetchQuarkusIo(failureCollector)) {
indexQuarkusIo(quarkusIO);
try (QuarkusIO quarkusIO = fetchingService.fetchQuarkusIo(failureCollector);
QuarkiverseIO quarkiverseIO = fetchingService.fetchQuarkiverseIo(failureCollector)) {
indexQuarkusIo(quarkusIO, quarkiverseIO);
}

// Refresh BEFORE committing the rollover,
Expand All @@ -248,9 +251,9 @@ private void indexAll(FailureCollector failureCollector) {
}
}

private void indexQuarkusIo(QuarkusIO quarkusIO) throws IOException {
Log.info("Indexing quarkus.io...");
try (var guideStream = quarkusIO.guides();
private void indexQuarkusIo(IndexableGuides quarkus, IndexableGuides quarkiverse) throws IOException {
Log.info("Indexing quarkus.io/quarkiverse.io...");
try (var guideStream = Stream.concat(quarkus.guides(), quarkiverse.guides());
var executor = new SimpleExecutor(indexingConfig.parallelism())) {
indexAll(executor, guideStream.iterator());
}
Expand Down
162 changes: 162 additions & 0 deletions src/main/java/io/quarkus/search/app/quarkiverseio/QuarkiverseIO.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
package io.quarkus.search.app.quarkiverseio;

import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
import java.util.stream.Stream;

import jakarta.ws.rs.core.UriBuilder;

import io.quarkus.search.app.entity.Guide;
import io.quarkus.search.app.hibernate.InputProvider;
import io.quarkus.search.app.indexing.FailureCollector;
import io.quarkus.search.app.indexing.IndexableGuides;
import io.quarkus.search.app.util.CloseableDirectory;

import io.quarkus.logging.Log;

import org.hibernate.search.util.common.impl.Closer;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Source of guides crawled from the Quarkiverse documentation site.
 * <p>
 * Starting from the configured index page, every extension linked from it is fetched, then every page
 * listed in that extension's navigation menu. Each page becomes a {@link Guide} whose origin is
 * {@link #QUARKIVERSE_ORIGIN}; the page content is dumped to a temporary directory that is deleted
 * when this object is {@linkplain #close() closed}.
 * <p>
 * Problems are reported to the {@link FailureCollector} instead of being thrown, so that a single
 * unreachable page does not abort the whole crawl.
 */
public class QuarkiverseIO implements IndexableGuides, Closeable {

    public static final String QUARKIVERSE_ORIGIN = "quarkiverse-hub";

    private final URI quarkiverseDocsIndex;
    private final FailureCollector failureCollector;

    private final List<Guide> quarkiverseGuides = new ArrayList<>();
    private final boolean enabled;
    private final CloseableDirectory guideHtmls;
    // Set once guides() has triggered a crawl, so that later calls do not crawl the site again.
    private boolean parsed;

    /**
     * Creates the source and the temporary directory that will hold the fetched pages.
     * No page is fetched at this point.
     *
     * @param config configuration providing the index page URI and the enabled flag
     * @param failureCollector collector that receives the problems encountered while crawling
     * @throws IllegalStateException if the temporary directory cannot be created
     */
    public QuarkiverseIO(QuarkiverseIOConfig config, FailureCollector failureCollector) {
        this.quarkiverseDocsIndex = config.webUri();
        this.enabled = config.enabled();
        this.failureCollector = failureCollector;
        try {
            guideHtmls = CloseableDirectory.temp("quarkiverse_htmls_");
        } catch (IOException e) {
            throw new IllegalStateException("Failed to fetch quarkiverse guides: %s".formatted(e.getMessage()), e);
        }
    }

    /**
     * Crawls the Quarkiverse documentation and rebuilds the list of guides returned by {@link #guides()}.
     * <p>
     * Results of an earlier crawl are discarded first, so calling this method again replaces the
     * guides instead of appending duplicates. If the index page cannot be fetched, a critical failure
     * is reported and the list is left empty.
     */
    public void parseGuides() {
        quarkiverseGuides.clear();

        Document index;
        try {
            index = Jsoup.connect(quarkiverseDocsIndex.toString()).get();
        } catch (IOException e) {
            failureCollector.critical(FailureCollector.Stage.PARSING, "Unable to fetch the Quarkiverse Docs index page.", e);
            // no point in doing anything else here:
            return;
        }

        // find links to quarkiverse extension docs:
        for (Element extensionLink : index.select("ul.components li.component a.title")) {
            parseExtension(extensionLink);
        }
    }

    /** Reads the landing page of one extension, then every page listed in its navigation menu. */
    private void parseExtension(Element extensionLink) {
        String topLevelTitle = extensionLink.text();
        String extensionUrl = extensionLink.absUrl("href");
        if (extensionUrl.isBlank()) {
            // absUrl() yields "" when the link cannot be resolved; Jsoup.connect("") would then fail
            // with an unchecked exception and abort the whole crawl.
            Log.warnf("Skipping Quarkiverse guide without a resolvable link: %s", topLevelTitle);
            return;
        }

        Guide guide = new Guide();
        guide.title.set(topLevelTitle);

        Document extensionIndex;
        try {
            extensionIndex = readGuide(guide, extensionUrl, null);
        } catch (URISyntaxException | IOException e) {
            failureCollector.warning(FailureCollector.Stage.PARSING,
                    "Unable to fetch guide: " + topLevelTitle, e);
            return;
        }
        quarkiverseGuides.add(guide);

        // find other sub-pages on the left side; links differing only by query/fragment are the same page.
        // NOTE(review): if the menu also links to the landing page itself, that page is read twice -- confirm.
        Map<URI, String> subPages = new HashMap<>();
        for (Element navLink : extensionIndex.select("nav.nav-menu .nav-item a")) {
            String href = navLink.absUrl("href");
            if (href.isBlank()) {
                continue;
            }
            URI uri = UriBuilder.fromUri(href).replaceQuery(null).fragment(null).build();
            subPages.computeIfAbsent(uri, u -> navLink.text());
        }

        for (Map.Entry<URI, String> subPage : subPages.entrySet()) {
            Guide sub = new Guide();
            sub.title.set(subPage.getValue());
            try {
                readGuide(sub, subPage.getKey().toString(), topLevelTitle);
            } catch (URISyntaxException | IOException e) {
                // Name the failing sub-page, not only the extension it belongs to.
                failureCollector.warning(FailureCollector.Stage.PARSING,
                        "Unable to fetch guide: %s: %s (%s)".formatted(topLevelTitle, subPage.getValue(),
                                subPage.getKey()),
                        e);
                continue;
            }
            quarkiverseGuides.add(sub);
        }
    }

    /**
     * Fetches one page and fills in the given guide from it.
     *
     * @param guide the guide to populate
     * @param link absolute URL of the page
     * @param titlePrefix text to put in front of the page title, or {@code null} for no prefix
     * @return the fetched page
     * @throws URISyntaxException if {@code link} is not a valid URI
     * @throws IOException if the page cannot be fetched or its content cannot be written to disk
     */
    private Document readGuide(Guide guide, String link, String titlePrefix) throws URISyntaxException, IOException {
        guide.url = new URI(link);
        guide.type = "reference";
        guide.origin = QUARKIVERSE_ORIGIN;

        Document page = Jsoup.connect(link).get();
        Elements content = page.select("div.content");

        // Keep the title taken from the link text when the page itself has no heading.
        String title = content.select("h1.page").text();
        if (!title.isBlank()) {
            guide.title.set(titlePrefix == null ? title : "%s: %s".formatted(titlePrefix, title));
        }
        guide.summary.set(content.select("div#preamble").text());
        guide.htmlFullContentProvider.set(new FileInputProvider(link, dumpHtmlToFile(content.html())));

        Log.debugf("Parsed guide: %s", guide.url);
        return page;
    }

    /** Writes the HTML, UTF-8 encoded, to a uniquely named file of the temporary directory. */
    private Path dumpHtmlToFile(String html) throws IOException {
        Path path = guideHtmls.path().resolve(UUID.randomUUID().toString());
        Files.write(path, html.getBytes(StandardCharsets.UTF_8));
        return path;
    }

    /**
     * Returns the Quarkiverse guides, crawling the site on the first call when this source is enabled.
     * Later calls reuse the result of that crawl.
     *
     * @return a stream of the parsed guides; empty when this source is disabled
     */
    @Override
    public Stream<Guide> guides() {
        if (enabled && !parsed) {
            parsed = true;
            parseGuides();
        }
        return quarkiverseGuides.stream();
    }

    /**
     * Closes the temporary directory holding the fetched pages and forgets the parsed guides.
     *
     * @throws IOException if the temporary directory cannot be closed
     */
    @Override
    public void close() throws IOException {
        try (var closer = new Closer<IOException>()) {
            closer.push(CloseableDirectory::close, guideHtmls);
            closer.push(List::clear, quarkiverseGuides);
        }
    }

    /** Provides the content of a fetched page by reading back the file it was dumped to. */
    private record FileInputProvider(String link, Path content) implements InputProvider {

        @Override
        public InputStream open() throws IOException {
            return Files.newInputStream(content);
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package io.quarkus.search.app.quarkiverseio;

import java.net.URI;

import io.smallrye.config.ConfigMapping;
import io.smallrye.config.WithDefault;

/**
 * Configuration of the Quarkiverse guides source, mapped from the {@code quarkiverseio} prefix.
 */
@ConfigMapping(prefix = "quarkiverseio")
public interface QuarkiverseIOConfig {
// Default location of the Quarkiverse documentation index page, as a string (usable in annotations).
String WEB_URI_DEFAULT_STRING = "https://docs.quarkiverse.io/index/explore/index.html";
// Same default as above, as a ready-to-use URI.
URI WEB_URI_DEFAULT = URI.create(WEB_URI_DEFAULT_STRING);

/**
 * @return the URI of the Quarkiverse documentation index page that the crawl starts from;
 *         defaults to {@link #WEB_URI_DEFAULT_STRING}
 */
@WithDefault(WEB_URI_DEFAULT_STRING)
URI webUri();

/**
 * @return whether Quarkiverse guides should be fetched and indexed; defaults to {@code true}
 */
@WithDefault("true")
boolean enabled();
}
Loading
Loading