diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000..5b247c005 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,11 @@ +# http://editorconfig.org/ + +root = true + +[*] +charset = utf-8 +end_of_line = lf +trim_trailing_whitespace = true +insert_final_newline = false +indent_style = space +indent_size = 4 diff --git a/.gitignore b/.gitignore index 6a24b9a7e..99085c063 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ target .settings/ bin/ site/vendor +node_modules diff --git a/changelog.txt b/changelog.txt index 60a51116c..e71dc079e 100644 --- a/changelog.txt +++ b/changelog.txt @@ -1,3 +1,11 @@ +Release 1.6 +- new algorithm based on Greedy String Tiling +- Java 8 now required +- provide our own implementation of variant graphs +- remove optional and seldom used integrations with Neo4j and Apache Cocoon +- turn collatex-core into a self-contained library, independent of other components +- package collatex-tools as a self-contained, shaded JAR + Release 1.5.1 - Extended the normalization in the javascript alignment table rendering to not only trim whitespace but also lowercase the tokens. @@ -8,7 +16,7 @@ Release 1.5 - Feature: Punctuation is now treated as separate tokens by default in the web-service and command-line tool. - Transposition limiter is moved from the Transposition Detector class to the DekkerAlgorithm class. - The transposition detector is rewritten. It no longer works from left to right, but from largest - moved distance to smallest moved distance. This improves the alignment result in case of longer witnesses. + moved distance to smallest moved distance. This improves the alignment result in case of longer witnesses. - Improved handling of competing blocks of text in the IslandConflictResolver. - Fix: When splitting island in the IslandConflictResolver resulting islands were only kept if there were of size two and up. Now they are kept if they are of size one and up. 
diff --git a/collatex-cocoon/pom.xml b/collatex-cocoon/pom.xml deleted file mode 100644 index 2fa0dece1..000000000 --- a/collatex-cocoon/pom.xml +++ /dev/null @@ -1,97 +0,0 @@ - - - 4.0.0 - - eu.interedition - collatex - 1.6-SNAPSHOT - - collatex-cocoon - 1.6-SNAPSHOT - CollateX Cocoon Block - Apache Cocoon block exposing CollateX' functionality as a transformer. - - - javax.servlet - servlet-api - - - eu.interedition - collatex-core - - - net.sf.jung - jung-graph-impl - - - org.apache.cocoon - cocoon-core - 2.2.0 - - - org.apache.cocoon - cocoon-servlet-service-components - 1.0.0 - - - org.apache.cocoon - cocoon-template-impl - 1.1.0 - - - org.apache.cocoon - cocoon-flowscript-impl - 1.0.0 - - - - - - org.apache.cocoon - cocoon-maven-plugin - 1.0.0-M2 - - - prepare - compile - - prepare - - - - - - org.mortbay.jetty - maven-jetty-plugin - 6.1.7 - - - - 8888 - 30000 - - - ${project.build.directory}/rcl/webapp - / - - - org.apache.cocoon.mode - dev - - - - - - maven-jar-plugin - 2.1 - - - - ${project.artifactId} - - - - - - - diff --git a/collatex-cocoon/rcl.properties b/collatex-cocoon/rcl.properties deleted file mode 100644 index 7eeeaec94..000000000 --- a/collatex-cocoon/rcl.properties +++ /dev/null @@ -1,17 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# -eu.interedition.collatex.collatex-cocoon.service%classes-dir=./target/classes \ No newline at end of file diff --git a/collatex-cocoon/src/main/java/eu/interedition/collatex/cocoon/CollateXTransformer.java b/collatex-cocoon/src/main/java/eu/interedition/collatex/cocoon/CollateXTransformer.java deleted file mode 100644 index 2b4861f56..000000000 --- a/collatex-cocoon/src/main/java/eu/interedition/collatex/cocoon/CollateXTransformer.java +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . 
- */ - -package eu.interedition.collatex.cocoon; - -import com.google.common.base.Objects; -import com.google.common.base.Throwables; -import com.google.common.collect.Iterables; -import com.google.common.collect.LinkedHashMultimap; -import com.google.common.collect.Lists; -import com.google.common.collect.Ordering; -import com.google.common.collect.RowSortedTable; -import com.google.common.collect.SetMultimap; -import eu.interedition.collatex.CollationAlgorithm; -import eu.interedition.collatex.CollationAlgorithmFactory; -import eu.interedition.collatex.Token; -import eu.interedition.collatex.VariantGraph; -import eu.interedition.collatex.Witness; -import eu.interedition.collatex.jung.JungVariantGraph; -import eu.interedition.collatex.matching.EditDistanceTokenComparator; -import eu.interedition.collatex.matching.EqualityTokenComparator; -import eu.interedition.collatex.simple.SimpleCollation; -import eu.interedition.collatex.simple.SimpleToken; -import eu.interedition.collatex.simple.SimpleWitness; -import eu.interedition.collatex.util.ParallelSegmentationApparatus; -import eu.interedition.collatex.util.VariantGraphRanking; -import org.apache.avalon.framework.configuration.Configuration; -import org.apache.avalon.framework.configuration.ConfigurationException; -import org.apache.cocoon.ProcessingException; -import org.apache.cocoon.transformation.AbstractSAXTransformer; -import org.apache.cocoon.xml.AttributesImpl; -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; - -import java.io.IOException; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.SortedMap; - -/** - * @author Gregor Middell - */ -public class CollateXTransformer extends AbstractSAXTransformer { - - private static final String TEI_NS = "http://www.tei-c.org/ns/1.0"; - public static final String COLLATEX_NS = "http://interedition.eu/collatex/ns/1.0"; - - private enum Format { - ALIGNMENT_TABLE, TEI_APPARATUS - } - - 
private Format format = Format.ALIGNMENT_TABLE; - private CollationAlgorithm algorithm; - private boolean joined; - private final List witnesses = Lists.newArrayList(); - private String sigil; - - @Override - public void configure(Configuration configuration) throws ConfigurationException { - super.configure(configuration); - this.defaultNamespaceURI = COLLATEX_NS; - } - - @Override - public void startTransformingElement(String uri, String name, String raw, Attributes attr) throws ProcessingException, IOException, SAXException { - if (!COLLATEX_NS.equals(uri)) { - return; - } - if ("collation".equals(name)) { - final String format = Objects.firstNonNull(attributeValue(attr, "format"), "table").trim().toLowerCase(); - if ("tei".equals(format)) { - this.format = Format.TEI_APPARATUS; - } else { - this.format = Format.ALIGNMENT_TABLE; - } - - Comparator tokenComparator = new EqualityTokenComparator(); - try { - final int editDistance = Integer.parseInt(Objects.firstNonNull(attributeValue(attr, "editDistance"), "0")); - if (editDistance > 0) { - tokenComparator = new EditDistanceTokenComparator(editDistance); - } - } catch (NumberFormatException e) { - } - - final String algorithm = Objects.firstNonNull(attributeValue(attr, "algorithm"), "dekker").trim().toLowerCase(); - if (algorithm.equals("medite")) { - this.algorithm = CollationAlgorithmFactory.medite(tokenComparator, SimpleToken.TOKEN_MATCH_EVALUATOR); - } else if (algorithm.equals("needleman-wunsch")) { - this.algorithm = CollationAlgorithmFactory.needlemanWunsch(tokenComparator); - } else if (algorithm.equals("gst")) { - this.algorithm = CollationAlgorithmFactory.greedyStringTiling(tokenComparator, 2); - } else { - this.algorithm = CollationAlgorithmFactory.dekker(tokenComparator); - } - - this.joined = "true".equals(Objects.firstNonNull(attributeValue(attr, "joined"), "true").trim().toLowerCase()); - - sigil = null; - witnesses.clear(); - } else if ("witness".equals(name)) { - sigil = 
Objects.firstNonNull(attributeValue(attr, "sigil"), "w" + (witnesses.size() + 1)); - startTextRecording(); - } - } - - @Override - public void endTransformingElement(String uri, String name, String raw) throws ProcessingException, IOException, SAXException { - if (!COLLATEX_NS.equals(uri)) { - return; - } - if ("collation".equals(name) && !witnesses.isEmpty()) { - ignoreHooksCount++; - final VariantGraph graph = new SimpleCollation(witnesses, algorithm, joined).collate(new JungVariantGraph()); - switch (format) { - case TEI_APPARATUS: - sendTeiApparatus(graph); - break; - default: - sendAlignmentTable(graph); - break; - } - ignoreHooksCount--; - } else if ("witness".equals(name)) { - witnesses.add(new SimpleWitness(sigil, endTextRecording())); - } - } - - private void sendAlignmentTable(VariantGraph graph) throws SAXException { - startPrefixMapping("", COLLATEX_NS); - startElement(COLLATEX_NS, "alignment", "alignment", EMPTY_ATTRIBUTES); - final Set witnesses = graph.witnesses(); - final RowSortedTable> table = VariantGraphRanking.of(graph).asTable(); - - for (Integer rowIndex : table.rowKeySet()) { - final Map> row = table.row(rowIndex); - startElement(COLLATEX_NS, "row", "row", EMPTY_ATTRIBUTES); - for (Witness witness : witnesses) { - final AttributesImpl cellAttrs = new AttributesImpl(); - cellAttrs.addCDATAAttribute("sigil", witness.getSigil()); - startElement(COLLATEX_NS, "cell", "cell", cellAttrs); - if (row.containsKey(witness)) { - for (SimpleToken token : Ordering.natural().immutableSortedCopy(Iterables.filter(row.get(witness), SimpleToken.class))) { - sendTextEvent(token.getContent()); - } - } - endElement(COLLATEX_NS, "cell", "cell"); - - } - endElement(COLLATEX_NS, "row", "row"); - } - endElement(COLLATEX_NS, "alignment", "alignment"); - endPrefixMapping(""); - } - - private void sendTeiApparatus(VariantGraph graph) throws SAXException { - try { - ParallelSegmentationApparatus.generate(VariantGraphRanking.of(graph), new 
ParallelSegmentationApparatus.GeneratorCallback() { - @Override - public void start() { - try { - startPrefixMapping("cx", COLLATEX_NS); - startPrefixMapping("", TEI_NS); - startElement(COLLATEX_NS, "apparatus", "cx:apparatus", EMPTY_ATTRIBUTES); - } catch (SAXException e) { - throw Throwables.propagate(e); - } - } - - @Override - public void segment(SortedMap> contents) { - final SetMultimap segments = LinkedHashMultimap.create(); - for (Map.Entry> cell : contents.entrySet()) { - final StringBuilder sb = new StringBuilder(); - for (SimpleToken token : Ordering.natural().immutableSortedCopy(Iterables.filter(cell.getValue(), SimpleToken.class))) { - sb.append(token.getContent()); - } - segments.put(sb.toString(), cell.getKey()); - } - - final Set segmentContents = segments.keySet(); - try { - if (segmentContents.size() == 1) { - sendTextEvent(Iterables.getOnlyElement(segmentContents)); - } else { - startElement(TEI_NS, "app", "app", EMPTY_ATTRIBUTES); - for (String segment : segmentContents) { - final StringBuilder witnesses = new StringBuilder(); - for (Witness witness : segments.get(segment)) { - witnesses.append(witness.getSigil()).append(" "); - } - - final AttributesImpl attributes = new AttributesImpl(); - attributes.addCDATAAttribute("wit", witnesses.toString().trim()); - startElement(TEI_NS, "rdg", "rdg", attributes); - sendTextEvent(segment); - endElement(TEI_NS, "rdg", "rdg"); - } - endElement(TEI_NS, "app", "app"); - } - } catch (SAXException e) { - throw Throwables.propagate(e); - } - } - - @Override - public void end() { - try { - endElement(COLLATEX_NS, "apparatus", "cx:apparatus"); - endPrefixMapping(""); - endPrefixMapping("cx"); - } catch (SAXException e) { - throw Throwables.propagate(e); - } - } - }); - } catch (Throwable t) { - Throwables.propagateIfInstanceOf(Throwables.getRootCause(t), SAXException.class); - throw Throwables.propagate(t); - } - } - - static String attributeValue(Attributes attr, String localName) { - for (int ac = 0, al = 
attr.getLength(); ac < al; ac++) { - if (localName.equals(attr.getLocalName(ac))) { - return attr.getValue(ac); - } - } - return null; - } -} \ No newline at end of file diff --git a/collatex-cocoon/src/main/resources/COB-INF/sitemap.xmap b/collatex-cocoon/src/main/resources/COB-INF/sitemap.xmap deleted file mode 100644 index 9a9bf133c..000000000 --- a/collatex-cocoon/src/main/resources/COB-INF/sitemap.xmap +++ /dev/null @@ -1,29 +0,0 @@ - - - - - - - - - - - - \ No newline at end of file diff --git a/collatex-core/pom.xml b/collatex-core/pom.xml index 9d8b08342..843ffde91 100644 --- a/collatex-core/pom.xml +++ b/collatex-core/pom.xml @@ -1,65 +1,14 @@ - + 4.0.0 eu.interedition collatex - 1.6-SNAPSHOT + 1.7-SNAPSHOT collatex-core - 1.6-SNAPSHOT + 1.7-SNAPSHOT CollateX Core A Java library for collating textual sources, for example, to produce an apparatus. - - - org.neo4j - neo4j - true - - - net.sf.jung - jung-graph-impl - true - - - net.sf.jung - jung-visualization - test - - - org.mockito - mockito-all - - - commons-lang - commons-lang - test - - - - - - maven-javadoc-plugin - 2.9.1 - - - - API - eu.interedition.collatex - - - Collation Algorithms - - eu.interedition.collatex.dekker*:eu.interedition.collatex.medite:eu.interedition.collatex.needlemanwunsch - - - - Variant Graph Implementations - eu.interedition.collatex.jung*:eu.interedition.collatex.neo4j* - - - - - - diff --git a/collatex-core/src/main/java/eu/interedition/collatex/CollationAlgorithm.java b/collatex-core/src/main/java/eu/interedition/collatex/CollationAlgorithm.java index c4e6949ac..4211a50ae 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/CollationAlgorithm.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/CollationAlgorithm.java @@ -1,192 +1,180 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. 
- * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . - */ - -package eu.interedition.collatex; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; -import eu.interedition.collatex.dekker.Match; -import eu.interedition.collatex.needlemanwunsch.NeedlemanWunschAlgorithm; -import eu.interedition.collatex.needlemanwunsch.NeedlemanWunschScorer; -import eu.interedition.collatex.util.VariantGraphRanking; -import eu.interedition.collatex.util.VertexMatch; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.SortedSet; -import java.util.TreeSet; -import java.util.logging.Level; -import java.util.logging.Logger; - -/** - * @author Gregor Middell - */ -public interface CollationAlgorithm { - - void collate(VariantGraph against, Iterable witness); - - void collate(VariantGraph against, Iterable... witnesses); - - void collate(VariantGraph against, List> witnesses); - - abstract class Base implements CollationAlgorithm { - protected final Logger LOG = Logger.getLogger(getClass().getName()); - protected Map witnessTokenVertices; - - @Override - public void collate(VariantGraph against, Iterable... 
witnesses) { - collate(against, Arrays.asList(witnesses)); - } - - @Override - public void collate(VariantGraph against, List> witnesses) { - for (Iterable witness : witnesses) { - if (LOG.isLoggable(Level.FINE)) { - LOG.log(Level.FINE, "heap space: {0}/{1}", new Object[] { - Runtime.getRuntime().totalMemory(), - Runtime.getRuntime().maxMemory() - }); - } - collate(against, witness); - } - } - - protected void merge(VariantGraph into, Iterable witnessTokens, Map alignments) { - Preconditions.checkArgument(!Iterables.isEmpty(witnessTokens), "Empty witness"); - final Witness witness = Iterables.getFirst(witnessTokens, null).getWitness(); - - if (LOG.isLoggable(Level.FINE)) { - LOG.log(Level.FINE, "{0} + {1}: Merge comparand into graph", new Object[] { into, witness }); - } - witnessTokenVertices = Maps.newHashMap(); - VariantGraph.Vertex last = into.getStart(); - final Set witnessSet = Collections.singleton(witness); - for (Token token : witnessTokens) { - VariantGraph.Vertex matchingVertex = alignments.get(token); - if (matchingVertex == null) { - matchingVertex = into.add(token); - } else { - if (LOG.isLoggable(Level.FINE)) { - LOG.log(Level.FINE, "Match: {0} to {1}", new Object[] { matchingVertex, token }); - } - matchingVertex.add(Collections.singleton(token)); - } - witnessTokenVertices.put(token, matchingVertex); - - into.connect(last, matchingVertex, witnessSet); - last = matchingVertex; - } - into.connect(last, into.getEnd(), witnessSet); - } - - protected void mergeTranspositions(VariantGraph into, Iterable> transpositions) { - for (SortedSet transposedPhrase : transpositions) { - if (LOG.isLoggable(Level.FINE)) { - LOG.log(Level.FINE, "Transposition: {0}", transposedPhrase); - } - final Set transposed = Sets.newHashSet(); - for (VertexMatch.WithToken match : transposedPhrase) { - transposed.add(witnessTokenVertices.get(match.token)); - transposed.add(match.vertex); - } - into.transpose(transposed); - } - } - - protected void mergeTranspositions(VariantGraph 
into, List> transpositions) { - for (List transposedPhrase : transpositions) { - if (LOG.isLoggable(Level.FINE)) { - LOG.log(Level.FINE, "Transposition: {0}", transposedPhrase); - } - final Set transposed = Sets.newHashSet(); - for (Match match : transposedPhrase) { - transposed.add(witnessTokenVertices.get(match.token)); - transposed.add(match.vertex); - } - into.transpose(transposed); - } - } - - protected void merge(VariantGraph graph, VariantGraph.Vertex[][] vertices, Token[] tokens, SortedSet> matches) { - final SortedSet[] matchesVertexOrder = (SortedSet[]) matches.toArray(new SortedSet[matches.size()]); - final SortedSet[] matchesTokenOrder = Arrays.copyOf(matchesVertexOrder, matchesVertexOrder.length); - - Arrays.sort(matchesTokenOrder, new Comparator>() { - @Override - public int compare(SortedSet o1, SortedSet o2) { - return (o1.first().token - o2.first().token); - } - }); - - final int mergedLength = Math.max(tokens.length, vertices.length); - final Set> inOrderMatches = NeedlemanWunschAlgorithm.align( - matchesVertexOrder, - matchesTokenOrder, - new NeedlemanWunschScorer, SortedSet>() { - - @Override - public float score(SortedSet a, SortedSet b) { - return (a.equals(b) ? 
1 : -mergedLength); - } - - @Override - public float gap() { - return -(1 / (mergedLength * 1.0f)); - } - } - ).keySet(); - - final List> transpositions = new ArrayList>(); - for (SortedSet phraseMatch : matches) { - if (!inOrderMatches.contains(phraseMatch)) { - transpositions.add(phraseMatch); - } - } - - - final Map matchedTokens = Maps.newHashMap(); - for (SortedSet phraseMatch : matches) { - for (VertexMatch.WithTokenIndex tokenMatch : phraseMatch) { - matchedTokens.put(tokens[tokenMatch.token], tokenMatch.vertex); - } - } - - final List> transposedTokens = Lists.newLinkedList(); - for (SortedSet transposition : transpositions) { - final SortedSet transpositionMatch = new TreeSet(); - for (VertexMatch.WithTokenIndex match : transposition) { - matchedTokens.remove(tokens[match.token]); - transpositionMatch.add(new VertexMatch.WithToken(match.vertex, match.vertexRank, tokens[match.token])); - } - transposedTokens.add(transpositionMatch); - } - - merge(graph, Arrays.asList(tokens), matchedTokens); - mergeTranspositions(graph, transposedTokens); - } - } -} +/* + * Copyright (c) 2015 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . 
+ */ + +package eu.interedition.collatex; + +import eu.interedition.collatex.dekker.Match; +import eu.interedition.collatex.needlemanwunsch.NeedlemanWunschAlgorithm; +import eu.interedition.collatex.needlemanwunsch.NeedlemanWunschScorer; +import eu.interedition.collatex.util.VertexMatch; + +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +/** + * @author Gregor Middell + */ +public interface CollationAlgorithm { + + void collate(VariantGraph against, Iterable witness); + + void collate(VariantGraph against, Iterable... witnesses); + + void collate(VariantGraph against, List> witnesses); + + abstract class Base implements CollationAlgorithm { + protected final Logger LOG = Logger.getLogger(getClass().getName()); + protected Map witnessTokenVertices; + + @Override + public void collate(VariantGraph against, Iterable... 
witnesses) { + collate(against, Arrays.asList(witnesses)); + } + + @Override + public void collate(VariantGraph against, List> witnesses) { + for (Iterable witness : witnesses) { + if (LOG.isLoggable(Level.FINE)) { + LOG.log(Level.FINE, "heap space: {0}/{1}", new Object[]{ + Runtime.getRuntime().totalMemory(), + Runtime.getRuntime().maxMemory() + }); + } + collate(against, witness); + } + } + + protected void merge(VariantGraph into, Iterable witnessTokens, Map alignments) { + final Witness witness = StreamSupport.stream(witnessTokens.spliterator(), false) + .findFirst() + .map(Token::getWitness) + .orElseThrow(() -> new IllegalArgumentException("Empty witness")); + + if (LOG.isLoggable(Level.FINE)) { + LOG.log(Level.FINE, "{0} + {1}: Merge comparand into graph", new Object[]{into, witness}); + } + witnessTokenVertices = new HashMap<>(); + VariantGraph.Vertex last = into.getStart(); + final Set witnessSet = Collections.singleton(witness); + for (Token token : witnessTokens) { + VariantGraph.Vertex matchingVertex = alignments.get(token); + if (matchingVertex == null) { + matchingVertex = into.add(token); + } else { + if (LOG.isLoggable(Level.FINE)) { + LOG.log(Level.FINE, "Match: {0} to {1}", new Object[]{matchingVertex, token}); + } + matchingVertex.add(Collections.singleton(token)); + } + witnessTokenVertices.put(token, matchingVertex); + + into.connect(last, matchingVertex, witnessSet); + last = matchingVertex; + } + into.connect(last, into.getEnd(), witnessSet); + } + + protected void mergeTranspositions(VariantGraph into, Iterable> transpositions) { + for (SortedSet transposedPhrase : transpositions) { + if (LOG.isLoggable(Level.FINE)) { + LOG.log(Level.FINE, "Transposition: {0}", transposedPhrase); + } + final Set transposed = new HashSet<>(); + for (VertexMatch.WithToken match : transposedPhrase) { + transposed.add(witnessTokenVertices.get(match.token)); + transposed.add(match.vertex); + } + into.transpose(transposed); + } + } + + protected void 
mergeTranspositions(VariantGraph into, List> transpositions) { + for (List transposedPhrase : transpositions) { + if (LOG.isLoggable(Level.FINE)) { + LOG.log(Level.FINE, "Transposition: {0}", transposedPhrase); + } + final Set transposed = new HashSet<>(); + for (Match match : transposedPhrase) { + transposed.add(witnessTokenVertices.get(match.token)); + transposed.add(match.vertex); + } + into.transpose(transposed); + } + } + + protected void merge(VariantGraph graph, VariantGraph.Vertex[][] vertices, Token[] tokens, SortedSet> matches) { + @SuppressWarnings("unchecked") + final SortedSet[] matchesVertexOrder = matches.toArray(new SortedSet[matches.size()]); + final SortedSet[] matchesTokenOrder = Arrays.copyOf(matchesVertexOrder, matchesVertexOrder.length); + + Arrays.sort(matchesTokenOrder, Comparator.comparing(m -> m.first().token)); + + final Set> alignedMatches = NeedlemanWunschAlgorithm.align( + matchesVertexOrder, + matchesTokenOrder, + new MatchPhraseAlignmentScorer(Math.max(tokens.length, vertices.length)) + ).keySet(); + + final Map alignments = matches.stream() + .filter(alignedMatches::contains) + .flatMap(Set::stream) + .collect(Collectors.toMap(m -> tokens[m.token], m -> m.vertex)); + + final List> transpositions = matches.stream() + .filter(m -> !alignedMatches.contains(m)) + .map(t -> t.stream().map(m -> new VertexMatch.WithToken(m.vertex, m.vertexRank, tokens[m.token])).collect(Collectors.toCollection(TreeSet::new))) + .collect(Collectors.toList()); + + merge(graph, Arrays.asList(tokens), alignments); + mergeTranspositions(graph, transpositions); + } + } + + static class MatchPhraseAlignmentScorer implements NeedlemanWunschScorer, SortedSet> { + + private final int maxWitnessLength; + + public MatchPhraseAlignmentScorer(int maxWitnessLength) { + this.maxWitnessLength = maxWitnessLength; + } + + @Override + public float score(SortedSet a, SortedSet b) { + return (a.equals(b) ? 
1 : -maxWitnessLength); + } + + @Override + public float gap() { + return -(1 / (maxWitnessLength * 1.0f)); + } + + } +} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/CollationAlgorithmFactory.java b/collatex-core/src/main/java/eu/interedition/collatex/CollationAlgorithmFactory.java index b95755d17..2847a6e2d 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/CollationAlgorithmFactory.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/CollationAlgorithmFactory.java @@ -1,58 +1,58 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . 
- */ - -package eu.interedition.collatex; - -import com.google.common.base.Function; -import eu.interedition.collatex.dekker.DekkerAlgorithm; -import eu.interedition.collatex.dekker.matrix.MatchTableLinker; -import eu.interedition.collatex.util.GreedyStringTilingAlgorithm; -import eu.interedition.collatex.util.VertexMatch; -import eu.interedition.collatex.medite.MediteAlgorithm; -import eu.interedition.collatex.needlemanwunsch.NeedlemanWunschAlgorithm; - -import java.util.Comparator; -import java.util.SortedSet; - -/** - * @author Gregor Middell - * @author Ronald Haentjens Dekker - */ -public class CollationAlgorithmFactory { - - public static CollationAlgorithm dekker(Comparator comparator) { - return dekkerMatchMatrix(comparator, 3); - } - - public static CollationAlgorithm dekkerMatchMatrix(Comparator comparator, int outlierTranspositionsSizeLimit) { - return new DekkerAlgorithm(comparator, new MatchTableLinker(outlierTranspositionsSizeLimit)); - } - - public static CollationAlgorithm needlemanWunsch(Comparator comparator) { - return new NeedlemanWunschAlgorithm(comparator); - } - - public static CollationAlgorithm greedyStringTiling(Comparator comparator, int minimumTileLength) { - return new GreedyStringTilingAlgorithm(comparator, minimumTileLength); - } - - public static CollationAlgorithm medite(Comparator comparator, Function, Integer> matchEvaluator) { - return new MediteAlgorithm(comparator, matchEvaluator); - } -} +/* + * Copyright (c) 2015 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + +package eu.interedition.collatex; + +import eu.interedition.collatex.dekker.DekkerAlgorithm; +import eu.interedition.collatex.dekker.matrix.MatchTableLinker; +import eu.interedition.collatex.medite.MediteAlgorithm; +import eu.interedition.collatex.needlemanwunsch.NeedlemanWunschAlgorithm; +import eu.interedition.collatex.util.GreedyStringTilingAlgorithm; +import eu.interedition.collatex.util.VertexMatch; + +import java.util.Comparator; +import java.util.SortedSet; +import java.util.function.Function; + +/** + * @author Gregor Middell + * @author Ronald Haentjens Dekker + */ +public class CollationAlgorithmFactory { + + public static CollationAlgorithm dekker(Comparator comparator) { + return dekkerMatchMatrix(comparator, 3); + } + + public static CollationAlgorithm dekkerMatchMatrix(Comparator comparator, int outlierTranspositionsSizeLimit) { + return new DekkerAlgorithm(comparator, new MatchTableLinker()); + } + + public static CollationAlgorithm needlemanWunsch(Comparator comparator) { + return new NeedlemanWunschAlgorithm(comparator); + } + + public static CollationAlgorithm greedyStringTiling(Comparator comparator, int minimumTileLength) { + return new GreedyStringTilingAlgorithm(comparator, minimumTileLength); + } + + public static CollationAlgorithm medite(Comparator comparator, Function, Integer> matchEvaluator) { + return new MediteAlgorithm(comparator, matchEvaluator); + } +} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/Token.java b/collatex-core/src/main/java/eu/interedition/collatex/Token.java index 6954de509..e43705a91 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/Token.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/Token.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. 
+ * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -19,18 +19,10 @@ package eu.interedition.collatex; -import com.google.common.base.Function; /** - * The normalized version of the token. + * The normalized version of the token. */ public interface Token { - Witness getWitness(); - - final Function TO_WITNESS = new Function() { - @Override - public Witness apply(Token input) { - return input.getWitness(); - } - }; + Witness getWitness(); } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/VariantGraph.java b/collatex-core/src/main/java/eu/interedition/collatex/VariantGraph.java index a5c78ac1d..8e039bd64 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/VariantGraph.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/VariantGraph.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -19,156 +19,187 @@ package eu.interedition.collatex; -import com.google.common.base.Function; -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; +import eu.interedition.collatex.util.VariantGraphTraversal; -import javax.annotation.Nullable; import java.util.ArrayDeque; +import java.util.Collection; +import java.util.Collections; import java.util.Deque; -import java.util.List; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Optional; import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; /** - * @author Gregor Middell + * @author Gregor Middell */ -public interface VariantGraph { - Vertex getStart(); - - Vertex getEnd(); - - Set transpositions(); - - Iterable vertices(); - - Iterable vertices(Set witnesses); - - Iterable edges(); - - Iterable edges(Set witnesses); - - Vertex add(Token token); - - Edge connect(Vertex from, Vertex to, Set witnesses); - - Edge register(Witness witness); +public class VariantGraph { + final VariantGraph.Vertex start; + final VariantGraph.Vertex end; + final Map>> transpositionIndex = new HashMap<>(); + + public VariantGraph() { + super(); + this.start = new VariantGraph.Vertex(this); + this.end = new VariantGraph.Vertex(this); + + this.start.outgoing.put(this.end, Collections.emptySet()); + this.end.incoming.put(this.start, Collections.emptySet()); + } - Transposition transpose(Set vertices); + public Vertex getStart() { + return start; + } - Edge edgeBetween(Vertex a, Vertex b); + public Vertex getEnd() { + return end; + } - Set witnesses(); + public Set> transpositions() { + return transpositionIndex.values().stream().flatMap(Set::stream).collect(Collectors.toSet()); + } + public Iterable vertices() { + return VariantGraphTraversal.of(this); + } - /** - * @author Gregor Middell - */ - interface Edge { + public Vertex add(Token token) { + final VariantGraph.Vertex vertex 
= new VariantGraph.Vertex(this); + vertex.tokens.add(token); + return vertex; + } - VariantGraph graph(); + public void connect(Vertex from, Vertex to, Set witnesses) { + if (from.equals(to)) { + throw new IllegalArgumentException(); + } - Edge add(Set witnesses); + witnesses = new HashSet<>(witnesses); + Optional.ofNullable(from.outgoing.remove(to)).ifPresent(witnesses::addAll); - Set witnesses(); + from.outgoing.put(to, witnesses); + to.incoming.put(from, witnesses); - Vertex from(); + start.outgoing.remove(end); + end.incoming.remove(start); + } - Vertex to(); + public Set transpose(Set vertices) { + if (vertices.isEmpty()) { + throw new IllegalArgumentException(); + } + for (Set transposition : vertices.iterator().next().transpositions()) { + if (transposition.equals(vertices)) { + return transposition; + } + } + final Set t = new HashSet<>(vertices); + for (VariantGraph.Vertex vertex : t) { + transpositionIndex.computeIfAbsent(vertex, v -> new HashSet<>()).add(t); + } + return t; + } - void delete(); - } + public Set witnesses() { + return start.outgoing().values().stream().flatMap(Collection::stream).collect(Collectors.toSet()); + } - /** - * @author Gregor Middell - */ - interface Vertex { - Iterable incoming(); + @Override + public String toString() { + return witnesses().toString(); + } - Iterable incoming(Set witnesses); - Iterable outgoing(); + /** + * @author Gregor Middell + */ + public static class Vertex { + private final VariantGraph graph; + private final Set tokens = new HashSet<>(); + private final Map> outgoing = new HashMap<>(); + private final Map> incoming = new HashMap<>(); - Iterable outgoing(Set witnesses); + public Vertex(VariantGraph graph) { + this.graph = graph; + } - Iterable transpositions(); + public Map> incoming() { + return incoming; + } - Set tokens(); + public Map> outgoing() { + return outgoing; + } - Set tokens(Set witnesses); + public Set> transpositions() { + return graph.transpositionIndex.getOrDefault(this, 
Collections.emptySet()); + } - Set witnesses(); + public Set tokens() { + return tokens; + } - void add(Iterable tokens); + public Set witnesses() { + return incoming().values().stream().flatMap(Set::stream).collect(Collectors.toSet()); + } - VariantGraph graph(); + public void add(Iterable tokens) { + tokens.forEach(this.tokens::add); + } - void delete(); - } + public VariantGraph graph() { + return graph; + } - /** - * @author Gregor Middell - */ - interface Transposition extends Iterable { - void delete(); - } + public String toString() { + return tokens.toString(); + } + } - final Function JOIN = new Function() { - @Override - public VariantGraph apply(@Nullable VariantGraph graph) { - final Set processed = Sets.newHashSet(); - - final Vertex end = graph.getEnd(); - final Deque queue = new ArrayDeque(); - for (VariantGraph.Edge startingEdges : graph.getStart().outgoing()) { - queue.push(startingEdges.to()); - } - - while (!queue.isEmpty()) { - final Vertex vertex = queue.pop(); - final Set transpositions = Sets.newHashSet(vertex.transpositions()); - final List outgoingEdges = Lists.newArrayList(vertex.outgoing()); - if (outgoingEdges.size() == 1) { - final Edge joinCandidateEdge = outgoingEdges.get(0); - final Vertex joinCandidateVertex = joinCandidateEdge.to(); - final Set joinCandidateTranspositions = Sets.newHashSet(joinCandidateVertex.transpositions()); - - boolean canJoin = !end.equals(joinCandidateVertex) && // - Iterables.size(joinCandidateVertex.incoming()) == 1 && // - transpositions.equals(joinCandidateTranspositions); - if (canJoin) { - vertex.add(joinCandidateVertex.tokens()); - for (Transposition t : Sets.newHashSet(joinCandidateVertex.transpositions())) { - final Set transposed = Sets.newHashSet(t); - transposed.remove(joinCandidateVertex); - transposed.add(vertex); - t.delete(); - graph.transpose(transposed); - } - for (Edge e : Lists.newArrayList(joinCandidateVertex.outgoing())) { - final Vertex to = e.to(); - final Set witnesses = 
e.witnesses(); - e.delete(); - graph.connect(vertex, to, witnesses); + public static final Function JOIN = graph -> { + final Set processed = new HashSet<>(); + final Deque queue = new ArrayDeque<>(graph.start.outgoing.keySet()); + + while (!queue.isEmpty()) { + final Vertex vertex = queue.pop(); + final Set> transpositions = new HashSet<>(vertex.transpositions()); + if (vertex.outgoing.size() == 1) { + final Vertex joinCandidateVertex = vertex.outgoing.keySet().iterator().next(); + final Set> joinCandidateTranspositions = new HashSet<>(joinCandidateVertex.transpositions()); + + boolean canJoin = !graph.end.equals(joinCandidateVertex) && // + joinCandidateVertex.incoming.size() == 1 && // + transpositions.equals(joinCandidateTranspositions); + if (canJoin) { + vertex.add(joinCandidateVertex.tokens()); + for (Set t : new HashSet<>(joinCandidateVertex.transpositions())) { + final Set transposed = new HashSet<>(t); + transposed.remove(joinCandidateVertex); + transposed.add(vertex); + for (Vertex tv : t) { + graph.transpositionIndex.getOrDefault(tv, Collections.emptySet()).remove(t); + } + graph.transpose(transposed); + } + + vertex.outgoing.clear(); + vertex.outgoing.putAll(joinCandidateVertex.outgoing); + + vertex.outgoing.keySet().forEach(v -> v.incoming.put(vertex, v.incoming.remove(joinCandidateVertex))); + + queue.push(vertex); + continue; + } } - joinCandidateEdge.delete(); - joinCandidateVertex.delete(); - queue.push(vertex); - continue; - } - } - processed.add(vertex); - for (Edge e : outgoingEdges) { - final Vertex next = e.to(); - // FIXME: Why do we run out of memory in some cases here, if this is not checked? - if (!processed.contains(next)) { - queue.push(next); - } + // FIXME: Why do we run out of memory in some cases here, if this is not checked? 
+ processed.add(vertex); + vertex.outgoing.keySet().stream().filter(v -> !processed.contains(v)).forEach(queue::push); } - } - return graph; - } - }; + return graph; + }; } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/Witness.java b/collatex-core/src/main/java/eu/interedition/collatex/Witness.java index 75f2adb1f..b0fa4c834 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/Witness.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/Witness.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -19,33 +19,16 @@ package eu.interedition.collatex; -import com.google.common.base.Function; -import com.google.common.base.Joiner; -import com.google.common.collect.Ordering; - import java.util.Comparator; /** * IWitness - * + *

* Representation of a single textual witness - * */ public interface Witness { - String getSigil(); - - final Comparator SIGIL_COMPARATOR = new Comparator() { - @Override - public int compare(Witness o1, Witness o2) { - return o1.getSigil().compareTo(o2.getSigil()); - } - }; + String getSigil(); - final Function TO_SIGILS = new Function() { - @Override - public String apply(VariantGraph.Edge input) { - return Joiner.on(", ").join(Ordering.from(SIGIL_COMPARATOR).sortedCopy(input.witnesses())); - } - }; + final Comparator SIGIL_COMPARATOR = Comparator.comparing(Witness::getSigil); } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/dekker/DekkerAlgorithm.java b/collatex-core/src/main/java/eu/interedition/collatex/dekker/DekkerAlgorithm.java index e2d87edfe..e2141be8e 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/dekker/DekkerAlgorithm.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/dekker/DekkerAlgorithm.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -18,17 +18,6 @@ */ package eu.interedition.collatex.dekker; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.logging.Level; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - import eu.interedition.collatex.CollationAlgorithm; import eu.interedition.collatex.Token; import eu.interedition.collatex.VariantGraph; @@ -36,150 +25,162 @@ import eu.interedition.collatex.dekker.matrix.MatchTableLinker; import eu.interedition.collatex.util.VariantGraphRanking; -public class DekkerAlgorithm extends CollationAlgorithm.Base { - - private final Comparator comparator; - private final TokenLinker tokenLinker; - private final PhraseMatchDetector phraseMatchDetector; - private final TranspositionDetector transpositionDetector; - private Map tokenLinks; - private List> phraseMatches; - private List> transpositions; - private Map alignments; - private boolean mergeTranspositions = false; - - public DekkerAlgorithm(Comparator comparator) { - this(comparator, new MatchTableLinker(3)); - } - - public DekkerAlgorithm(Comparator comparator, TokenLinker tokenLinker) { - this.comparator = comparator; - this.tokenLinker = tokenLinker; - this.phraseMatchDetector = new PhraseMatchDetector(); - this.transpositionDetector = new TranspositionDetector(); - } - - @Override - public void collate(VariantGraph graph, Iterable tokens) { - Preconditions.checkArgument(!Iterables.isEmpty(tokens), "Empty witness"); - final Witness witness = Iterables.getFirst(tokens, null).getWitness(); - - if (LOG.isLoggable(Level.FINER)) { - LOG.log(Level.FINER, "{0} + {1}: {2} vs. 
{3}", new Object[] { graph, witness, graph.vertices(), tokens }); - } - - if (LOG.isLoggable(Level.FINE)) { - LOG.log(Level.FINE, "{0} + {1}: Match and link tokens", new Object[] { graph, witness }); - } - tokenLinks = tokenLinker.link(graph, tokens, comparator); - - if (LOG.isLoggable(Level.FINER)) { - for (Map.Entry tokenLink : tokenLinks.entrySet()) { - LOG.log(Level.FINER, "{0} + {1}: Token match: {2} = {3}", new Object[] { graph, witness, tokenLink.getValue(), tokenLink.getKey() }); - } - } - - if (LOG.isLoggable(Level.FINE)) { - LOG.log(Level.FINE, "{0} + {1}: Detect phrase matches", new Object[] { graph, witness }); - } - phraseMatches = phraseMatchDetector.detect(tokenLinks, graph, tokens); - if (LOG.isLoggable(Level.FINER)) { - for (List phraseMatch : phraseMatches) { - LOG.log(Level.FINER, "{0} + {1}: Phrase match: {2}", new Object[] { graph, witness, Iterables.toString(phraseMatch) }); - } - } - - if (LOG.isLoggable(Level.FINE)) { - LOG.log(Level.FINE, "{0} + {1}: Detect transpositions", new Object[] { graph, witness }); - } - transpositions = transpositionDetector.detect(phraseMatches, graph); - if (LOG.isLoggable(Level.FINE)) { - LOG.log(Level.FINE, "transpositions:{0}", transpositions); - } - - if (LOG.isLoggable(Level.FINER)) { - for (List transposition : transpositions) { - LOG.log(Level.FINER, "{0} + {1}: Transposition: {2}", new Object[] { graph, witness, Iterables.toString(transposition) }); - } - } - - if (LOG.isLoggable(Level.FINE)) { - LOG.log(Level.FINE, "{0} + {1}: Determine aligned tokens by filtering transpositions", new Object[] { graph, witness }); - } - alignments = Maps.newHashMap(); - for (List phrase : phraseMatches) { - for (Match match : phrase) { - alignments.put(match.token, match.vertex); - } - } - - for (List transposedPhrase : transpositions) { - for (Match match : transposedPhrase) { - alignments.remove(match.token); - } - } - if (LOG.isLoggable(Level.FINER)) { - for (Map.Entry alignment : alignments.entrySet()) { - 
LOG.log(Level.FINER, "{0} + {1}: Alignment: {2} = {3}", new Object[] { graph, witness, alignment.getValue(), alignment.getKey() }); - } - } - - merge(graph, tokens, alignments); - - // we filter out small transposed phrases over large distances - List> falseTranspositions = Lists.newArrayList(); - - VariantGraphRanking ranking = VariantGraphRanking.of(graph); - - for (List transposedPhrase : transpositions) { - Match match = transposedPhrase.get(0); - VariantGraph.Vertex v1 = witnessTokenVertices.get(match.token); - VariantGraph.Vertex v2 = match.vertex; - int distance = Math.abs(ranking.apply(v1)-ranking.apply(v2))-1; - if (distance > transposedPhrase.size()*3) { - falseTranspositions.add(transposedPhrase); - } - } +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.logging.Level; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; - for (List transposition : falseTranspositions) { - transpositions.remove(transposition); - } +public class DekkerAlgorithm extends CollationAlgorithm.Base { - if (mergeTranspositions) { - mergeTranspositions(graph, transpositions); - } - - if (LOG.isLoggable(Level.FINER)) { - LOG.log(Level.FINER, "!{0}: {1}", new Object[] {graph, Iterables.toString(graph.vertices())}); + private final Comparator comparator; + private final TokenLinker tokenLinker; + private final PhraseMatchDetector phraseMatchDetector; + private final TranspositionDetector transpositionDetector; + private Map tokenLinks; + private List> phraseMatches; + private List> transpositions; + private Map alignments; + private boolean mergeTranspositions = false; + + public DekkerAlgorithm(Comparator comparator) { + this(comparator, new MatchTableLinker()); + } + + public DekkerAlgorithm(Comparator comparator, TokenLinker tokenLinker) { + this.comparator = comparator; + this.tokenLinker = tokenLinker; + 
this.phraseMatchDetector = new PhraseMatchDetector(); + this.transpositionDetector = new TranspositionDetector(); + } + + @Override + public void collate(VariantGraph graph, Iterable tokens) { + final Witness witness = StreamSupport.stream(tokens.spliterator(), false) + .findFirst() + .map(Token::getWitness) + .orElseThrow(() -> new IllegalArgumentException("Empty witness")); + + if (LOG.isLoggable(Level.FINER)) { + LOG.log(Level.FINER, "{0} + {1}: {2} vs. {3}", new Object[]{graph, witness, graph.vertices(), tokens}); + } + + if (LOG.isLoggable(Level.FINE)) { + LOG.log(Level.FINE, "{0} + {1}: Match and link tokens", new Object[]{graph, witness}); + } + tokenLinks = tokenLinker.link(graph, tokens, comparator); + + if (LOG.isLoggable(Level.FINER)) { + for (Map.Entry tokenLink : tokenLinks.entrySet()) { + LOG.log(Level.FINER, "{0} + {1}: Token match: {2} = {3}", new Object[]{graph, witness, tokenLink.getValue(), tokenLink.getKey()}); + } + } + + if (LOG.isLoggable(Level.FINE)) { + LOG.log(Level.FINE, "{0} + {1}: Detect phrase matches", new Object[]{graph, witness}); + } + phraseMatches = phraseMatchDetector.detect(tokenLinks, graph, tokens); + if (LOG.isLoggable(Level.FINER)) { + for (List phraseMatch : phraseMatches) { + LOG.log(Level.FINER, "{0} + {1}: Phrase match: {2}", new Object[]{graph, witness, phraseMatch}); + } + } + + if (LOG.isLoggable(Level.FINE)) { + LOG.log(Level.FINE, "{0} + {1}: Detect transpositions", new Object[]{graph, witness}); + } + transpositions = transpositionDetector.detect(phraseMatches, graph); + if (LOG.isLoggable(Level.FINE)) { + LOG.log(Level.FINE, "transpositions:{0}", transpositions); + } + + if (LOG.isLoggable(Level.FINER)) { + for (List transposition : transpositions) { + LOG.log(Level.FINER, "{0} + {1}: Transposition: {2}", new Object[]{graph, witness, transposition}); + } + } + + if (LOG.isLoggable(Level.FINE)) { + LOG.log(Level.FINE, "{0} + {1}: Determine aligned tokens by filtering transpositions", new Object[]{graph, witness}); 
+ } + alignments = new HashMap<>(); + for (List phrase : phraseMatches) { + for (Match match : phrase) { + alignments.put(match.token, match.vertex); + } + } + + for (List transposedPhrase : transpositions) { + for (Match match : transposedPhrase) { + alignments.remove(match.token); + } + } + if (LOG.isLoggable(Level.FINER)) { + for (Map.Entry alignment : alignments.entrySet()) { + LOG.log(Level.FINER, "{0} + {1}: Alignment: {2} = {3}", new Object[]{graph, witness, alignment.getValue(), alignment.getKey()}); + } + } + + merge(graph, tokens, alignments); + + // we filter out small transposed phrases over large distances + List> falseTranspositions = new ArrayList<>(); + + VariantGraphRanking ranking = VariantGraphRanking.of(graph); + + for (List transposedPhrase : transpositions) { + Match match = transposedPhrase.get(0); + VariantGraph.Vertex v1 = witnessTokenVertices.get(match.token); + VariantGraph.Vertex v2 = match.vertex; + int distance = Math.abs(ranking.apply(v1) - ranking.apply(v2)) - 1; + if (distance > transposedPhrase.size() * 3) { + falseTranspositions.add(transposedPhrase); + } + } + + for (List transposition : falseTranspositions) { + transpositions.remove(transposition); + } + + if (mergeTranspositions) { + mergeTranspositions(graph, transpositions); + } + + if (LOG.isLoggable(Level.FINER)) { + LOG.log(Level.FINER, "!{0}: {1}", new Object[]{graph, StreamSupport.stream(graph.vertices().spliterator(), false).map(Object::toString).collect(Collectors.joining(", "))}); + } + } + + public Map getTokenLinks() { + return tokenLinks; + } + + public List> getPhraseMatches() { + return Collections.unmodifiableList(phraseMatches); + } + + public List> getTranspositions() { + return Collections.unmodifiableList(transpositions); + } + + public Map getAlignments() { + return Collections.unmodifiableMap(alignments); + } + + /* + * This check disables transposition rendering in the variant + * graph when the variant graph contains more then two witnesses. 
+ * Transposition detection is done in a progressive manner + * (witness by witness). When viewing the resulting graph + * containing the variation for all witnesses + * the detected transpositions can look strange, since segments + * may have split into smaller or larger parts. + */ + public void setMergeTranspositions(boolean b) { + this.mergeTranspositions = b; } - } - - public Map getTokenLinks() { - return tokenLinks; - } - - public List> getPhraseMatches() { - return Collections.unmodifiableList(phraseMatches); - } - - public List> getTranspositions() { - return Collections.unmodifiableList(transpositions); - } - - public Map getAlignments() { - return Collections.unmodifiableMap(alignments); - } - - /* - * This check disables transposition rendering in the variant - * graph when the variant graph contains more then two witnesses. - * Transposition detection is done in a progressive manner - * (witness by witness). When viewing the resulting graph - * containing the variation for all witnesses - * the detected transpositions can look strange, since segments - * may have split into smaller or larger parts. - */ - public void setMergeTranspositions(boolean b) { - this.mergeTranspositions = b; - } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/dekker/Match.java b/collatex-core/src/main/java/eu/interedition/collatex/dekker/Match.java index e44bc9b6c..60db37de2 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/dekker/Match.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/dekker/Match.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -19,78 +19,62 @@ package eu.interedition.collatex.dekker; -import com.google.common.base.Function; -import com.google.common.base.Objects; -import com.google.common.base.Predicate; -import com.google.common.collect.Lists; import eu.interedition.collatex.Token; import eu.interedition.collatex.VariantGraph; +import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.Objects; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.stream.Collectors; /** - * @author Gregor Middell + * @author Gregor Middell */ public class Match { - public final VariantGraph.Vertex vertex; - public final Token token; + public final VariantGraph.Vertex vertex; + public final Token token; - public Match(VariantGraph.Vertex vertex, Token token) { - this.vertex = vertex; - this.token = token; - } - - @Override - public int hashCode() { - return Objects.hashCode(vertex, token); - } + public Match(VariantGraph.Vertex vertex, Token token) { + this.vertex = vertex; + this.token = token; + } - @Override - public boolean equals(Object obj) { - if (obj != null && obj instanceof Match) { - Match other = (Match) obj; - return vertex.equals(other.vertex) && token.equals(other.token); + @Override + public int hashCode() { + return Objects.hash(vertex, token); } - return super.equals(obj); - } - @Override - public String toString() { - return new StringBuilder("{").append(vertex).append("; ").append(token).append("}").toString(); - } + @Override + public boolean equals(Object obj) { + if (obj != null && obj instanceof Match) { + Match other = (Match) obj; + return vertex.equals(other.vertex) && token.equals(other.token); + } + return super.equals(obj); + } - public static List createPhraseMatch(List vertices, List tokens) { - final List phraseMatch = Lists.newArrayListWithExpectedSize(vertices.size()); - final Iterator vertexIt = vertices.iterator(); - final Iterator tokenIt = tokens.iterator(); - while (vertexIt.hasNext() 
&& tokenIt.hasNext()) { - phraseMatch.add(new Match(vertexIt.next(), tokenIt.next())); + @Override + public String toString() { + return "{" + vertex + "; " + token + "}"; } - return phraseMatch; - } + public static List createPhraseMatch(List vertices, List tokens) { + final List phraseMatch = new ArrayList<>(vertices.size()); + final Iterator vertexIt = vertices.iterator(); + final Iterator tokenIt = tokens.iterator(); + while (vertexIt.hasNext() && tokenIt.hasNext()) { + phraseMatch.add(new Match(vertexIt.next(), tokenIt.next())); + } + return phraseMatch; + } - public static Predicate createNoBoundaryMatchPredicate(final VariantGraph graph) { - return new Predicate() { - @Override - public boolean apply(Match input) { - return !input.vertex.equals(graph.getStart()) && !input.vertex.equals(graph.getEnd()); - } - }; - } - public static final Function MATCH_TO_TOKENS = new Function() { - @Override - public Token apply(Match input) { - return input.token; + public static Predicate createNoBoundaryMatchPredicate(final VariantGraph graph) { + return input -> !input.vertex.equals(graph.getStart()) && !input.vertex.equals(graph.getEnd()); } - }; - public static final Function, List> PHRASE_MATCH_TO_TOKENS = new Function, List>() { - @Override - public List apply(List input) { - return Lists.transform(input, MATCH_TO_TOKENS); - } - }; + public static final Function, List> PHRASE_MATCH_TO_TOKENS = input -> input.stream().map(m -> m.token).collect(Collectors.toList()); } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/dekker/PhraseMatchDetector.java b/collatex-core/src/main/java/eu/interedition/collatex/dekker/PhraseMatchDetector.java index 3a1abbcb0..887b4cfd5 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/dekker/PhraseMatchDetector.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/dekker/PhraseMatchDetector.java @@ -1,85 +1,76 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. 
- * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . - */ -package eu.interedition.collatex.dekker; - -import java.util.List; -import java.util.Map; - -import com.google.common.collect.Sets; -import eu.interedition.collatex.VariantGraph; -import eu.interedition.collatex.neo4j.Neo4jGraphRelationships; -import eu.interedition.collatex.neo4j.Neo4jVariantGraphVertex; -import org.neo4j.graphdb.Direction; -import org.neo4j.graphdb.Node; -import org.neo4j.graphdb.Relationship; - -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; - -import eu.interedition.collatex.Token; - -/** - * - * @author Ronald Haentjens Dekker - * @author Bram Buitendijk - */ -public class PhraseMatchDetector { - - public List> detect(Map linkedTokens, VariantGraph base, Iterable tokens) { - List> phraseMatches = Lists.newArrayList(); - List basePhrase = Lists.newArrayList(); - List witnessPhrase = Lists.newArrayList(); - VariantGraph.Vertex previous = base.getStart(); - - for (Token token : tokens) { - if (!linkedTokens.containsKey(token)) { - addNewPhraseMatchAndClearBuffer(phraseMatches, basePhrase, witnessPhrase); - continue; - } - VariantGraph.Vertex baseVertex = linkedTokens.get(token); - // requirements: - // - previous and base vertex should have the same witnesses - // - previous and base vertex should either be in the same transposition(s) or both aren't in any 
transpositions - // - there should be a directed edge between previous and base vertex - // - there may not be a longer path between previous and base vertex - boolean sameTranspositions = Sets.newHashSet(previous.transpositions()).equals(Sets.newHashSet(baseVertex.transpositions())); - boolean sameWitnesses = previous.witnesses().equals(baseVertex.witnesses()); - boolean directedEdge = (base.edgeBetween(previous, baseVertex) != null); - boolean isNear = sameTranspositions && sameWitnesses && directedEdge && (Iterables.size(previous.outgoing()) == 1 || Iterables.size(baseVertex.incoming()) == 1); - if (!isNear) { - addNewPhraseMatchAndClearBuffer(phraseMatches, basePhrase, witnessPhrase); - } - basePhrase.add(baseVertex); - witnessPhrase.add(token); - previous = baseVertex; - } - if (!basePhrase.isEmpty()) { - phraseMatches.add(Match.createPhraseMatch(basePhrase, witnessPhrase)); - } - return phraseMatches; - } - - private void addNewPhraseMatchAndClearBuffer(List> phraseMatches, List basePhrase, List witnessPhrase) { - if (!basePhrase.isEmpty()) { - phraseMatches.add(Match.createPhraseMatch(basePhrase, witnessPhrase)); - basePhrase.clear(); - witnessPhrase.clear(); - } - } -} +/* + * Copyright (c) 2015 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . 
+ */ +package eu.interedition.collatex.dekker; + +import eu.interedition.collatex.Token; +import eu.interedition.collatex.VariantGraph; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; + +/** + * @author Ronald Haentjens Dekker + * @author Bram Buitendijk + */ +public class PhraseMatchDetector { + + public List> detect(Map linkedTokens, VariantGraph base, Iterable tokens) { + List> phraseMatches = new ArrayList<>(); + List basePhrase = new ArrayList<>(); + List witnessPhrase = new ArrayList<>(); + VariantGraph.Vertex previous = base.getStart(); + + for (Token token : tokens) { + if (!linkedTokens.containsKey(token)) { + addNewPhraseMatchAndClearBuffer(phraseMatches, basePhrase, witnessPhrase); + continue; + } + VariantGraph.Vertex baseVertex = linkedTokens.get(token); + // requirements: + // - previous and base vertex should have the same witnesses + // - previous and base vertex should either be in the same transposition(s) or both aren't in any transpositions + // - there should be a directed edge between previous and base vertex + // - there may not be a longer path between previous and base vertex + boolean sameTranspositions = new HashSet<>(previous.transpositions()).equals(new HashSet<>(baseVertex.transpositions())); + boolean sameWitnesses = previous.witnesses().equals(baseVertex.witnesses()); + boolean directedEdge = previous.outgoing().containsKey(baseVertex); + boolean isNear = sameTranspositions && sameWitnesses && directedEdge && (previous.outgoing().size() == 1 || baseVertex.incoming().size() == 1); + if (!isNear) { + addNewPhraseMatchAndClearBuffer(phraseMatches, basePhrase, witnessPhrase); + } + basePhrase.add(baseVertex); + witnessPhrase.add(token); + previous = baseVertex; + } + if (!basePhrase.isEmpty()) { + phraseMatches.add(Match.createPhraseMatch(basePhrase, witnessPhrase)); + } + return phraseMatches; + } + + private void addNewPhraseMatchAndClearBuffer(List> phraseMatches, List 
basePhrase, List witnessPhrase) { + if (!basePhrase.isEmpty()) { + phraseMatches.add(Match.createPhraseMatch(basePhrase, witnessPhrase)); + basePhrase.clear(); + witnessPhrase.clear(); + } + } +} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/dekker/TokenLinker.java b/collatex-core/src/main/java/eu/interedition/collatex/dekker/TokenLinker.java index 9de49acea..137b76d2c 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/dekker/TokenLinker.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/dekker/TokenLinker.java @@ -1,33 +1,33 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . - */ - -package eu.interedition.collatex.dekker; - -import java.util.Comparator; -import java.util.Map; - -import eu.interedition.collatex.Token; -import eu.interedition.collatex.VariantGraph; - - -public interface TokenLinker { - - Map link(VariantGraph base, Iterable witness, Comparator comparator); - +/* + * Copyright (c) 2015 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + +package eu.interedition.collatex.dekker; + +import eu.interedition.collatex.Token; +import eu.interedition.collatex.VariantGraph; + +import java.util.Comparator; +import java.util.Map; + + +public interface TokenLinker { + + Map link(VariantGraph base, Iterable witness, Comparator comparator); + } \ No newline at end of file diff --git a/collatex-core/src/main/java/eu/interedition/collatex/dekker/TranspositionDetector.java b/collatex-core/src/main/java/eu/interedition/collatex/dekker/TranspositionDetector.java index 27f73126b..75c461c35 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/dekker/TranspositionDetector.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/dekker/TranspositionDetector.java @@ -1,197 +1,190 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . 
- */ -package eu.interedition.collatex.dekker; - -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; - -import eu.interedition.collatex.VariantGraph; -import eu.interedition.collatex.simple.SimpleToken; -import eu.interedition.collatex.util.VariantGraphRanking; - -/** - * - * @author Ronald Haentjens Dekker - */ -public class TranspositionDetector { - private Map, Integer> phraseMatchToIndex; - - public List> detect(final List> phraseMatches, VariantGraph base) { - // if there are no phrase matches it is not possible - // to detect transpositions, return an empty list - if (phraseMatches.isEmpty()) { - return Lists.newArrayList(); - } - - /* - * We order the phrase matches in the topological order - * of the graph (called rank). When the rank is equal - * for two phrase matches, the witness order is used - * to differentiate. 
- */ - final VariantGraphRanking ranking = rankTheGraph(phraseMatches, base); - - Comparator> comp = new Comparator>() { - @Override - public int compare(List pm1, List pm2) { - int rank1 = ranking.apply(pm1.get(0).vertex); - int rank2 = ranking.apply(pm2.get(0).vertex); - int difference = rank1 - rank2; - if (difference != 0) { - return difference; - } - int index1 = phraseMatches.indexOf(pm1); - int index2 = phraseMatches.indexOf(pm2); - return index1 - index2; - } - }; - - List> phraseMatchesGraphOrder = Lists.newArrayList(phraseMatches); - Collections.sort(phraseMatchesGraphOrder, comp); - - // Map 1 - phraseMatchToIndex = Maps.newHashMap(); - for (int i = 0; i < phraseMatchesGraphOrder.size(); i++) { - phraseMatchToIndex.put(phraseMatchesGraphOrder.get(i), i); - } - - /* - * We calculate the index for all the phrase matches - * First in witness order, then in graph order - */ - List phraseMatchesGraphIndex = Lists.newArrayList(); - List phraseMatchesWitnessIndex = Lists.newArrayList(); - - for (int i=0; i < phraseMatches.size(); i++) { - phraseMatchesGraphIndex.add(i); - } - - for (List phraseMatch : phraseMatches) { - phraseMatchesWitnessIndex.add(phraseMatchToIndex.get(phraseMatch)); - } - - /* - * Initialize result variables - */ - List> nonTransposedPhraseMatches = Lists.newArrayList(phraseMatches); - List> transpositions = Lists.newArrayList(); - - /* - * loop here until the maximum distance == 0 - */ - while (true) { - // Map 2 - final Map, Integer> phraseMatchToDistanceMap = Maps.newLinkedHashMap(); - for (int i=0; i < nonTransposedPhraseMatches.size(); i++) { - Integer graphIndex = phraseMatchesGraphIndex.get(i); - Integer witnessIndex = phraseMatchesWitnessIndex.get(i); - Integer distance = Math.abs(graphIndex - witnessIndex); - List phraseMatch = nonTransposedPhraseMatches.get(i); - phraseMatchToDistanceMap.put(phraseMatch, distance); - } - - List distanceList = Lists.newArrayList(phraseMatchToDistanceMap.values()); - - if 
(distanceList.isEmpty()||Collections.max(distanceList) == 0) { - break; - } - - // sort phrase matches on distance, size - // TODO: order by 3) graph rank? - // TODO: I have not yet found evidence/a use case that - // TODO: indicates that it is needed. - Comparator> comp2 = new Comparator>() { - @Override - public int compare(List pm1, List pm2) { - // first order by distance - int distance1 = phraseMatchToDistanceMap.get(pm1); - int distance2 = phraseMatchToDistanceMap.get(pm2); - int difference = distance2 - distance1; - if (difference != 0) { - return difference; - } - // second order by size - // return pm1.size() - pm2.size(); - return determineSize(pm1) - determineSize(pm2); - } - }; - - List> sortedPhraseMatches = Lists.newArrayList(nonTransposedPhraseMatches); - Collections.sort(sortedPhraseMatches, comp2); - - List transposedPhrase = sortedPhraseMatches.remove(0); - - Integer transposedIndex = phraseMatchToIndex.get(transposedPhrase); - Integer graphIndex = phraseMatchesGraphIndex.indexOf(transposedIndex); - Integer transposedWithIndex = phraseMatchesWitnessIndex.get(graphIndex); - List linkedTransposedPhrase = phraseMatchesGraphOrder.get(transposedWithIndex); - - addTransposition(phraseMatchesWitnessIndex, phraseMatchesGraphIndex, nonTransposedPhraseMatches, transpositions, transposedPhrase); - - Integer distance = phraseMatchToDistanceMap.get(transposedPhrase); - if (distance == phraseMatchToDistanceMap.get(linkedTransposedPhrase) && distance > 1) { - addTransposition(phraseMatchesWitnessIndex, phraseMatchesGraphIndex, nonTransposedPhraseMatches, transpositions, linkedTransposedPhrase); - } - } - return transpositions; - } - - private void addTransposition(List phraseWitnessRanks, List phraseGraphRanks, List> nonTransposedPhraseMatches, List> transpositions, List transposedPhrase) { - Integer indexToRemove = phraseMatchToIndex.get(transposedPhrase); - nonTransposedPhraseMatches.remove(transposedPhrase); - transpositions.add(transposedPhrase); - 
phraseGraphRanks.remove(indexToRemove); - phraseWitnessRanks.remove(indexToRemove); - } - - private VariantGraphRanking rankTheGraph(List> phraseMatches, VariantGraph base) { - // rank the variant graph - Set matchedVertices = Sets.newHashSet(); - for (List phraseMatch : phraseMatches) { - matchedVertices.add(phraseMatch.get(0).vertex); - } - final VariantGraphRanking ranking = VariantGraphRanking.ofOnlyCertainVertices(base, null, matchedVertices); - return ranking; - } - - /* - * in case of an a, b / b, a transposition we have to determine whether a or b - * stays put. the phrase with the most character stays still if the tokens are - * not simple tokens the phrase with the most tokens stays put - */ - private int determineSize(List t) { - Match firstMatch = t.get(0); - if (!(firstMatch.token instanceof SimpleToken)) { - return t.size(); - } - int charLength = 0; - for (Match m : t) { - SimpleToken token = (SimpleToken) m.token; - charLength += token.getNormalized().length(); - } - return charLength; - } -} +/* + * Copyright (c) 2015 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . 
+ */ +package eu.interedition.collatex.dekker; + +import eu.interedition.collatex.VariantGraph; +import eu.interedition.collatex.simple.SimpleToken; +import eu.interedition.collatex.util.VariantGraphRanking; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * @author Ronald Haentjens Dekker + */ +public class TranspositionDetector { + private Map, Integer> phraseMatchToIndex; + + public List> detect(final List> phraseMatches, VariantGraph base) { + // if there are no phrase matches it is not possible + // to detect transpositions, return an empty list + if (phraseMatches.isEmpty()) { + return new ArrayList<>(); + } + + /* + * We order the phrase matches in the topological order + * of the graph (called rank). When the rank is equal + * for two phrase matches, the witness order is used + * to differentiate. 
+ */ + final VariantGraphRanking ranking = rankTheGraph(phraseMatches, base); + + Comparator> comp = (pm1, pm2) -> { + int rank1 = ranking.apply(pm1.get(0).vertex); + int rank2 = ranking.apply(pm2.get(0).vertex); + int difference = rank1 - rank2; + if (difference != 0) { + return difference; + } + int index1 = phraseMatches.indexOf(pm1); + int index2 = phraseMatches.indexOf(pm2); + return index1 - index2; + }; + + List> phraseMatchesGraphOrder = new ArrayList<>(phraseMatches); + Collections.sort(phraseMatchesGraphOrder, comp); + + // Map 1 + phraseMatchToIndex = new HashMap<>(); + for (int i = 0; i < phraseMatchesGraphOrder.size(); i++) { + phraseMatchToIndex.put(phraseMatchesGraphOrder.get(i), i); + } + + /* + * We calculate the index for all the phrase matches + * First in witness order, then in graph order + */ + List phraseMatchesGraphIndex = new ArrayList<>(); + List phraseMatchesWitnessIndex = new ArrayList<>(); + + for (int i = 0; i < phraseMatches.size(); i++) { + phraseMatchesGraphIndex.add(i); + } + + for (List phraseMatch : phraseMatches) { + phraseMatchesWitnessIndex.add(phraseMatchToIndex.get(phraseMatch)); + } + + /* + * Initialize result variables + */ + List> nonTransposedPhraseMatches = new ArrayList<>(phraseMatches); + List> transpositions = new ArrayList<>(); + + /* + * loop here until the maximum distance == 0 + */ + while (true) { + // Map 2 + final Map, Integer> phraseMatchToDistanceMap = new LinkedHashMap<>(); + for (int i = 0; i < nonTransposedPhraseMatches.size(); i++) { + Integer graphIndex = phraseMatchesGraphIndex.get(i); + Integer witnessIndex = phraseMatchesWitnessIndex.get(i); + Integer distance = Math.abs(graphIndex - witnessIndex); + List phraseMatch = nonTransposedPhraseMatches.get(i); + phraseMatchToDistanceMap.put(phraseMatch, distance); + } + + List distanceList = new ArrayList<>(phraseMatchToDistanceMap.values()); + + if (distanceList.isEmpty() || Collections.max(distanceList) == 0) { + break; + } + + // sort phrase matches on 
distance, size + // TODO: order by 3) graph rank? + // TODO: I have not yet found evidence/a use case that + // TODO: indicates that it is needed. + Comparator> comp2 = (pm1, pm2) -> { + // first order by distance + int distance1 = phraseMatchToDistanceMap.get(pm1); + int distance2 = phraseMatchToDistanceMap.get(pm2); + int difference = distance2 - distance1; + if (difference != 0) { + return difference; + } + // second order by size + // return pm1.size() - pm2.size(); + return determineSize(pm1) - determineSize(pm2); + }; + + List> sortedPhraseMatches = new ArrayList<>(nonTransposedPhraseMatches); + Collections.sort(sortedPhraseMatches, comp2); + + List transposedPhrase = sortedPhraseMatches.remove(0); + + Integer transposedIndex = phraseMatchToIndex.get(transposedPhrase); + Integer graphIndex = phraseMatchesGraphIndex.indexOf(transposedIndex); + Integer transposedWithIndex = phraseMatchesWitnessIndex.get(graphIndex); + List linkedTransposedPhrase = phraseMatchesGraphOrder.get(transposedWithIndex); + + addTransposition(phraseMatchesWitnessIndex, phraseMatchesGraphIndex, nonTransposedPhraseMatches, transpositions, transposedPhrase); + + Integer distance = phraseMatchToDistanceMap.get(transposedPhrase); + if (distance == phraseMatchToDistanceMap.get(linkedTransposedPhrase) && distance > 1) { + addTransposition(phraseMatchesWitnessIndex, phraseMatchesGraphIndex, nonTransposedPhraseMatches, transpositions, linkedTransposedPhrase); + } + } + return transpositions; + } + + private void addTransposition(List phraseWitnessRanks, List phraseGraphRanks, List> nonTransposedPhraseMatches, List> transpositions, List transposedPhrase) { + Integer indexToRemove = phraseMatchToIndex.get(transposedPhrase); + nonTransposedPhraseMatches.remove(transposedPhrase); + transpositions.add(transposedPhrase); + phraseGraphRanks.remove(indexToRemove); + phraseWitnessRanks.remove(indexToRemove); + } + + private VariantGraphRanking rankTheGraph(List> phraseMatches, VariantGraph base) { + // 
rank the variant graph + Set matchedVertices = new HashSet<>(); + for (List phraseMatch : phraseMatches) { + matchedVertices.add(phraseMatch.get(0).vertex); + } + final VariantGraphRanking ranking = VariantGraphRanking.ofOnlyCertainVertices(base, matchedVertices); + return ranking; + } + + /* + * in case of an a, b / b, a transposition we have to determine whether a or b + * stays put. the phrase with the most character stays still if the tokens are + * not simple tokens the phrase with the most tokens stays put + */ + private int determineSize(List t) { + Match firstMatch = t.get(0); + if (!(firstMatch.token instanceof SimpleToken)) { + return t.size(); + } + int charLength = 0; + for (Match m : t) { + SimpleToken token = (SimpleToken) m.token; + charLength += token.getNormalized().length(); + } + return charLength; + } +} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/dekker/Tuple.java b/collatex-core/src/main/java/eu/interedition/collatex/dekker/Tuple.java index 0bbc8c528..f80a88ded 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/dekker/Tuple.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/dekker/Tuple.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -19,36 +19,32 @@ package eu.interedition.collatex.dekker; -import com.google.common.collect.Sets; - -import java.util.Set; +import java.util.Objects; /** - * @author Gregor Middell + * @author Gregor Middell */ public class Tuple { - public final T left; - public final T right; - - private final Set set; + public final T left; + public final T right; - public Tuple(T left, T right) { - this.left = left; - this.right = right; - this.set = Sets.newHashSet(left, right); - } + public Tuple(T left, T right) { + this.left = left; + this.right = right; + } - @Override - public boolean equals(Object obj) { - if (obj != null && obj instanceof Tuple) { - return set.equals(((Tuple) obj).set); + @Override + public boolean equals(Object obj) { + if (obj != null && obj instanceof Tuple) { + final Tuple other = (Tuple) obj; + return (left.equals(other.left) || left.equals(other.right)) && (right.equals(other.right) || right.equals(other.left)); + } + return super.equals(obj); } - return super.equals(obj); - } - @Override - public int hashCode() { - return set.hashCode(); - } + @Override + public int hashCode() { + return Objects.hash(left, right); + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/Archipelago.java b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/Archipelago.java index 884f62a11..636ea8fbb 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/Archipelago.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/Archipelago.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -20,163 +20,161 @@ package eu.interedition.collatex.dekker.matrix; import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Set; import java.util.logging.Logger; -import com.google.common.base.Objects; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; - /* * @author Meindert Kroese * @author Bram Buitendijk * @author Ronald Haentjens Dekker */ public class Archipelago { - Logger LOG = Logger.getLogger(Archipelago.class.getName()); - - private final List islands; - private final Set islandvectors; - - public Archipelago() { - islands = new ArrayList(); - this.islandvectors = Sets.newHashSet(); // row - column, all islands should have direction 1, so this diff should be the same for all coordinates on the island. - } - - //copy constructor - public Archipelago(Archipelago orig) { - this.islands = Lists.newArrayList(orig.islands); - this.islandvectors = Sets.newHashSet(orig.islandvectors); - } - - public Archipelago(Island isl) { - this(); - islands.add(isl); - } - - public void add(Island island) { - islands.add(island); - Coordinate leftEnd = island.getLeftEnd(); - islandvectors.add(leftEnd.row - leftEnd.column); - } - - public int size() { - return islands.size(); - } - - public Island get(int i) { - return islands.get(i); - } - - public boolean containsCoordinate(int row, int column) { - return Objects.equal(getCoordinatesMap().get(row), column); - } - - public List getIslands() { - return islands; - } - - protected void remove(int i) { - islands.remove(i); - } - - @Override - public String toString() { - String result = ""; - for (Island island : getIslands()) { - if (result.isEmpty()) - result = "[ " + island; - else - result += ", " + island; - } - result += " ]"; - return result; - } - - @Override - public int hashCode() { - return Objects.hashCode(islands); - } - - 
@Override - public boolean equals(Object object) { - if (object == null) return false; - if (object.getClass() != this.getClass()) return false; - if (((Archipelago) object).size() != this.size()) return false; - for (int i = 0; i < size(); i++) { - if (!((Archipelago) object).get(i).equals(get(i))) return false; - } - return true; - } - - private Map getCoordinatesMap() { - final Map map = Maps.newHashMap(); - for (final Island isl : islands) { - for (final Coordinate c : isl) { - map.put(c.getRow(), c.getColumn()); - } - } - return map; - } - - private double distance(Island isl1, Island isl2) { - double result = 0.0; - int isl1_L_x = isl1.getLeftEnd().column; - int isl1_L_y = isl1.getLeftEnd().row; - int isl1_R_x = isl1.getRightEnd().column; - int isl1_R_y = isl1.getRightEnd().row; - int isl2_L_x = isl2.getLeftEnd().column; - int isl2_L_y = isl2.getLeftEnd().row; - int isl2_R_x = isl2.getRightEnd().column; - int isl2_R_y = isl2.getRightEnd().row; - result = distance(isl1_L_x, isl1_L_y, isl2_L_x, isl2_L_y); - double d = distance(isl1_L_x, isl1_L_y, isl2_R_x, isl2_R_y); - if (d < result) result = d; - d = distance(isl1_R_x, isl1_R_y, isl2_L_x, isl2_L_y); - if (d < result) result = d; - d = distance(isl1_R_x, isl1_R_y, isl2_R_x, isl2_R_y); - if (d < result) result = d; - return result; - } - - private double distance(int a_x, int a_y, int b_x, int b_y) { - double result = 0.0; - result = Math.sqrt((a_x - b_x) * (a_x - b_x) + (a_y - b_y) * (a_y - b_y)); - return result; - } - - public Set getIslandVectors() { - return islandvectors; - } - - public double smallestDistance(Island isl) { - double minimum = 10000; - for (Island fixedIsland : getIslands()) { - minimum = Math.min(minimum, distance(isl, fixedIsland)); - } - return minimum; - } - - public double smallestDistanceToIdealLine(Island isl) { - double minimum = 10000; - Island closestIsland = null; - for (Island fixedIsland : getIslands()) { - double prev = minimum; - minimum = Math.min(minimum, distance(isl, 
fixedIsland)); - if (prev > minimum) { - closestIsland = fixedIsland; - } - } - if (closestIsland == null) { - return minimum; - } - Coordinate leftEnd = isl.getLeftEnd(); - int islandVector = leftEnd.row - leftEnd.column; - Coordinate leftEnd0 = closestIsland.getLeftEnd(); - int closestIslandVector = leftEnd0.row - leftEnd0.column; - return Math.abs(islandVector - closestIslandVector); - } + Logger LOG = Logger.getLogger(Archipelago.class.getName()); + + private final List islands; + private final Set islandvectors; + + public Archipelago() { + islands = new ArrayList<>(); + this.islandvectors = new HashSet<>(); // row - column, all islands should have direction 1, so this diff should be the same for all coordinates on the island. + } + + //copy constructor + public Archipelago(Archipelago orig) { + this.islands = new ArrayList<>(orig.islands); + this.islandvectors = new HashSet<>(orig.islandvectors); + } + + public Archipelago(Island isl) { + this(); + islands.add(isl); + } + + public void add(Island island) { + islands.add(island); + Coordinate leftEnd = island.getLeftEnd(); + islandvectors.add(leftEnd.row - leftEnd.column); + } + + public int size() { + return islands.size(); + } + + public Island get(int i) { + return islands.get(i); + } + + public boolean containsCoordinate(int row, int column) { + return Objects.equals(getCoordinatesMap().get(row), column); + } + + public List getIslands() { + return islands; + } + + protected void remove(int i) { + islands.remove(i); + } + + @Override + public String toString() { + String result = ""; + for (Island island : getIslands()) { + if (result.isEmpty()) + result = "[ " + island; + else + result += ", " + island; + } + result += " ]"; + return result; + } + + @Override + public int hashCode() { + return Objects.hashCode(islands); + } + + @Override + public boolean equals(Object object) { + if (object == null) return false; + if (object.getClass() != this.getClass()) return false; + if (((Archipelago) object).size() 
!= this.size()) return false; + for (int i = 0; i < size(); i++) { + if (!((Archipelago) object).get(i).equals(get(i))) return false; + } + return true; + } + + private Map getCoordinatesMap() { + final Map map = new HashMap<>(); + for (final Island isl : islands) { + for (final Coordinate c : isl) { + map.put(c.getRow(), c.getColumn()); + } + } + return map; + } + + private double distance(Island isl1, Island isl2) { + double result = 0.0; + int isl1_L_x = isl1.getLeftEnd().column; + int isl1_L_y = isl1.getLeftEnd().row; + int isl1_R_x = isl1.getRightEnd().column; + int isl1_R_y = isl1.getRightEnd().row; + int isl2_L_x = isl2.getLeftEnd().column; + int isl2_L_y = isl2.getLeftEnd().row; + int isl2_R_x = isl2.getRightEnd().column; + int isl2_R_y = isl2.getRightEnd().row; + result = distance(isl1_L_x, isl1_L_y, isl2_L_x, isl2_L_y); + double d = distance(isl1_L_x, isl1_L_y, isl2_R_x, isl2_R_y); + if (d < result) result = d; + d = distance(isl1_R_x, isl1_R_y, isl2_L_x, isl2_L_y); + if (d < result) result = d; + d = distance(isl1_R_x, isl1_R_y, isl2_R_x, isl2_R_y); + if (d < result) result = d; + return result; + } + + private double distance(int a_x, int a_y, int b_x, int b_y) { + double result = 0.0; + result = Math.sqrt((a_x - b_x) * (a_x - b_x) + (a_y - b_y) * (a_y - b_y)); + return result; + } + + public Set getIslandVectors() { + return islandvectors; + } + + public double smallestDistance(Island isl) { + double minimum = 10000; + for (Island fixedIsland : getIslands()) { + minimum = Math.min(minimum, distance(isl, fixedIsland)); + } + return minimum; + } + + public double smallestDistanceToIdealLine(Island isl) { + double minimum = 10000; + Island closestIsland = null; + for (Island fixedIsland : getIslands()) { + double prev = minimum; + minimum = Math.min(minimum, distance(isl, fixedIsland)); + if (prev > minimum) { + closestIsland = fixedIsland; + } + } + if (closestIsland == null) { + return minimum; + } + Coordinate leftEnd = isl.getLeftEnd(); + int 
islandVector = leftEnd.row - leftEnd.column; + Coordinate leftEnd0 = closestIsland.getLeftEnd(); + int closestIslandVector = leftEnd0.row - leftEnd0.column; + return Math.abs(islandVector - closestIslandVector); + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/Coordinate.java b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/Coordinate.java index 4dadbbd2d..56da8d35b 100755 --- a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/Coordinate.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/Coordinate.java @@ -1,81 +1,81 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . 
- */ - -package eu.interedition.collatex.dekker.matrix; - -import com.google.common.base.Objects; - -public class Coordinate implements Comparable { - int row; - int column; - - public Coordinate(int row, int column) { - this.column = column; - this.row = row; - } - - Coordinate(Coordinate other) { - this(other.row, other.column); - } - - public int getRow() { - return row; - } - - public int getColumn() { - return column; - } - - public boolean sameColumn(Coordinate c) { - return c.column == column; - } - - public boolean sameRow(Coordinate c) { - return c.row == row; - } - - public boolean bordersOn(Coordinate c) { - return (Math.abs(this.row - c.getRow()) == 1) && (Math.abs(this.column - c.getColumn()) == 1); - } - - @Override - public boolean equals(Object o) { - if (o != null & o instanceof Coordinate) { - final Coordinate c = (Coordinate) o; - return (this.row == c.getRow() && this.column == c.getColumn()); - } - return super.equals(o); - } - - @Override - public int hashCode() { - return Objects.hashCode(row, column); - } - - @Override - public int compareTo(Coordinate o) { - final int result = column - o.column; - return (result == 0 ? row - o.row : result); - } - - @Override - public String toString() { - return "(" + row + "," + column + ")"; - } -} +/* + * Copyright (c) 2015 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . 
+ */ + +package eu.interedition.collatex.dekker.matrix; + +import java.util.Objects; + +public class Coordinate implements Comparable { + int row; + int column; + + public Coordinate(int row, int column) { + this.column = column; + this.row = row; + } + + Coordinate(Coordinate other) { + this(other.row, other.column); + } + + public int getRow() { + return row; + } + + public int getColumn() { + return column; + } + + public boolean sameColumn(Coordinate c) { + return c.column == column; + } + + public boolean sameRow(Coordinate c) { + return c.row == row; + } + + public boolean bordersOn(Coordinate c) { + return (Math.abs(this.row - c.getRow()) == 1) && (Math.abs(this.column - c.getColumn()) == 1); + } + + @Override + public boolean equals(Object o) { + if (o != null & o instanceof Coordinate) { + final Coordinate c = (Coordinate) o; + return (this.row == c.getRow() && this.column == c.getColumn()); + } + return super.equals(o); + } + + @Override + public int hashCode() { + return Objects.hash(row, column); + } + + @Override + public int compareTo(Coordinate o) { + final int result = column - o.column; + return (result == 0 ? row - o.row : result); + } + + @Override + public String toString() { + return "(" + row + "," + column + ")"; + } +} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/Island.java b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/Island.java index 9502ddf12..62b3b248b 100755 --- a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/Island.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/Island.java @@ -1,235 +1,233 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. 
- * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . - */ - -package eu.interedition.collatex.dekker.matrix; - -import java.text.MessageFormat; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; - -import com.google.common.base.Objects; -import com.google.common.collect.Lists; - -/** - * A DirectedIsland is a collections of Coordinates all on the same - * diagonal. The direction of this diagonal can be -1, 0, or 1. - * The zero is for a DirectedIsland of only one Coordinate. - * Directions 1 and -1 examples - * Coordinates (0,0) (1,1) have Direction 1 - * Coordinates (1,1) (2,1) have Direction -1 - * I.e. 
if the row-coordinate gets larger and the col-coordinate also, the - * direction is 1 (positive) else it is -1 (negative) - */ -public class Island implements Iterable, Comparable { - - private int direction = 0; - private final List islandCoordinates = Lists.newArrayList(); - - public Island() {} - - public Island(Island other) { - for (Coordinate c : other.islandCoordinates) { - add(new Coordinate(c)); - } - } - - public Island(Coordinate first, Coordinate last) { - add(first); - Coordinate newCoordinate = first; - while (!newCoordinate.equals(last)) { - newCoordinate = new Coordinate(newCoordinate.getRow() + 1, newCoordinate.getColumn() + 1); - // LOG.debug("{}", newCoordinate); - add(newCoordinate); - } - } - - public boolean add(Coordinate coordinate) { - boolean result = false; - if (islandCoordinates.isEmpty()) { - result = islandCoordinates.add(coordinate); - } else if (!contains(coordinate) && neighbour(coordinate)) { - if (direction == 0) { - Coordinate existing = islandCoordinates.get(0); - direction = (existing.row - coordinate.row) / (existing.column - coordinate.column); - result = islandCoordinates.add(coordinate); - } else { - Coordinate existing = islandCoordinates.get(0); - if (existing.column != coordinate.column) { - int new_direction = (existing.row - coordinate.row) / (existing.column - coordinate.column); - if (new_direction == direction) result = islandCoordinates.add(coordinate); - } - } - } - return result; - } - - public int direction() { - return direction; - } - - public Island removePoints(Island di) { - Island result = new Island(this); - for (Coordinate c : di) { - result.removeSameColOrRow(c); - } - return result; - } - - public void removeCoordinate(Coordinate c) { - islandCoordinates.remove(c); - } - - public Coordinate getCoorOnRow(int row) { - for (Coordinate coor : islandCoordinates) { - if (coor.getRow() == row) return coor; - } - return null; - } - - public Coordinate getCoorOnCol(int col) { - for (Coordinate coor : 
islandCoordinates) { - if (coor.getColumn() == col) return coor; - } - return null; - } - - public void merge(Island di) { - for (Coordinate c : di) { - add(c); - } - } - - /** - * Two islands are competitors if there is a horizontal or - * vertical line which goes through both islands - */ - public boolean isCompetitor(Island isl) { - for (Coordinate c : isl) { - for (Coordinate d : islandCoordinates) { - if (c.sameColumn(d) || c.sameRow(d)) return true; - } - } - return false; - } - - public boolean contains(Coordinate c) { - return islandCoordinates.contains(c); - } - - public boolean neighbour(Coordinate c) { - if (contains(c)) return false; - for (Coordinate islC : islandCoordinates) { - if (c.bordersOn(islC)) { - return true; - } - } - return false; - } - - public Coordinate getLeftEnd() { - Coordinate coor = islandCoordinates.get(0); - for (Coordinate c : islandCoordinates) { - if (c.column < coor.column) coor = c; - } - return coor; - } - - public Coordinate getRightEnd() { - Coordinate coor = islandCoordinates.get(0); - for (Coordinate c : islandCoordinates) { - if (c.column > coor.column) coor = c; - } - return coor; - } - - public boolean overlap(Island isl) { - for (Coordinate c : isl) { - if (contains(c) || neighbour(c)) return true; - } - return false; - } - - public int size() { - return islandCoordinates.size(); - } - - public void clear() { - islandCoordinates.clear(); - } - - public int value() { - final int size = size(); - return (size < 2 ? 
size : direction + size * size); - } - - protected boolean removeSameColOrRow(Coordinate c) { - ArrayList remove = new ArrayList(); - for (Coordinate coor : islandCoordinates) { - if (coor.sameColumn(c) || coor.sameRow(c)) { - remove.add(coor); - } - } - if (remove.isEmpty()) return false; - for (Coordinate coor : remove) { - islandCoordinates.remove(coor); - } - return true; - } - - @Override - public Iterator iterator() { - return Collections.unmodifiableList(islandCoordinates).iterator(); - } - - @Override - public int hashCode() { - return Objects.hashCode(islandCoordinates); - } - - @Override - public boolean equals(Object obj) { - if (obj == null) return false; - - if (!obj.getClass().equals(Island.class)) return false; - - Island isl = (Island) obj; - if (isl.size() != size()) return false; - - boolean result = true; - for (Coordinate c : isl) { - result &= this.contains(c); - } - return result; - } - - @Override - public String toString() { - if (islandCoordinates.isEmpty()) { - throw new RuntimeException("Unexpected situation: island coordinates empty!"); - } - return MessageFormat.format("Island ({0}-{1}) size: {2} direction: {3}", islandCoordinates.get(0), islandCoordinates.get(islandCoordinates.size() - 1), size(), direction()); - } - - @Override - public int compareTo(Island i) { - return this.getLeftEnd().compareTo(i.getLeftEnd()); - } -} +/* + * Copyright (c) 2015 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + +package eu.interedition.collatex.dekker.matrix; + +import java.text.MessageFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +/** + * A DirectedIsland is a collections of Coordinates all on the same + * diagonal. The direction of this diagonal can be -1, 0, or 1. + * The zero is for a DirectedIsland of only one Coordinate. + * Directions 1 and -1 examples + * Coordinates (0,0) (1,1) have Direction 1 + * Coordinates (1,1) (2,1) have Direction -1 + * I.e. if the row-coordinate gets larger and the col-coordinate also, the + * direction is 1 (positive) else it is -1 (negative) + */ +public class Island implements Iterable, Comparable { + + private int direction = 0; + private final List islandCoordinates = new ArrayList<>(); + + public Island() { + } + + public Island(Island other) { + for (Coordinate c : other.islandCoordinates) { + add(new Coordinate(c)); + } + } + + public Island(Coordinate first, Coordinate last) { + add(first); + Coordinate newCoordinate = first; + while (!newCoordinate.equals(last)) { + newCoordinate = new Coordinate(newCoordinate.getRow() + 1, newCoordinate.getColumn() + 1); + // LOG.debug("{}", newCoordinate); + add(newCoordinate); + } + } + + public boolean add(Coordinate coordinate) { + boolean result = false; + if (islandCoordinates.isEmpty()) { + result = islandCoordinates.add(coordinate); + } else if (!contains(coordinate) && neighbour(coordinate)) { + if (direction == 0) { + Coordinate existing = islandCoordinates.get(0); + direction = (existing.row - coordinate.row) / (existing.column - coordinate.column); + result = islandCoordinates.add(coordinate); + } else { + Coordinate existing = islandCoordinates.get(0); + if (existing.column != coordinate.column) { + int new_direction = (existing.row - coordinate.row) / (existing.column - 
coordinate.column); + if (new_direction == direction) result = islandCoordinates.add(coordinate); + } + } + } + return result; + } + + public int direction() { + return direction; + } + + public Island removePoints(Island di) { + Island result = new Island(this); + for (Coordinate c : di) { + result.removeSameColOrRow(c); + } + return result; + } + + public void removeCoordinate(Coordinate c) { + islandCoordinates.remove(c); + } + + public Coordinate getCoorOnRow(int row) { + for (Coordinate coor : islandCoordinates) { + if (coor.getRow() == row) return coor; + } + return null; + } + + public Coordinate getCoorOnCol(int col) { + for (Coordinate coor : islandCoordinates) { + if (coor.getColumn() == col) return coor; + } + return null; + } + + public void merge(Island di) { + for (Coordinate c : di) { + add(c); + } + } + + /** + * Two islands are competitors if there is a horizontal or + * vertical line which goes through both islands + */ + public boolean isCompetitor(Island isl) { + for (Coordinate c : isl) { + for (Coordinate d : islandCoordinates) { + if (c.sameColumn(d) || c.sameRow(d)) return true; + } + } + return false; + } + + public boolean contains(Coordinate c) { + return islandCoordinates.contains(c); + } + + public boolean neighbour(Coordinate c) { + if (contains(c)) return false; + for (Coordinate islC : islandCoordinates) { + if (c.bordersOn(islC)) { + return true; + } + } + return false; + } + + public Coordinate getLeftEnd() { + Coordinate coor = islandCoordinates.get(0); + for (Coordinate c : islandCoordinates) { + if (c.column < coor.column) coor = c; + } + return coor; + } + + public Coordinate getRightEnd() { + Coordinate coor = islandCoordinates.get(0); + for (Coordinate c : islandCoordinates) { + if (c.column > coor.column) coor = c; + } + return coor; + } + + public boolean overlap(Island isl) { + for (Coordinate c : isl) { + if (contains(c) || neighbour(c)) return true; + } + return false; + } + + public int size() { + return 
islandCoordinates.size(); + } + + public void clear() { + islandCoordinates.clear(); + } + + public int value() { + final int size = size(); + return (size < 2 ? size : direction + size * size); + } + + protected boolean removeSameColOrRow(Coordinate c) { + ArrayList remove = new ArrayList<>(); + for (Coordinate coor : islandCoordinates) { + if (coor.sameColumn(c) || coor.sameRow(c)) { + remove.add(coor); + } + } + if (remove.isEmpty()) return false; + for (Coordinate coor : remove) { + islandCoordinates.remove(coor); + } + return true; + } + + @Override + public Iterator iterator() { + return Collections.unmodifiableList(islandCoordinates).iterator(); + } + + @Override + public int hashCode() { + return islandCoordinates.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (obj == null) return false; + + if (!obj.getClass().equals(Island.class)) return false; + + Island isl = (Island) obj; + if (isl.size() != size()) return false; + + boolean result = true; + for (Coordinate c : isl) { + result &= this.contains(c); + } + return result; + } + + @Override + public String toString() { + if (islandCoordinates.isEmpty()) { + throw new RuntimeException("Unexpected situation: island coordinates empty!"); + } + return MessageFormat.format("Island ({0}-{1}) size: {2} direction: {3}", islandCoordinates.get(0), islandCoordinates.get(islandCoordinates.size() - 1), size(), direction()); + } + + @Override + public int compareTo(Island i) { + return this.getLeftEnd().compareTo(i.getLeftEnd()); + } +} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/IslandCompetition.java b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/IslandCompetition.java index bbe71b33d..9c18fb9b6 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/IslandCompetition.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/IslandCompetition.java @@ -1,6 +1,25 @@ +/* + * Copyright (c) 2015 
The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + package eu.interedition.collatex.dekker.matrix; public enum IslandCompetition { - CompetingIslandAndOnIdealIine, CompetingIsland, NonCompetingIsland + CompetingIslandAndOnIdealIine, CompetingIsland, NonCompetingIsland } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/IslandConflictResolver.java b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/IslandConflictResolver.java index 3291cbc44..a18304f0d 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/IslandConflictResolver.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/IslandConflictResolver.java @@ -1,168 +1,155 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . - */ - -package eu.interedition.collatex.dekker.matrix; - -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Set; -import java.util.logging.Logger; - -import com.google.common.collect.ArrayListMultimap; -import com.google.common.collect.Lists; -import com.google.common.collect.Multimap; -import com.google.common.collect.Sets; - -/** - * - * @author Ronald Haentjens Dekker - * @author Bram Buitendijk - * @author Meindert Kroese - */ -public class IslandConflictResolver { - Logger LOG = Logger.getLogger(IslandConflictResolver.class.getName()); - // fixed islands contains all the islands that are selected for the final alignment - private final MatchTableSelection selection; - - //NOTE: outlierTranspositionLimit is ignored for now - public IslandConflictResolver(MatchTable table, int outlierTranspositionsSizeLimit) { - selection = new MatchTableSelection(table); - } - - /* - * Create a non-conflicting version by simply taken all the islands that do - * not conflict with each other, largest first. - */ - public MatchTableSelection createNonConflictingVersion() { - List possibleIslands; - do { - possibleIslands = selection.getPossibleIslands(); - // check the possible islands of a certain size against each other. - if (possibleIslands.size() == 1) { - selection.addIsland(possibleIslands.get(0)); - } else if (possibleIslands.size() > 1) { - Multimap analysis = analyzeConflictsBetweenPossibleIslands(possibleIslands); - resolveConflictsBySelectingPreferredIslands(selection, analysis); - } - } - while (!possibleIslands.isEmpty()); - return selection; - } - - /* - * This method analyzes the relationship between all the islands of the same - * size that have yet to be selected. They can compete with one another - * (choosing one locks out the other), some of them can be on the ideal line. 
- * - * Parameters: the size of the islands that you want to analyze - */ - public Multimap analyzeConflictsBetweenPossibleIslands(List possibleIslands) { - Multimap conflictMap = ArrayListMultimap.create(); - Set competingIslands = getCompetingIslands(possibleIslands); - for (Island island : competingIslands) { - if (selection.doesCandidateLayOnVectorOfCommittedIsland(island)) { - conflictMap.put(IslandCompetition.CompetingIslandAndOnIdealIine, island); - } else { - conflictMap.put(IslandCompetition.CompetingIsland, island); - } - } - for (Island island : getNonCompetingIslands(possibleIslands, competingIslands)) { - conflictMap.put(IslandCompetition.NonCompetingIsland, island); - } - return conflictMap; - } - - /* - * The preferred Islands are directly added to the result Archipelago - * If we want to - * re-factor this into a pull construction rather then a push construction - * we have to move this code out of this method and move it to the caller - * class - */ - private void resolveConflictsBySelectingPreferredIslands(MatchTableSelection selection, Multimap islandConflictMap) { - // First select competing islands that are on the ideal line - Multimap distanceMap1 = makeDistanceMap(islandConflictMap.get(IslandCompetition.CompetingIslandAndOnIdealIine)); - LOG.fine("addBestOfCompeting with competingIslandsOnIdealLine"); - addBestOfCompeting(selection, distanceMap1); - - // Second select other competing islands - Multimap distanceMap2 = makeDistanceMap(islandConflictMap.get(IslandCompetition.CompetingIsland)); - LOG.fine("addBestOfCompeting with otherCompetingIslands"); - addBestOfCompeting(selection, distanceMap2); - - // Third select non competing islands - LOG.fine("add non competing islands"); - for (Island i : islandConflictMap.get(IslandCompetition.NonCompetingIsland)) { - selection.addIsland(i); - } - } - - private void addBestOfCompeting(MatchTableSelection selection, Multimap distanceMap1) { - for (Double d : shortestToLongestDistances(distanceMap1)) { 
- for (Island ci : distanceMap1.get(d)) { - if (selection.isIslandPossibleCandidate(ci)) { - selection.addIsland(ci); - } - } - } - } - - // TODO: This method calculates the distance from the ideal line - // TODO: by calculating the ratio x/y. - // TODO: but the ideal line may have moved (due to additions/deletions). - private Multimap makeDistanceMap(Collection competingIslands) { - Multimap distanceMap = ArrayListMultimap.create(); - for (Island isl : competingIslands) { - Coordinate leftEnd = isl.getLeftEnd(); - double ratio = ((leftEnd.column+1) / (double) (leftEnd.row+1)); - double b2 = Math.log(ratio)/Math.log(2); - double distanceToIdealLine = Math.abs(b2); - distanceMap.put(distanceToIdealLine, isl); - } - return distanceMap; - } - - private List shortestToLongestDistances(Multimap distanceMap) { - List distances = Lists.newArrayList(distanceMap.keySet()); - Collections.sort(distances); - return distances; - } - - private Set getNonCompetingIslands(List islands, Set competingIslands) { - Set nonCompetingIslands = Sets.newHashSet(islands); - nonCompetingIslands.removeAll(competingIslands); - return nonCompetingIslands; - } - - private Set getCompetingIslands(List islands) { - Set competingIslands = Sets.newHashSet(); - for (int i = 0; i < islands.size(); i++) { - Island i1 = islands.get(i); - for (int j = 1; j < islands.size() - i; j++) { - Island i2 = islands.get(i + j); - if (i1.isCompetitor(i2)) { - competingIslands.add(i1); - competingIslands.add(i2); - } - } - } - return competingIslands; - } +/* + * Copyright (c) 2015 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + +package eu.interedition.collatex.dekker.matrix; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.logging.Logger; + +/** + * @author Ronald Haentjens Dekker + * @author Bram Buitendijk + * @author Meindert Kroese + */ +public class IslandConflictResolver { + Logger LOG = Logger.getLogger(IslandConflictResolver.class.getName()); + // fixed islands contains all the islands that are selected for the final alignment + private final MatchTableSelection selection; + + //NOTE: outlierTranspositionLimit is ignored for now + public IslandConflictResolver(MatchTable table) { + selection = new MatchTableSelection(table); + } + + /* + * Create a non-conflicting version by simply taken all the islands that do + * not conflict with each other, largest first. + */ + public MatchTableSelection createNonConflictingVersion() { + List possibleIslands; + do { + possibleIslands = selection.getPossibleIslands(); + // check the possible islands of a certain size against each other. 
+ if (possibleIslands.size() == 1) { + selection.addIsland(possibleIslands.get(0)); + } else if (possibleIslands.size() > 1) { + Map> analysis = analyzeConflictsBetweenPossibleIslands(possibleIslands); + resolveConflictsBySelectingPreferredIslands(selection, analysis); + } + } + while (!possibleIslands.isEmpty()); + return selection; + } + + /* + * This method analyzes the relationship between all the islands of the same + * size that have yet to be selected. They can compete with one another + * (choosing one locks out the other), some of them can be on the ideal line. + * + * Parameters: the size of the islands that you want to analyze + */ + public Map> analyzeConflictsBetweenPossibleIslands(List possibleIslands) { + Map> conflictMap = new HashMap<>(); + Set competingIslands = getCompetingIslands(possibleIslands); + for (Island island : competingIslands) { + if (selection.doesCandidateLayOnVectorOfCommittedIsland(island)) { + conflictMap.computeIfAbsent(IslandCompetition.CompetingIslandAndOnIdealIine, c -> new ArrayList<>()).add(island); + } else { + conflictMap.computeIfAbsent(IslandCompetition.CompetingIsland, c -> new ArrayList<>()).add(island); + } + } + for (Island island : getNonCompetingIslands(possibleIslands, competingIslands)) { + conflictMap.computeIfAbsent(IslandCompetition.NonCompetingIsland, c -> new ArrayList<>()).add(island); + } + return conflictMap; + } + + /* + * The preferred Islands are directly added to the result Archipelago + * If we want to + * re-factor this into a pull construction rather then a push construction + * we have to move this code out of this method and move it to the caller + * class + */ + private void resolveConflictsBySelectingPreferredIslands(MatchTableSelection selection, Map> islandConflictMap) { + // First select competing islands that are on the ideal line + LOG.fine("addBestOfCompeting with competingIslandsOnIdealLine"); + 
makeDistanceMap(islandConflictMap.getOrDefault(IslandCompetition.CompetingIslandAndOnIdealIine, Collections.emptyList())) + .values().stream() + .flatMap(List::stream).filter(ci1 -> selection.isIslandPossibleCandidate(ci1)) + .forEach(selection::addIsland); + + // Second select other competing islands + LOG.fine("addBestOfCompeting with otherCompetingIslands"); + makeDistanceMap(islandConflictMap.getOrDefault(IslandCompetition.CompetingIsland, Collections.emptyList())) + .values().stream() + .flatMap(List::stream).filter(ci -> selection.isIslandPossibleCandidate(ci)) + .forEach(selection::addIsland); + + // Third select non competing islands + LOG.fine("add non competing islands"); + islandConflictMap.getOrDefault(IslandCompetition.NonCompetingIsland, Collections.emptyList()) + .forEach(selection::addIsland); + } + + // TODO: This method calculates the distance from the ideal line + // TODO: by calculating the ratio x/y. + // TODO: but the ideal line may have moved (due to additions/deletions). 
+ private SortedMap> makeDistanceMap(Collection competingIslands) { + SortedMap> distanceMap = new TreeMap<>(); + for (Island isl : competingIslands) { + Coordinate leftEnd = isl.getLeftEnd(); + double ratio = ((leftEnd.column + 1) / (double) (leftEnd.row + 1)); + double b2 = Math.log(ratio) / Math.log(2); + double distanceToIdealLine = Math.abs(b2); + distanceMap.computeIfAbsent(distanceToIdealLine, d -> new ArrayList<>()).add(isl); + } + return distanceMap; + } + + private Set getNonCompetingIslands(List islands, Set competingIslands) { + Set nonCompetingIslands = new HashSet<>(islands); + nonCompetingIslands.removeAll(competingIslands); + return nonCompetingIslands; + } + + private Set getCompetingIslands(List islands) { + Set competingIslands = new HashSet<>(); + for (int i = 0; i < islands.size(); i++) { + Island i1 = islands.get(i); + for (int j = 1; j < islands.size() - i; j++) { + Island i2 = islands.get(i + j); + if (i1.isCompetitor(i2)) { + competingIslands.add(i1); + competingIslands.add(i2); + } + } + } + return competingIslands; + } } \ No newline at end of file diff --git a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/MatchTable.java b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/MatchTable.java index db77d3bac..23260e2bb 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/MatchTable.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/MatchTable.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -19,26 +19,26 @@ package eu.interedition.collatex.dekker.matrix; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import com.google.common.collect.ContiguousSet; -import com.google.common.collect.DiscreteDomain; -import com.google.common.collect.HashBasedTable; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Range; -import com.google.common.collect.Sets; - import eu.interedition.collatex.Token; import eu.interedition.collatex.VariantGraph; import eu.interedition.collatex.matching.EqualityTokenComparator; import eu.interedition.collatex.matching.Matches; import eu.interedition.collatex.util.VariantGraphRanking; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.StreamSupport; + /* @author: Ronald Haentjens Dekker * * This class represents a table of the matches. @@ -47,153 +47,156 @@ * However the API of this class looks very much like an array based one * since you can use tokenAt(row, column) or vertexAt(row, column). * This class is read only. -* Selections of vectors from the table can be made using the +* Selections of vectors from the table can be made using the * MatchTableSelection class. 
*/ public class MatchTable { - private final HashBasedTable table; - private final Iterable witness; - private final List ranks; - - // assumes default token comparator - public static MatchTable create(VariantGraph graph, Iterable witness) { - Comparator comparator = new EqualityTokenComparator(); - return MatchTable.create(graph, witness, comparator); - } - - public static MatchTable create(VariantGraph graph, Iterable witness, Comparator comparator) { - final VariantGraphRanking ranking = VariantGraphRanking.of(graph); - // step 1: build the MatchTable - MatchTable table = createEmptyTable(ranking, graph, witness); - // step 2: do the matching and fill the table - table.fillTableWithMatches(ranking, graph, witness, comparator); - return table; - } - - public VariantGraph.Vertex vertexAt(int rowIndex, int columnIndex) { - MatchTableCell cell = table.get(rowIndex, columnIndex); - return cell==null ? null : cell.vertex; - } - - public Token tokenAt(int rowIndex, int columnIndex) { - MatchTableCell cell = table.get(rowIndex, columnIndex); - return cell==null ? null : cell.token; - } - - // Warning: this method reiterates the witness! - // This method is only meant for the user interface and serialization classes! - // Use the tokenAt method in all other cases. - public List rowList() { - return Lists.newArrayList(witness); - } - - public List columnList() { - return ranks; - } - - // Since the coordinates in allMatches are ordered from upper left to lower right, - // we don't need to check the lower right neighbor. 
- public Set getIslands() { - Map coordinateMapper = Maps.newHashMap(); - List allMatches = allMatches(); - for (Coordinate c : allMatches) { - // LOG.debug("coordinate {}", c); - addToIslands(coordinateMapper, c); + private final MatchTableCell[][] table; + private final Token[] witness; + private final int[] ranks; + + // assumes default token comparator + public static MatchTable create(VariantGraph graph, Iterable witness) { + Comparator comparator = new EqualityTokenComparator(); + return MatchTable.create(graph, witness, comparator); + } + + public static MatchTable create(VariantGraph graph, Iterable witness, Comparator comparator) { + final VariantGraphRanking ranking = VariantGraphRanking.of(graph); + // step 1: build the MatchTable + MatchTable table = createEmptyTable(ranking, graph, witness); + // step 2: do the matching and fill the table + table.fillTableWithMatches(ranking, graph, witness, comparator); + return table; + } + + private Optional cell(int rowIndex, int columnIndex) { + return Optional.ofNullable(table[rowIndex][columnIndex]); + } + + public VariantGraph.Vertex vertexAt(int rowIndex, int columnIndex) { + return cell(rowIndex, columnIndex).map(c -> c.vertex).orElse(null); + } + + public Token tokenAt(int rowIndex, int columnIndex) { + return cell(rowIndex, columnIndex).map(c -> c.token).orElse(null); + } + + // Warning: this method reiterates the witness! + // This method is only meant for the user interface and serialization classes! + // Use the tokenAt method in all other cases. 
+ public List rowList() { + return Collections.unmodifiableList(Arrays.asList(witness)); } - Set smallestIslandsCoordinates = Sets.newHashSet(allMatches); - smallestIslandsCoordinates.removeAll(coordinateMapper.keySet()); - for (Coordinate coordinate : smallestIslandsCoordinates) { - Island island = new Island(); - island.add(coordinate); - coordinateMapper.put(coordinate, island); + + public List columnList() { + return Arrays.stream(ranks).boxed().collect(Collectors.toList()); } - return Sets.newHashSet(coordinateMapper.values()); - } - - - - private MatchTable(Iterable tokens, List ranks) { - this.table = HashBasedTable.create(); - this.witness = tokens; - this.ranks = ranks; - } - - private static MatchTable createEmptyTable(VariantGraphRanking ranking, VariantGraph graph, Iterable witness) { - // -2 === ignore the start and the end vertex - Range ranksRange = Range.closed(0, Math.max(0, ranking.apply(graph.getEnd()) - 2)); - ImmutableList ranksSet = ContiguousSet.create(ranksRange, DiscreteDomain.integers()).asList(); - return new MatchTable(witness, ranksSet); - } - - // move parameters into fields? - private void fillTableWithMatches(VariantGraphRanking ranking, VariantGraph graph, Iterable witness, Comparator comparator) { - Matches matches = Matches.between(graph.vertices(), witness, comparator); - Set unique = matches.getUnique(); - Set ambiguous = matches.getAmbiguous(); - int rowIndex=0; - for (Token t : witness) { - if (unique.contains(t) || ambiguous.contains(t)) { - List matchingVertices = matches.getAll().get(t); - for (VariantGraph.Vertex vgv : matchingVertices) { - set(rowIndex, ranking.apply(vgv) - 1, t, vgv); + + // Since the coordinates in allMatches are ordered from upper left to lower right, + // we don't need to check the lower right neighbor. 
+ public Set getIslands() { + Map coordinateMapper = new HashMap<>(); + List allMatches = allMatches(); + for (Coordinate c : allMatches) { + // LOG.debug("coordinate {}", c); + addToIslands(coordinateMapper, c); + } + Set smallestIslandsCoordinates = new HashSet<>(allMatches); + smallestIslandsCoordinates.removeAll(coordinateMapper.keySet()); + for (Coordinate coordinate : smallestIslandsCoordinates) { + Island island = new Island(); + island.add(coordinate); + coordinateMapper.put(coordinate, island); } - } - rowIndex++; + return new HashSet<>(coordinateMapper.values()); } - } - - private void set(int rowIndex, int columnIndex, Token token, VariantGraph.Vertex vertex) { - // LOG.debug("putting: {}<->{}<->{}", new Object[] { token, columnIndex, variantGraphVertex }); - MatchTableCell cell = new MatchTableCell(token, vertex); - table.put(rowIndex, columnIndex, cell); - } - - private void addToIslands(Map coordinateMapper, Coordinate c) { - int diff = -1; - Coordinate neighborCoordinate = new Coordinate(c.row + diff, c.column + diff); - VariantGraph.Vertex neighbor = null; - try { - neighbor = vertexAt(c.row + diff, c.column + diff); - } catch (IndexOutOfBoundsException e) {} - if (neighbor != null) { - Island island = coordinateMapper.get(neighborCoordinate); - if (island == null) { - // LOG.debug("new island"); - Island island0 = new Island(); - island0.add(neighborCoordinate); - island0.add(c); - coordinateMapper.put(neighborCoordinate, island0); - coordinateMapper.put(c, island0); - } else { - // LOG.debug("add to existing island"); - island.add(c); - coordinateMapper.put(c, island); - } + + + private MatchTable(Token[] tokens, int[] ranks) { + this.table = new MatchTableCell[tokens.length][ranks.length]; + this.witness = tokens; + this.ranks = ranks; } - } - - // Note: code taken from MatchMatrix class - // TODO: might be simpler to work from the valueSet - // TODO: try remove the call to rowList / columnList - List allMatches() { - List pairs = 
Lists.newArrayList(); - int rows = rowList().size(); - int cols = columnList().size(); - for (int i = 0; i < rows; i++) { - for (int j = 0; j < cols; j++) { - if (vertexAt(i, j) != null) pairs.add(new Coordinate(i, j)); - } + + private static MatchTable createEmptyTable(VariantGraphRanking ranking, VariantGraph graph, Iterable witness) { + // -2 === ignore the start and the end vertex + return new MatchTable( + StreamSupport.stream(witness.spliterator(), false).toArray(Token[]::new), + IntStream.range(0, Math.max(0, ranking.apply(graph.getEnd()) - 1)).toArray() + ); + } + + // move parameters into fields? + private void fillTableWithMatches(VariantGraphRanking ranking, VariantGraph graph, Iterable witness, Comparator comparator) { + Matches matches = Matches.between(graph.vertices(), witness, comparator); + Set unique = matches.uniqueInWitness; + Set ambiguous = matches.ambiguousInWitness; + int rowIndex = 0; + for (Token t : witness) { + if (unique.contains(t) || ambiguous.contains(t)) { + List matchingVertices = matches.allMatches.getOrDefault(t, Collections.emptyList()); + for (VariantGraph.Vertex vgv : matchingVertices) { + set(rowIndex, ranking.apply(vgv) - 1, t, vgv); + } + } + rowIndex++; + } + } + + private void set(int rowIndex, int columnIndex, Token token, VariantGraph.Vertex vertex) { + // LOG.debug("putting: {}<->{}<->{}", new Object[] { token, columnIndex, variantGraphVertex }); + table[rowIndex][columnIndex] = new MatchTableCell(token, vertex); } - return pairs; - } - - private class MatchTableCell { - public final Token token; - public final VariantGraph.Vertex vertex; - - public MatchTableCell(Token token, VariantGraph.Vertex vertex) { - this.token = token; - this.vertex = vertex; + + private void addToIslands(Map coordinateMapper, Coordinate c) { + int diff = -1; + Coordinate neighborCoordinate = new Coordinate(c.row + diff, c.column + diff); + VariantGraph.Vertex neighbor = null; + try { + neighbor = vertexAt(c.row + diff, c.column + diff); + } 
catch (IndexOutOfBoundsException e) { + // ignored + } + if (neighbor != null) { + Island island = coordinateMapper.get(neighborCoordinate); + if (island == null) { + // LOG.debug("new island"); + Island island0 = new Island(); + island0.add(neighborCoordinate); + island0.add(c); + coordinateMapper.put(neighborCoordinate, island0); + coordinateMapper.put(c, island0); + } else { + // LOG.debug("add to existing island"); + island.add(c); + coordinateMapper.put(c, island); + } + } + } + + // Note: code taken from MatchMatrix class + // TODO: might be simpler to work from the valueSet + // TODO: try remove the call to rowList / columnList + List allMatches() { + List pairs = new ArrayList<>(); + int rows = rowList().size(); + int cols = columnList().size(); + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + if (vertexAt(i, j) != null) pairs.add(new Coordinate(i, j)); + } + } + return pairs; + } + + private class MatchTableCell { + public final Token token; + public final VariantGraph.Vertex vertex; + + public MatchTableCell(Token token, VariantGraph.Vertex vertex) { + this.token = token; + this.vertex = vertex; + } } - } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/MatchTableLinker.java b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/MatchTableLinker.java index 7ff961793..9e8e46243 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/MatchTableLinker.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/MatchTableLinker.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -19,51 +19,48 @@ package eu.interedition.collatex.dekker.matrix; +import eu.interedition.collatex.Token; +import eu.interedition.collatex.VariantGraph; +import eu.interedition.collatex.dekker.TokenLinker; + import java.util.Comparator; +import java.util.HashMap; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; -import com.google.common.collect.Maps; +public class MatchTableLinker implements TokenLinker { + static Logger LOG = Logger.getLogger(MatchTableLinker.class.getName()); -import eu.interedition.collatex.Token; -import eu.interedition.collatex.VariantGraph; -import eu.interedition.collatex.dekker.TokenLinker; + public MatchTableLinker() { + super(); + } -public class MatchTableLinker implements TokenLinker { - static Logger LOG = Logger.getLogger(MatchTableLinker.class.getName()); - private final int outlierTranspositionsSizeLimit; + @Override + public Map link(VariantGraph base, Iterable witness, Comparator comparator) { + // create MatchTable and fill it with matches + LOG.fine("create MatchTable and fill it with matches"); + MatchTable table = MatchTable.create(base, witness, comparator); - public MatchTableLinker(int outlierTranspositionsSizeLimit) { - super(); - this.outlierTranspositionsSizeLimit = outlierTranspositionsSizeLimit; - } + // create IslandConflictResolver + LOG.fine("create island conflict resolver"); + IslandConflictResolver resolver = new IslandConflictResolver(table); - @Override - public Map link(VariantGraph base, Iterable witness, Comparator comparator) { - // create MatchTable and fill it with matches - LOG.fine("create MatchTable and fill it with matches"); - MatchTable table = MatchTable.create(base, witness, comparator); + // The IslandConflictResolver createNonConflictingVersion() method + // selects the optimal islands + LOG.fine("select the optimal islands"); + MatchTableSelection preferredIslands = resolver.createNonConflictingVersion(); + if (LOG.isLoggable(Level.FINE)) { + 
LOG.log(Level.FINE, "Number of preferred Islands: {0}", preferredIslands.size()); + } - // create IslandConflictResolver - LOG.fine("create island conflict resolver"); - IslandConflictResolver resolver = new IslandConflictResolver(table, outlierTranspositionsSizeLimit); - - // The IslandConflictResolver createNonConflictingVersion() method - // selects the optimal islands - LOG.fine("select the optimal islands"); - MatchTableSelection preferredIslands = resolver.createNonConflictingVersion(); - if (LOG.isLoggable(Level.FINE)) { - LOG.log(Level.FINE, "Number of preferred Islands: {0}", preferredIslands.size()); - } - - // Here the result is put in a map - Map map = Maps.newHashMap(); - for (Island island : preferredIslands.getIslands()) { - for (Coordinate c : island) { - map.put(table.tokenAt(c.row, c.column), table.vertexAt(c.row, c.column)); - } - } - return map; - } + // Here the result is put in a map + Map map = new HashMap<>(); + for (Island island : preferredIslands.getIslands()) { + for (Coordinate c : island) { + map.put(table.tokenAt(c.row, c.column), table.vertexAt(c.row, c.column)); + } + } + return map; + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/MatchTableSelection.java b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/MatchTableSelection.java index ae46c0630..7a9beb0f6 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/MatchTableSelection.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/MatchTableSelection.java @@ -1,165 +1,190 @@ +/* + * Copyright (c) 2015 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + package eu.interedition.collatex.dekker.matrix; +import eu.interedition.collatex.VariantGraph; + +import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; - -import com.google.common.collect.ArrayListMultimap; -import com.google.common.collect.Lists; -import com.google.common.collect.Multimap; -import com.google.common.collect.Sets; - -import eu.interedition.collatex.VariantGraph; +import java.util.stream.Collectors; // @author: Ronald Haentjens Dekker // Unselected islands reside in the islandMultimap. // Selected islands reside in the fixedIsland Archipelago. 
-// Group the islands together by size; +// Group the islands together by size; // islands may change after commit islands public class MatchTableSelection { - Logger LOG = Logger.getLogger(MatchTableSelection.class.getName()); - private final Multimap islandMultimap; - private final Archipelago fixedIslands; - //this fields are needed for the locking of table cells - private final Set fixedRows; - private final Set fixedVertices; - private final MatchTable table; - - public MatchTableSelection(MatchTable table) { - fixedRows = Sets.newHashSet(); - fixedVertices = Sets.newHashSet(); - this.table = table; - this.fixedIslands = new Archipelago(); - islandMultimap = ArrayListMultimap.create(); - for (Island isl : table.getIslands()) { - islandMultimap.put(isl.size(), isl); + Logger LOG = Logger.getLogger(MatchTableSelection.class.getName()); + private final Map> islandMultimap; + private final Archipelago fixedIslands; + //this fields are needed for the locking of table cells + private final Set fixedRows; + private final Set fixedVertices; + private final MatchTable table; + + public MatchTableSelection(MatchTable table) { + fixedRows = new HashSet<>(); + fixedVertices = new HashSet<>(); + this.table = table; + this.fixedIslands = new Archipelago(); + islandMultimap = new HashMap<>(); + for (Island isl : table.getIslands()) { + islandMultimap.computeIfAbsent(isl.size(), s -> new ArrayList<>()).add(isl); + } } - } - - // copy constructor - public MatchTableSelection(MatchTableSelection orig) { - // table structure is read only, does not have to be copied - this.islandMultimap = ArrayListMultimap.create(orig.islandMultimap); - this.fixedIslands = new Archipelago(orig.fixedIslands); - this.fixedRows = Sets.newHashSet(orig.fixedRows); - this.fixedVertices = Sets.newHashSet(orig.fixedVertices); - this.table = orig.table; - } - - /* - * Return whether a coordinate overlaps with an already committed coordinate - */ - public boolean 
doesCoordinateOverlapWithCommittedCoordinate(Coordinate coordinate) { - return fixedRows.contains(coordinate.row) || // - fixedVertices.contains(table.vertexAt(coordinate.row, coordinate.column)); - } - - /* - * Return whether an island overlaps with an already committed island - */ - public boolean isIslandPossibleCandidate(Island island) { - for (Coordinate coordinate : island) { - if (doesCoordinateOverlapWithCommittedCoordinate(coordinate)) return false; + + // copy constructor + public MatchTableSelection(MatchTableSelection orig) { + // table structure is read only, does not have to be copied + this.islandMultimap = orig.islandMultimap.entrySet().stream().collect(Collectors.toMap(Map.Entry::getKey, e -> new ArrayList<>(e.getValue()))); + this.fixedIslands = new Archipelago(orig.fixedIslands); + this.fixedRows = new HashSet<>(orig.fixedRows); + this.fixedVertices = new HashSet<>(orig.fixedVertices); + this.table = orig.table; } - return true; - } - - /* - * Commit an island in the match table - * Island will be part of the final alignment - */ - public void addIsland(Island isl) { - if (LOG.isLoggable(Level.FINE)) { - LOG.log(Level.FINE, "adding island: '{0}'", isl); + + /* + * Return whether a coordinate overlaps with an already committed coordinate + */ + public boolean doesCoordinateOverlapWithCommittedCoordinate(Coordinate coordinate) { + return fixedRows.contains(coordinate.row) || // + fixedVertices.contains(table.vertexAt(coordinate.row, coordinate.column)); } - for (Coordinate coordinate : isl) { - fixedRows.add(coordinate.row); - fixedVertices.add(table.vertexAt(coordinate.row, coordinate.column)); + + /* + * Return whether an island overlaps with an already committed island + */ + public boolean isIslandPossibleCandidate(Island island) { + for (Coordinate coordinate : island) { + if (doesCoordinateOverlapWithCommittedCoordinate(coordinate)) return false; + } + return true; } - fixedIslands.add(isl); - islandMultimap.remove(isl.size(), isl); - } - - 
public boolean doesCandidateLayOnVectorOfCommittedIsland(Island island) { - Coordinate leftEnd = island.getLeftEnd(); - return fixedIslands.getIslandVectors().contains(leftEnd.row - leftEnd.column); - } - - public int size() { - return fixedIslands.size(); - } - - public List getIslands() { - return fixedIslands.getIslands(); - } - - public boolean containsCoordinate(int row, int column) { - return fixedIslands.containsCoordinate(row, column); - } - - /* - * For all the possible islands of a certain size this method checks whether - * they conflict with one of the previously committed islands. If so, the - * possible island is removed from the multimap. Or in case of overlap, split - * into a smaller island and then put in back into the map Note that this - * method changes the possible islands multimap. - */ - //TODO: the original Island object is modified here - //TODO: That should not happen, if we want to build a decision tree. - public void removeOrSplitImpossibleIslands(Integer islandSize, Multimap islandMultimap) { - Collection islandsToCheck = Lists.newArrayList(islandMultimap.get(islandSize)); - for (Island island : islandsToCheck) { - if (!isIslandPossibleCandidate(island)) { - islandMultimap.remove(islandSize, island); - removeConflictingEndCoordinates(island); - if (island.size() > 0) { - islandMultimap.put(island.size(), island); + + /* + * Commit an island in the match table + * Island will be part of the final alignment + */ + public void addIsland(Island isl) { + if (LOG.isLoggable(Level.FINE)) { + LOG.log(Level.FINE, "adding island: '{0}'", isl); + } + for (Coordinate coordinate : isl) { + fixedRows.add(coordinate.row); + fixedVertices.add(table.vertexAt(coordinate.row, coordinate.column)); } - } + fixedIslands.add(isl); + islandMultimap.computeIfPresent(isl.size(), (s, i) -> { + i.remove(isl); + return (i.isEmpty() ? 
null : i); + }); } - } - - private void removeConflictingEndCoordinates(Island island) { - boolean goOn = true; - while (goOn) { - Coordinate leftEnd = island.getLeftEnd(); - if (doesCoordinateOverlapWithCommittedCoordinate(leftEnd)) { - island.removeCoordinate(leftEnd); - if (island.size() == 0) { - return; + + public boolean doesCandidateLayOnVectorOfCommittedIsland(Island island) { + Coordinate leftEnd = island.getLeftEnd(); + return fixedIslands.getIslandVectors().contains(leftEnd.row - leftEnd.column); + } + + public int size() { + return fixedIslands.size(); + } + + public List getIslands() { + return fixedIslands.getIslands(); + } + + public boolean containsCoordinate(int row, int column) { + return fixedIslands.containsCoordinate(row, column); + } + + /* + * For all the possible islands of a certain size this method checks whether + * they conflict with one of the previously committed islands. If so, the + * possible island is removed from the multimap. Or in case of overlap, split + * into a smaller island and then put in back into the map Note that this + * method changes the possible islands multimap. + */ + //TODO: the original Island object is modified here + //TODO: That should not happen, if we want to build a decision tree. + public void removeOrSplitImpossibleIslands(Integer islandSize, Map> islandMultimap) { + Collection islandsToCheck = new ArrayList<>(islandMultimap.getOrDefault(islandSize, Collections.emptyList())); + for (Island island : islandsToCheck) { + if (!isIslandPossibleCandidate(island)) { + islandMultimap.computeIfPresent(islandSize, (s, i) -> { + i.remove(island); + return (i.isEmpty() ? 
null : i); + }); + removeConflictingEndCoordinates(island); + if (island.size() > 0) { + islandMultimap.computeIfAbsent(island.size(), s -> new ArrayList<>()).add(island); + } + } } - } else { - goOn = false; - } } - goOn = true; - while (goOn) { - Coordinate rightEnd = island.getRightEnd(); - if (doesCoordinateOverlapWithCommittedCoordinate(rightEnd)) { - island.removeCoordinate(rightEnd); - if (island.size() == 0) { - return; + + private void removeConflictingEndCoordinates(Island island) { + boolean goOn = true; + while (goOn) { + Coordinate leftEnd = island.getLeftEnd(); + if (doesCoordinateOverlapWithCommittedCoordinate(leftEnd)) { + island.removeCoordinate(leftEnd); + if (island.size() == 0) { + return; + } + } else { + goOn = false; + } + } + goOn = true; + while (goOn) { + Coordinate rightEnd = island.getRightEnd(); + if (doesCoordinateOverlapWithCommittedCoordinate(rightEnd)) { + island.removeCoordinate(rightEnd); + if (island.size() == 0) { + return; + } + } else { + goOn = false; + } } - } else { - goOn = false; - } } - } - - public List getPossibleIslands() { - List possibleIslands = Lists.newArrayList(); - while(possibleIslands.isEmpty()&&!islandMultimap.isEmpty()) { - // find the maximum island size and traverse groups in descending order - Integer max = Collections.max(islandMultimap.keySet()); - LOG.fine("Checking islands of size: "+max); - // check the possible islands of a certain size against - // the already committed islands. 
- removeOrSplitImpossibleIslands(max, islandMultimap); - possibleIslands = Lists.newArrayList(islandMultimap.get(max)); + + public List getPossibleIslands() { + List possibleIslands = new ArrayList<>(); + while (possibleIslands.isEmpty() && !islandMultimap.isEmpty()) { + // find the maximum island size and traverse groups in descending order + Integer max = Collections.max(islandMultimap.keySet()); + LOG.fine("Checking islands of size: " + max); + // check the possible islands of a certain size against + // the already committed islands. + removeOrSplitImpossibleIslands(max, islandMultimap); + possibleIslands = new ArrayList<>(islandMultimap.getOrDefault(max, Collections.emptyList())); + } + return possibleIslands; } - return possibleIslands; - } } \ No newline at end of file diff --git a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/MatchTableSerializer.java b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/MatchTableSerializer.java index 254071c21..5e3a2a763 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/MatchTableSerializer.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/MatchTableSerializer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -19,128 +19,126 @@ package eu.interedition.collatex.dekker.matrix; -import java.util.ArrayList; - import eu.interedition.collatex.Token; +import java.util.ArrayList; + /** - * * @author Meindert Kroese * @author Ronald Haentjens Dekker - * */ //TODO: The methods in this class are extracted from the old MatchMatrix class //TODO: check correctness public class MatchTableSerializer { - //TODO: rename - private final MatchTable sparseMatrix; - - public MatchTableSerializer(MatchTable table) { - this.sparseMatrix = table; - } - - public String toHtml() { - StringBuilder result = new StringBuilder("\n\n"); - ArrayList colLabels = columnLabels(); - for (String cLabel : colLabels) { - result.append(""); - } - result.append("\n"); - int colNum = sparseMatrix.columnList().size(); - ArrayList rLabels = rowLabels(); - int row = 0; - for (String label : rLabels) { - result.append(""); - for (int col = 0; col < colNum; col++) - if (sparseMatrix.vertexAt(row, col)!=null) - result.append(""); - else - result.append(""); - result.append("\n"); - row++; - } - result.append("
").append(cLabel).append("
").append(label).append("M
"); - return result.toString(); - } + //TODO: rename + private final MatchTable sparseMatrix; - // arch = preferred matches - public String toHtml(Archipelago arch) { - int mat[] = new int[rowNum()]; - for (Island isl : arch.getIslands()) { - for (Coordinate c : isl) { - mat[c.row] = c.column; - } - } - StringBuilder result = new StringBuilder("\n\n"); - ArrayList colLabels = columnLabels(); - for (String cLabel : colLabels) { - result.append(""); + public MatchTableSerializer(MatchTable table) { + this.sparseMatrix = table; } - result.append("\n"); - ArrayList rLabels = rowLabels(); - int row = 0; - for (String label : rLabels) { - result.append(""); - if (mat[row] > 0) { - result.append("").append(""); - } - result.append("\n"); - row++; + + public String toHtml() { + StringBuilder result = new StringBuilder("
").append(cLabel).append("
").append(label).append("M
\n\n"); + ArrayList colLabels = columnLabels(); + for (String cLabel : colLabels) { + result.append(""); + } + result.append("\n"); + int colNum = sparseMatrix.columnList().size(); + ArrayList rLabels = rowLabels(); + int row = 0; + for (String label : rLabels) { + result.append(""); + for (int col = 0; col < colNum; col++) + if (sparseMatrix.vertexAt(row, col) != null) + result.append(""); + else + result.append(""); + result.append("\n"); + row++; + } + result.append("
").append(cLabel).append("
").append(label).append("M
"); + return result.toString(); } - result.append(""); - return result.toString(); - } - @Override - public String toString() { - StringBuilder result = new StringBuilder(); - ArrayList colLabels = columnLabels(); - for (String cLabel : colLabels) { - result.append(" ").append(cLabel); + // arch = preferred matches + public String toHtml(Archipelago arch) { + int mat[] = new int[rowNum()]; + for (Island isl : arch.getIslands()) { + for (Coordinate c : isl) { + mat[c.row] = c.column; + } + } + StringBuilder result = new StringBuilder("\n\n"); + ArrayList colLabels = columnLabels(); + for (String cLabel : colLabels) { + result.append(""); + } + result.append("\n"); + ArrayList rLabels = rowLabels(); + int row = 0; + for (String label : rLabels) { + result.append(""); + if (mat[row] > 0) { + result.append("").append(""); + } + result.append("\n"); + row++; + } + result.append("
").append(cLabel).append("
").append(label).append("M
"); + return result.toString(); } - result.append("\n"); - int colNum = sparseMatrix.columnList().size(); - ArrayList rLabels = rowLabels(); - int row = 0; - for (String label : rLabels) { - result.append(label); - for (int col = 0; col < colNum; col++) - result.append(" ").append(sparseMatrix.vertexAt(row++, col)!=null); - result.append("\n"); + + @Override + public String toString() { + StringBuilder result = new StringBuilder(); + ArrayList colLabels = columnLabels(); + for (String cLabel : colLabels) { + result.append(" ").append(cLabel); + } + result.append("\n"); + int colNum = sparseMatrix.columnList().size(); + ArrayList rLabels = rowLabels(); + int row = 0; + for (String label : rLabels) { + result.append(label); + for (int col = 0; col < colNum; col++) + result.append(" ").append(sparseMatrix.vertexAt(row++, col) != null); + result.append("\n"); + } + return result.toString(); } - return result.toString(); - } - public ArrayList rowLabels() { - ArrayList labels = new ArrayList(); - for (Token vgv : sparseMatrix.rowList()) { - String token = vgv.toString(); - int pos = token.indexOf(":'"); - if (pos > -1) { - labels.add(token.substring(pos + 2, token.length() - 2)); - } + public ArrayList rowLabels() { + ArrayList labels = new ArrayList(); + for (Token vgv : sparseMatrix.rowList()) { + String token = vgv.toString(); + int pos = token.indexOf(":'"); + if (pos > -1) { + labels.add(token.substring(pos + 2, token.length() - 2)); + } + } + return labels; } - return labels; - } - public ArrayList columnLabels() { - ArrayList labels = new ArrayList(); - for (Integer t : sparseMatrix.columnList()) { - String token = t.toString(); - int pos = token.indexOf(":'"); - if (pos > -1) { - // LOG.debug("token={{}}, pos={}", token, pos); - labels.add(token.substring(pos + 2, token.length() - 1)); - } + public ArrayList columnLabels() { + ArrayList labels = new ArrayList(); + for (Integer t : sparseMatrix.columnList()) { + String token = t.toString(); + int pos = 
token.indexOf(":'"); + if (pos > -1) { + // LOG.debug("token={{}}, pos={}", token, pos); + labels.add(token.substring(pos + 2, token.length() - 1)); + } + } + return labels; } - return labels; - } - public int rowNum() { - return rowLabels().size(); - } + public int rowNum() { + return rowLabels().size(); + } - public int colNum() { - return columnLabels().size(); - } + public int colNum() { + return columnLabels().size(); + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/package-info.java b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/package-info.java index 38fbfb70d..a1d331bb1 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/package-info.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/dekker/matrix/package-info.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * diff --git a/collatex-core/src/main/java/eu/interedition/collatex/dekker/package-info.java b/collatex-core/src/main/java/eu/interedition/collatex/dekker/package-info.java index 829bca94e..c39e0071a 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/dekker/package-info.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/dekker/package-info.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -19,7 +19,7 @@ /** * Dekker's implementation of a collation algorithm. - *

+ * * Supports progressive alignment of multiple witnesses including heuristic detection of transpositions. * * @see eu.interedition.collatex.dekker.DekkerAlgorithm diff --git a/collatex-core/src/main/java/eu/interedition/collatex/jung/JungVariantGraph.java b/collatex-core/src/main/java/eu/interedition/collatex/jung/JungVariantGraph.java deleted file mode 100644 index 5d1e73a72..000000000 --- a/collatex-core/src/main/java/eu/interedition/collatex/jung/JungVariantGraph.java +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . 
- */ - -package eu.interedition.collatex.jung; - -import com.google.common.base.Preconditions; -import com.google.common.collect.HashMultimap; -import com.google.common.collect.Iterables; -import com.google.common.collect.Multimap; -import com.google.common.collect.Sets; -import edu.uci.ics.jung.graph.DirectedSparseGraph; -import eu.interedition.collatex.Token; -import eu.interedition.collatex.VariantGraph; -import eu.interedition.collatex.Witness; -import eu.interedition.collatex.util.VariantGraphTraversal; - -import java.util.Collections; -import java.util.Set; - -/** - * @author Gregor Middell - */ -public class JungVariantGraph extends DirectedSparseGraph implements VariantGraph { - - final JungVariantGraphVertex start; - final JungVariantGraphVertex end; - final Multimap transpositionIndex = HashMultimap.create(); - - public JungVariantGraph() { - super(); - addVertex(this.start = new JungVariantGraphVertex(this, Collections.emptySet())); - addVertex(this.end = new JungVariantGraphVertex(this, Collections.emptySet())); - connect(this.start, this.end, Collections.emptySet()); - } - - @Override - public Vertex getStart() { - return start; - } - - @Override - public Vertex getEnd() { - return end; - } - - @Override - public Set transpositions() { - return Sets.newHashSet(transpositionIndex.values()); - } - - @Override - public Iterable vertices() { - return vertices(null); - } - - @Override - public Iterable vertices(Set witnesses) { - return VariantGraphTraversal.of(this, witnesses); - } - - @Override - public Iterable edges() { - return edges(null); - } - - @Override - public Iterable edges(Set witnesses) { - return VariantGraphTraversal.of(this, witnesses).edges(); - } - - @Override - public Vertex add(Token token) { - final JungVariantGraphVertex vertex = new JungVariantGraphVertex(this, Collections.singleton(token)); - addVertex(vertex); - return vertex; - } - - @Override - public Edge connect(Vertex from, Vertex to, Set witnesses) { - 
Preconditions.checkArgument(!from.equals(to)); - - if (from.equals(start)) { - final Edge startEndEdge = edgeBetween(start, end); - if (startEndEdge != null) { - if (to.equals(end)) { - witnesses = Sets.newHashSet(witnesses); - witnesses.addAll(startEndEdge.witnesses()); - } - startEndEdge.delete(); - } - } - - for (Edge e : from.outgoing()) { - if (to.equals(e.to())) { - return e.add(witnesses); - } - } - - final JungVariantGraphEdge edge = new JungVariantGraphEdge(this, witnesses); - addEdge(edge, (JungVariantGraphVertex) from, (JungVariantGraphVertex) to); - return edge; - } - - @Override - public Edge register(Witness witness) { - return connect(start, end, Collections.singleton(witness)); - } - - @Override - public Transposition transpose(Set vertices) { - Preconditions.checkArgument(!vertices.isEmpty()); - for (Transposition transposition : vertices.iterator().next().transpositions()) { - if (Sets.newHashSet(transposition).equals(vertices)) { - return transposition; - } - } - return new JungVariantGraphTransposition(this, vertices); - } - - @Override - public Edge edgeBetween(Vertex a, Vertex b) { - return findEdge((JungVariantGraphVertex) a, (JungVariantGraphVertex) b); - } - - @Override - public Set witnesses() { - Set witnesses = Sets.newHashSet(); - for (Edge edge : start.outgoing()) { - witnesses.addAll(edge.witnesses()); - } - return witnesses; - } - - @Override - public String toString() { - return Iterables.toString(witnesses()); - } -} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/jung/JungVariantGraphEdge.java b/collatex-core/src/main/java/eu/interedition/collatex/jung/JungVariantGraphEdge.java deleted file mode 100644 index 54d50ab7d..000000000 --- a/collatex-core/src/main/java/eu/interedition/collatex/jung/JungVariantGraphEdge.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. 
- * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . - */ - -package eu.interedition.collatex.jung; - -import com.google.common.collect.Iterables; -import com.google.common.collect.Sets; -import eu.interedition.collatex.VariantGraph; -import eu.interedition.collatex.Witness; - -import java.util.Collections; -import java.util.Set; - -/** - * @author Gregor Middell - */ -public class JungVariantGraphEdge implements VariantGraph.Edge { - - final JungVariantGraph graph; - final Set witnesses; - - public JungVariantGraphEdge(JungVariantGraph graph, Set witnesses) { - this.graph = graph; - this.witnesses = Sets.newHashSet(witnesses); - } - - @Override - public VariantGraph.Edge add(Set witnesses) { - this.witnesses.addAll(witnesses); - return this; - } - - @Override - public Set witnesses() { - return Collections.unmodifiableSet(witnesses); - } - - @Override - public VariantGraph graph() { - return graph; - } - - @Override - public VariantGraph.Vertex from() { - return graph.getEndpoints(this).getFirst(); - } - - @Override - public VariantGraph.Vertex to() { - return graph.getEndpoints(this).getSecond(); - } - - @Override - public void delete() { - graph.removeEdge(this); - } - - @Override - public String toString() { - return Iterables.toString(witnesses); - } -} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/jung/JungVariantGraphTransposition.java 
b/collatex-core/src/main/java/eu/interedition/collatex/jung/JungVariantGraphTransposition.java deleted file mode 100644 index 6618bd99e..000000000 --- a/collatex-core/src/main/java/eu/interedition/collatex/jung/JungVariantGraphTransposition.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . - */ - -package eu.interedition.collatex.jung; - -import com.google.common.collect.Iterables; -import com.google.common.collect.Iterators; -import com.google.common.collect.Sets; -import eu.interedition.collatex.VariantGraph; - -import java.util.Iterator; -import java.util.Set; - -/** - * @author Gregor Middell - */ -public class JungVariantGraphTransposition implements VariantGraph.Transposition { - - private final JungVariantGraph graph; - private final Set vertices; - - public JungVariantGraphTransposition(JungVariantGraph graph, Set vertices) { - this.graph = graph; - this.vertices = Sets.newHashSet(vertices); - for (VariantGraph.Vertex vertex : this.vertices) { - graph.transpositionIndex.put(vertex, this); - } - } - - @Override - public void delete() { - for (VariantGraph.Vertex vertex : this.vertices) { - graph.transpositionIndex.remove(vertex, this); - } - } - - @Override - public Iterator iterator() { - return vertices.iterator(); - } - - @Override - public String toString() { - 
return Iterables.toString(vertices); - } -} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/jung/JungVariantGraphVertex.java b/collatex-core/src/main/java/eu/interedition/collatex/jung/JungVariantGraphVertex.java deleted file mode 100644 index b136a19a7..000000000 --- a/collatex-core/src/main/java/eu/interedition/collatex/jung/JungVariantGraphVertex.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . 
- */ - -package eu.interedition.collatex.jung; - -import com.google.common.base.Predicate; -import com.google.common.base.Predicates; -import com.google.common.collect.Iterables; -import com.google.common.collect.Sets; -import eu.interedition.collatex.Token; -import eu.interedition.collatex.VariantGraph; -import eu.interedition.collatex.Witness; - -import javax.annotation.Nullable; -import java.util.Collections; -import java.util.Set; - - -/** - * @author Gregor Middell - */ -public class JungVariantGraphVertex implements VariantGraph.Vertex { - private final JungVariantGraph graph; - private final Set tokens; - - public JungVariantGraphVertex(JungVariantGraph graph, Set tokens) { - this.graph = graph; - this.tokens = Sets.newHashSet(tokens); - } - - @Override - public Iterable incoming() { - return incoming(null); - } - - @Override - public Iterable incoming(final Set witnesses) { - return paths(graph.getInEdges(this), witnesses); - } - - @Override - public Iterable outgoing() { - return outgoing(null); - } - - @Override - public Iterable outgoing(Set witnesses) { - return paths(graph.getOutEdges(this), witnesses); - } - - @Override - public Iterable transpositions() { - return graph.transpositionIndex.get(this); - } - - @Override - public Set tokens() { - return tokens(null); - } - - @Override - public Set tokens(final Set witnesses) { - return Collections.unmodifiableSet(Sets.filter(tokens, witnesses == null ? 
Predicates.alwaysTrue() : new Predicate() { - @Override - public boolean apply(@Nullable Token token) { - return witnesses.contains(token.getWitness()); - } - })); - } - - @Override - public Set witnesses() { - final Set witnesses = Sets.newHashSet(); - for (VariantGraph.Edge edge : incoming()) { - witnesses.addAll(edge.witnesses()); - } - return witnesses; - } - - @Override - public void add(Iterable tokens) { - Iterables.addAll(this.tokens, tokens); - } - - @Override - public VariantGraph graph() { - return graph; - } - - @Override - public void delete() { - graph.removeVertex(this); - } - - @Override - public String toString() { - return Iterables.toString(tokens); - } - - protected static Iterable paths(final Iterable edges, final Set witnesses) { - return Iterables.filter(edges, (witnesses == null ? Predicates.alwaysTrue() : new Predicate() { - @Override - public boolean apply(@Nullable JungVariantGraphEdge edge) { - for (Witness edgeWitness : edge.witnesses()) { - if (witnesses.contains(edgeWitness)) { - return true; - } - } - return false; - } - })); - } -} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/jung/package-info.java b/collatex-core/src/main/java/eu/interedition/collatex/jung/package-info.java deleted file mode 100644 index 4b419d7a7..000000000 --- a/collatex-core/src/main/java/eu/interedition/collatex/jung/package-info.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . - */ - -/** - * In-memory implementation of variant graphs based on the Java - * Universal Network/Graph Framework (JUNG). - * - */ -package eu.interedition.collatex.jung; \ No newline at end of file diff --git a/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistance.java b/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistance.java index f0bc92e2f..674a50de2 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistance.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistance.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -40,7 +40,7 @@ public static int compute(String str1, String str2) { } int[][][] cache = new int[30][][]; - int matrix[][]; + int matrix[][]; if (str2Length >= cache.length) { matrix = form(str1Length, str2Length); } else if (cache[str2Length] != null) { @@ -48,7 +48,7 @@ public static int compute(String str1, String str2) { } else { matrix = cache[str2Length] = form(str1Length, str2Length); } - + for (int i = 1; i <= str1Length; i++) { final char str1Char = str1Chars[i - 1]; for (int j = 1; j <= str2Length; j++) { diff --git a/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistanceTokenComparator.java b/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistanceTokenComparator.java index 00168f192..35879c766 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistanceTokenComparator.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistanceTokenComparator.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. 
+ * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -26,20 +26,20 @@ public class EditDistanceTokenComparator implements Comparator { - private final int threshold; + private final int threshold; - public EditDistanceTokenComparator() { - this(1); - } + public EditDistanceTokenComparator() { + this(1); + } - public EditDistanceTokenComparator(int threshold) { - this.threshold = threshold; - } + public EditDistanceTokenComparator(int threshold) { + this.threshold = threshold; + } - @Override - public int compare(Token base, Token witness) { - final String baseContent = ((SimpleToken) base).getNormalized(); - final String witnessContent = ((SimpleToken) witness).getNormalized(); - return (EditDistance.compute(baseContent, witnessContent) <= threshold) ? 0 : -1; - } + @Override + public int compare(Token base, Token witness) { + final String baseContent = ((SimpleToken) base).getNormalized(); + final String witnessContent = ((SimpleToken) witness).getNormalized(); + return (EditDistance.compute(baseContent, witnessContent) <= threshold) ? 0 : -1; + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/matching/EqualityTokenComparator.java b/collatex-core/src/main/java/eu/interedition/collatex/matching/EqualityTokenComparator.java index 077638d33..fb213fe09 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/matching/EqualityTokenComparator.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/matching/EqualityTokenComparator.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -26,11 +26,11 @@ public class EqualityTokenComparator implements Comparator { - @Override - public int compare(Token base, Token witness) { - final String baseContent = ((SimpleToken) base).getNormalized(); - final String witnessContent = ((SimpleToken) witness).getNormalized(); - return baseContent.compareTo(witnessContent); - } + @Override + public int compare(Token base, Token witness) { + final String baseContent = ((SimpleToken) base).getNormalized(); + final String witnessContent = ((SimpleToken) witness).getNormalized(); + return baseContent.compareTo(witnessContent); + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/matching/Matches.java b/collatex-core/src/main/java/eu/interedition/collatex/matching/Matches.java index 3c5d2a356..184ad2217 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/matching/Matches.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/matching/Matches.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -19,105 +19,76 @@ package eu.interedition.collatex.matching; -import java.util.Collection; +import eu.interedition.collatex.Token; +import eu.interedition.collatex.VariantGraph; + +import java.util.ArrayList; +import java.util.Collections; import java.util.Comparator; +import java.util.HashMap; +import java.util.LinkedHashSet; +import java.util.List; import java.util.Map; import java.util.Set; - -import com.google.common.collect.ArrayListMultimap; -import com.google.common.collect.ImmutableMultiset; -import com.google.common.collect.Iterables; -import com.google.common.collect.ListMultimap; -import com.google.common.collect.Multiset; -import com.google.common.collect.Sets; - -import eu.interedition.collatex.Token; -import eu.interedition.collatex.VariantGraph; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; public class Matches { - private final ListMultimap all; - private final Set unmatched; - private final Set ambiguous; - private final Set unique; - - public static Matches between(final Iterable vertices, final Iterable witnessTokens, Comparator comparator) { - - final ListMultimap all = ArrayListMultimap.create(); - for (VariantGraph.Vertex vertex : vertices) { - final Set tokens = vertex.tokens(); - if (tokens.isEmpty()) { - continue; - } - for (Token witnessToken : witnessTokens) { - if (comparator.compare(Iterables.getFirst(tokens, null), witnessToken) == 0) { - all.put(witnessToken, vertex); - } - } + public final Map> allMatches; + public final Set unmatchedInWitness; + public final Set ambiguousInWitness; + public final Set uniqueInWitness; + + public static Matches between(final Iterable vertices, final Iterable witnessTokens, Comparator comparator) { + + final Map> allMatches = new HashMap<>(); + + StreamSupport.stream(vertices.spliterator(), false).forEach(vertex -> + vertex.tokens().stream().findFirst().ifPresent(baseToken -> + 
StreamSupport.stream(witnessTokens.spliterator(), false) + .filter(witnessToken -> comparator.compare(baseToken, witnessToken) == 0) + .forEach(matchingToken -> allMatches.computeIfAbsent(matchingToken, t -> new ArrayList<>()).add(vertex)))); + + final Set unmatchedInWitness = StreamSupport.stream(witnessTokens.spliterator(), false) + .filter(t -> !allMatches.containsKey(t)) + .collect(Collectors.toCollection(LinkedHashSet::new)); + + final Set ambiguousInBase = allMatches.values().stream() + .flatMap(List::stream) + .collect(Collectors.toMap(Function.identity(), v -> 1, (a, b) -> a + b)) + .entrySet() + .stream() + .filter(v -> v.getValue() > 1) + .map(Map.Entry::getKey) + .collect(Collectors.toCollection(LinkedHashSet::new)); + + // (have to check: base -> witness, and witness -> base) + final Set ambiguousInWitness = Stream.concat( + StreamSupport.stream(witnessTokens.spliterator(), false) + .filter(t -> allMatches.getOrDefault(t, Collections.emptyList()).size() > 1), + + allMatches.entrySet().stream() + .filter(match -> match.getValue().stream().anyMatch(ambiguousInBase::contains)) + .map(Map.Entry::getKey) + ).collect(Collectors.toCollection(LinkedHashSet::new)); + + // sure tokens + // have to check unsure tokens because of (base -> witness && witness -> base) + final Set uniqueInWitness = StreamSupport.stream(witnessTokens.spliterator(), false) + .filter(t -> allMatches.getOrDefault(t, Collections.emptyList()).size() == 1 && !ambiguousInWitness.contains(t)) + .collect(Collectors.toCollection(LinkedHashSet::new)); + + return new Matches(allMatches, unmatchedInWitness, ambiguousInWitness, uniqueInWitness); } - // unmatched tokens - Set unmatched = Sets.newLinkedHashSet(); - for (Token witnessToken : witnessTokens) { - if (!all.containsKey(witnessToken)) { - unmatched.add(witnessToken); - } - } - // unsure tokens (have to check: base -> witness, and witness -> base) - Set ambiguous = Sets.newLinkedHashSet(); - for (Token witnessToken : witnessTokens) { - int 
count = all.keys().count(witnessToken); - if (count > 1) { - ambiguous.add(witnessToken); - } - } - Multiset bag = ImmutableMultiset.copyOf(all.values()); - Set unsureBaseTokens = Sets.newLinkedHashSet(); - for (VariantGraph.Vertex baseToken : vertices) { - int count = bag.count(baseToken); - if (count > 1) { - unsureBaseTokens.add(baseToken); - } + private Matches(Map> allMatches, Set unmatchedInWitness, Set ambiguousInWitness, Set uniqueInWitness) { + this.allMatches = Collections.unmodifiableMap(allMatches); + this.unmatchedInWitness = Collections.unmodifiableSet(unmatchedInWitness); + this.ambiguousInWitness = Collections.unmodifiableSet(ambiguousInWitness); + this.uniqueInWitness = Collections.unmodifiableSet(uniqueInWitness); } - Collection> entries = all.entries(); - for (Map.Entry entry : entries) { - if (unsureBaseTokens.contains(entry.getValue())) { - ambiguous.add(entry.getKey()); - } - } - // sure tokens - // have to check unsure tokens because of (base -> witness && witness -> base) - Set unique = Sets.newLinkedHashSet(); - for (Token witnessToken : witnessTokens) { - if (all.keys().count(witnessToken) == 1 && !ambiguous.contains(witnessToken)) { - unique.add(witnessToken); - } - } - - return new Matches(all, unmatched, ambiguous, unique); - } - - private Matches(ListMultimap all, Set unmatched, Set ambiguous, Set unique) { - this.all = all; - this.unmatched = unmatched; - this.ambiguous = ambiguous; - this.unique = unique; - } - - public ListMultimap getAll() { - return all; - } - - public Set getUnmatched() { - return unmatched; - } - - public Set getAmbiguous() { - return ambiguous; - } - - public Set getUnique() { - return unique; - } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/matching/StrictEqualityTokenComparator.java b/collatex-core/src/main/java/eu/interedition/collatex/matching/StrictEqualityTokenComparator.java index 88b4dae01..b52b9ac43 100755 --- 
a/collatex-core/src/main/java/eu/interedition/collatex/matching/StrictEqualityTokenComparator.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/matching/StrictEqualityTokenComparator.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -19,18 +19,18 @@ package eu.interedition.collatex.matching; -import java.util.Comparator; - import eu.interedition.collatex.Token; import eu.interedition.collatex.simple.SimpleToken; +import java.util.Comparator; + public class StrictEqualityTokenComparator implements Comparator { - @Override - public int compare(Token base, Token witness) { - final String baseContent = ((SimpleToken) base).getContent(); - final String witnessContent = ((SimpleToken) witness).getContent(); - return baseContent.compareTo(witnessContent); - } + @Override + public int compare(Token base, Token witness) { + final String baseContent = ((SimpleToken) base).getContent(); + final String witnessContent = ((SimpleToken) witness).getContent(); + return baseContent.compareTo(witnessContent); + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/matching/package-info.java b/collatex-core/src/main/java/eu/interedition/collatex/matching/package-info.java index 6e3ba47b9..b6220450e 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/matching/package-info.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/matching/package-info.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -19,7 +19,7 @@ /** * {@link java.util.Comparator Comparators} for matching tokens. - *

+ * * Implementation base the equality of tokens on strict or on approximate equality of their respective textual contents. * * @see eu.interedition.collatex.matching.StrictEqualityTokenComparator diff --git a/collatex-core/src/main/java/eu/interedition/collatex/medite/AlignmentDecisionGraph.java b/collatex-core/src/main/java/eu/interedition/collatex/medite/AlignmentDecisionGraph.java index a2553b0ac..6eec94b35 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/medite/AlignmentDecisionGraph.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/medite/AlignmentDecisionGraph.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -19,140 +19,134 @@ package eu.interedition.collatex.medite; -import com.google.common.base.Function; -import com.google.common.base.Objects; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; import eu.interedition.collatex.util.VertexMatch; +import java.util.ArrayList; import java.util.Comparator; +import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.PriorityQueue; import java.util.SortedSet; +import java.util.TreeSet; +import java.util.function.Function; /** - * @author Gregor Middell + * @author Gregor Middell */ public class AlignmentDecisionGraph { - private final List> matches; - private final Function, Integer> matchEvaluator; - private final PriorityQueue bestPaths; - private final Map minCosts; - - AlignmentDecisionGraph(List> matches, Function, Integer> matchEvaluator) { - this.matches = matches; - this.matchEvaluator = matchEvaluator; - this.bestPaths = new PriorityQueue(matches.size(), PATH_COST_COMPARATOR); - this.minCosts = Maps.newHashMap(); - } - - static SortedSet> filter(SortedSet> matches, Function, Integer> matchEvaluator) { - final SortedSet> 
alignments = Sets.newTreeSet(VertexMatch.setComparator()); - - final List> matchList = Lists.newArrayList(matches); - Node optimal = new AlignmentDecisionGraph(matchList, matchEvaluator).findBestPath(); - while (optimal.matchIndex >= 0) { - if (optimal.aligned) { - alignments.add(matchList.get(optimal.matchIndex)); - } - optimal = optimal.previous; - } - return alignments; - } - - private Node findBestPath() { - bestPaths.add(new Node(-1, false)); - while (!bestPaths.isEmpty()) { - final Node current = bestPaths.remove(); - if (current.matchIndex == matches.size() - 1) { - return current; - } - for (Node successor : current.successors()) { - final int tentativeCost = cost(current) + cost(successor); - if (bestPaths.contains(successor) && tentativeCost >= minCosts.get(successor)) { - continue; - } - minCosts.put(successor, tentativeCost); + private final List> matches; + private final Function, Integer> matchEvaluator; + private final PriorityQueue bestPaths; + private final Map minCosts; - successor.cost = tentativeCost + heuristicCost(successor); - successor.previous = current; - bestPaths.remove(successor); - bestPaths.add(successor); - } - } - throw new IllegalStateException("No optimal alignment found"); - } - - private int heuristicCost(Node path) { - final SortedSet evaluated = matches.get(path.matchIndex); - final VertexMatch.WithTokenIndex lastMatch = evaluated.last(); - - int cost = 0; - for (SortedSet following : matches.subList(path.matchIndex + 1, matches.size())) { - final VertexMatch.WithTokenIndex followingFirstMatch = following.first(); - if (lastMatch.vertexRank < followingFirstMatch.vertexRank && lastMatch.token < followingFirstMatch.token) { - // we still can align this following match as the matched components are to the right of this path's last match - continue; - } - // we cannot align this following match, so add it to the cost - cost += value(following); + AlignmentDecisionGraph(List> matches, Function, Integer> matchEvaluator) { + 
this.matches = matches; + this.matchEvaluator = matchEvaluator; + this.bestPaths = new PriorityQueue<>(matches.size(), Comparator.comparingInt(n -> n.cost)); + this.minCosts = new HashMap<>(); } - return cost; - } - - private int cost(Node current) { - int cost = 0; - while (current != null && current.matchIndex >= 0) { - if (!current.aligned) { - cost += value(matches.get(current.matchIndex)); - } - current = current.previous; + + static SortedSet> filter(SortedSet> matches, Function, Integer> matchEvaluator) { + final SortedSet> alignments = new TreeSet<>(VertexMatch.setComparator()); + + final List> matchList = new ArrayList<>(matches); + Node optimal = new AlignmentDecisionGraph(matchList, matchEvaluator).findBestPath(); + while (optimal.matchIndex >= 0) { + if (optimal.aligned) { + alignments.add(matchList.get(optimal.matchIndex)); + } + optimal = optimal.previous; + } + return alignments; } - return cost; - } - - private int value(SortedSet match) { - return matchEvaluator.apply(match); - } - - static class Node { - final int matchIndex; - final boolean aligned; - Node previous; - int cost; - - Node(int matchIndex, boolean aligned) { - this.matchIndex = matchIndex; - this.aligned = aligned; + + private Node findBestPath() { + bestPaths.add(new Node(-1, false)); + while (!bestPaths.isEmpty()) { + final Node current = bestPaths.remove(); + if (current.matchIndex == matches.size() - 1) { + return current; + } + for (Node successor : current.successors()) { + final int tentativeCost = cost(current) + cost(successor); + if (bestPaths.contains(successor) && tentativeCost >= minCosts.get(successor)) { + continue; + } + minCosts.put(successor, tentativeCost); + + successor.cost = tentativeCost + heuristicCost(successor); + successor.previous = current; + bestPaths.remove(successor); + bestPaths.add(successor); + } + } + throw new IllegalStateException("No optimal alignment found"); } - Node[] successors() { - final int nextIndex = matchIndex + 1; - return new Node[] 
{ new Node(nextIndex, true), new Node(nextIndex, false) }; + private int heuristicCost(Node path) { + final SortedSet evaluated = matches.get(path.matchIndex); + final VertexMatch.WithTokenIndex lastMatch = evaluated.last(); + + int cost = 0; + for (SortedSet following : matches.subList(path.matchIndex + 1, matches.size())) { + final VertexMatch.WithTokenIndex followingFirstMatch = following.first(); + if (lastMatch.vertexRank < followingFirstMatch.vertexRank && lastMatch.token < followingFirstMatch.token) { + // we still can align this following match as the matched components are to the right of this path's last match + continue; + } + // we cannot align this following match, so add it to the cost + cost += value(following); + } + return cost; } - @Override - public boolean equals(Object obj) { - if (obj != null && obj instanceof Node) { - final Node other = (Node) obj; - return (matchIndex == other.matchIndex) && (aligned == other.aligned); - } - return super.equals(obj); + private int cost(Node current) { + int cost = 0; + while (current != null && current.matchIndex >= 0) { + if (!current.aligned) { + cost += value(matches.get(current.matchIndex)); + } + current = current.previous; + } + return cost; } - @Override - public int hashCode() { - return Objects.hashCode(matchIndex, aligned); + private int value(SortedSet match) { + return matchEvaluator.apply(match); } - } - static final Comparator PATH_COST_COMPARATOR = new Comparator() { - @Override - public int compare(Node o1, Node o2) { - return (o1.cost - o2.cost); + static class Node { + final int matchIndex; + final boolean aligned; + Node previous; + int cost; + + Node(int matchIndex, boolean aligned) { + this.matchIndex = matchIndex; + this.aligned = aligned; + } + + Node[] successors() { + final int nextIndex = matchIndex + 1; + return new Node[]{new Node(nextIndex, true), new Node(nextIndex, false)}; + } + + @Override + public boolean equals(Object obj) { + if (obj != null && obj instanceof Node) { + 
final Node other = (Node) obj; + return (matchIndex == other.matchIndex) && (aligned == other.aligned); + } + return super.equals(obj); + } + + @Override + public int hashCode() { + return Objects.hash(matchIndex, aligned); + } } - }; + } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/medite/Matches.java b/collatex-core/src/main/java/eu/interedition/collatex/medite/Matches.java index 8b732e958..7b0439796 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/medite/Matches.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/medite/Matches.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -19,200 +19,199 @@ package eu.interedition.collatex.medite; -import com.google.common.base.Function; -import com.google.common.base.Joiner; -import com.google.common.base.Preconditions; -import com.google.common.collect.HashMultimap; -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; -import com.google.common.collect.Multimap; -import com.google.common.collect.Range; -import com.google.common.collect.Sets; -import com.google.common.collect.SortedSetMultimap; import eu.interedition.collatex.Token; import eu.interedition.collatex.VariantGraph; -import eu.interedition.collatex.util.IntegerRangeSet; -import eu.interedition.collatex.util.VariantGraphRanking; import eu.interedition.collatex.util.VertexMatch; import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; import java.util.Collections; import java.util.Comparator; +import java.util.HashMap; import java.util.LinkedList; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; +import java.util.function.Function; +import java.util.stream.Collectors; /** - * @author Gregor Middell + * @author Gregor Middell */ public class Matches extends 
ArrayList> { - public Matches(int initialCapacity) { - super(initialCapacity); - } + public Matches(int initialCapacity) { + super(initialCapacity); + } - public static Matches between(VariantGraph.Vertex[][] vertices, SuffixTree suffixTree, Function, Integer> matchEvaluator) { + public static Matches between(VariantGraph.Vertex[][] vertices, SuffixTree suffixTree, Function, Integer> matchEvaluator) { - final Multimap matchThreads = HashMultimap.create(); - for (int rank = 0; rank < vertices.length; rank++) { - for (VariantGraph.Vertex vertex : vertices[rank]) { - final MatchThreadElement matchThreadElement = new MatchThreadElement(suffixTree).advance(vertex, rank); - if (matchThreadElement != null) { - matchThreads.put(rank, matchThreadElement); - } - } - for (MatchThreadElement matchThreadElement : matchThreads.get(rank - 1)) { - for (VariantGraph.Vertex vertex : vertices[rank]) { - final MatchThreadElement advanced = matchThreadElement.advance(vertex, rank); - if (advanced != null) { - matchThreads.put(rank, advanced); - } + final Map> matchThreads = new HashMap<>(); + for (int rank = 0; rank < vertices.length; rank++) { + for (VariantGraph.Vertex vertex : vertices[rank]) { + final MatchThreadElement matchThreadElement = new MatchThreadElement(suffixTree).advance(vertex, rank); + if (matchThreadElement != null) { + matchThreads.computeIfAbsent(rank, r -> new LinkedList<>()).add(matchThreadElement); + } + } + for (MatchThreadElement matchThreadElement : matchThreads.getOrDefault(rank - 1, Collections.emptyList())) { + for (VariantGraph.Vertex vertex : vertices[rank]) { + final MatchThreadElement advanced = matchThreadElement.advance(vertex, rank); + if (advanced != null) { + matchThreads.computeIfAbsent(rank, r -> new LinkedList<>()).add(advanced); + } + } + } } - } - } - final Matches matches = new Matches(matchThreads.size()); - for (MatchThreadElement matchThreadElement : matchThreads.values()) { - final List> threadPhrases = Lists.newArrayList(); - boolean 
firstElement = true; - for (MatchThreadElement threadElement : matchThreadElement.thread()) { - final SuffixTree.EquivalenceClass equivalenceClass = threadElement.cursor.matchedClass(); - for (int mc = 0; mc < equivalenceClass.length; mc++) { - final int tokenCandidate = equivalenceClass.members[mc]; - if (firstElement) { - final SortedSet phrase = new TreeSet(); - phrase.add(new VertexMatch.WithTokenIndex(threadElement.vertex, threadElement.vertexRank, tokenCandidate)); - threadPhrases.add(phrase); - } else { - for (SortedSet phrase : threadPhrases) { - if ((phrase.last().token + 1) == tokenCandidate) { - phrase.add(new VertexMatch.WithTokenIndex(threadElement.vertex, threadElement.vertexRank, tokenCandidate)); - } + final Matches matches = new Matches(matchThreads.size()); + matchThreads.values().stream().flatMap(List::stream).forEach(matchThreadElement -> { + final List> threadPhrases = new ArrayList<>(); + boolean firstElement = true; + for (MatchThreadElement threadElement : matchThreadElement.thread()) { + final SuffixTree.EquivalenceClass equivalenceClass = threadElement.cursor.matchedClass(); + for (int mc = 0; mc < equivalenceClass.length; mc++) { + final int tokenCandidate = equivalenceClass.members[mc]; + if (firstElement) { + final SortedSet phrase = new TreeSet<>(); + phrase.add(new VertexMatch.WithTokenIndex(threadElement.vertex, threadElement.vertexRank, tokenCandidate)); + threadPhrases.add(phrase); + } else { + for (SortedSet phrase : threadPhrases) { + if ((phrase.last().token + 1) == tokenCandidate) { + phrase.add(new VertexMatch.WithTokenIndex(threadElement.vertex, threadElement.vertexRank, tokenCandidate)); + } + } + } + } + firstElement = false; } - } - } - firstElement = false; - } - matches.addAll(threadPhrases); + matches.addAll(threadPhrases); + }); + Collections.sort(matches, maximalUniqueMatchOrdering(matchEvaluator)); + + return matches; } - Collections.sort(matches, maximalUniqueMatchOrdering(matchEvaluator)); - - return matches; - } - 
- private static Comparator> maximalUniqueMatchOrdering(final Function, Integer> matchEvaluator) { - return new Comparator>() { - @Override - public int compare(SortedSet o1, SortedSet o2) { - // 1. reverse ordering by match value - int result = matchEvaluator.apply(o2) - matchEvaluator.apply(o1); - if (result != 0) { - return result; - } - final VertexMatch.WithTokenIndex firstMatch1 = o1.first(); - final VertexMatch.WithTokenIndex firstMatch2 = o2.first(); + private static Comparator> maximalUniqueMatchOrdering(final Function, Integer> matchEvaluator) { + return new Comparator>() { + @Override + public int compare(SortedSet o1, SortedSet o2) { + // 1. reverse ordering by match value + int result = matchEvaluator.apply(o2) - matchEvaluator.apply(o1); + if (result != 0) { + return result; + } - // 2. ordering by match distance - result = (Math.abs(firstMatch1.token - firstMatch1.vertexRank) - Math.abs(firstMatch2.token - firstMatch2.vertexRank)); - if (result != 0) { - return result; - } + final VertexMatch.WithTokenIndex firstMatch1 = o1.first(); + final VertexMatch.WithTokenIndex firstMatch2 = o2.first(); + // 2. ordering by match distance + result = (Math.abs(firstMatch1.token - firstMatch1.vertexRank) - Math.abs(firstMatch2.token - firstMatch2.vertexRank)); + if (result != 0) { + return result; + } - // 3. ordering by first vertex ranking - result = firstMatch1.vertexRank - firstMatch2.vertexRank; - if (result != 0) { - return result; - } - // 3. ordering by first token index - return firstMatch1.token - firstMatch2.token; + // 3. ordering by first vertex ranking + result = firstMatch1.vertexRank - firstMatch2.vertexRank; + if (result != 0) { + return result; + } - } - }; - } + // 3. 
ordering by first token index + return firstMatch1.token - firstMatch2.token; - public SortedSet> findMaximalUniqueMatches() { - final List> allMatches = Lists.newArrayList(this); - final SortedSet> maximalUniqueMatches = Sets.newTreeSet(VertexMatch.setComparator()); + } + }; + } - while (true) { - SortedSet nextMum = null; - SortedSet candidate = null; - for (SortedSet successor : allMatches) { - if (candidate == null) { - continue; - } - if (candidate.size() > successor.size() || candidate.first().token == successor.first().token) { - nextMum = candidate; - break; + public SortedSet> findMaximalUniqueMatches() { + final List> allMatches = new ArrayList<>(this); + final SortedSet> maximalUniqueMatches = new TreeSet<>(VertexMatch.setComparator()); + + while (true) { + SortedSet nextMum = null; + SortedSet candidate = null; + for (SortedSet successor : allMatches) { + if (candidate == null) { + continue; + } + if (candidate.size() > successor.size() || candidate.first().token == successor.first().token) { + nextMum = candidate; + break; + } + candidate = successor; + } + if (nextMum == null) { + nextMum = allMatches.stream().findFirst().orElse(null); + } + if (nextMum == null) { + break; + } + if (!maximalUniqueMatches.add(nextMum)) { + throw new IllegalStateException("Duplicate MUM"); + } + + final BitSet rankFilter = new BitSet(); + final BitSet tokenFilter = new BitSet(); + + rankFilter.set(nextMum.first().vertexRank, nextMum.last().vertexRank + 1); + tokenFilter.set(nextMum.first().token, nextMum.last().token + 1); + + allMatches.removeIf(VertexMatch.filter(rankFilter, tokenFilter)); } - candidate = successor; - } - if (nextMum == null) { - nextMum = Iterables.getFirst(allMatches, null); - } - if (nextMum == null) { - break; - } - Preconditions.checkState(maximalUniqueMatches.add(nextMum), "Duplicate MUM"); - - Iterables.removeIf(allMatches, VertexMatch.filter( - new IntegerRangeSet(Range.closed(nextMum.first().vertexRank, nextMum.last().vertexRank)), - new 
IntegerRangeSet(Range.closed(nextMum.first().token, nextMum.last().token)) - )); + return maximalUniqueMatches; } - return maximalUniqueMatches; - } - /** - * @author Gregor Middell - */ - static class MatchThreadElement { + /** + * @author Gregor Middell + */ + static class MatchThreadElement { - final MatchThreadElement previous; - final VariantGraph.Vertex vertex; - final int vertexRank; - final SuffixTree.Cursor cursor; + final MatchThreadElement previous; + final VariantGraph.Vertex vertex; + final int vertexRank; + final SuffixTree.Cursor cursor; - MatchThreadElement(SuffixTree suffixTree) { - this(null, null, -1, suffixTree.cursor()); - } + MatchThreadElement(SuffixTree suffixTree) { + this(null, null, -1, suffixTree.cursor()); + } - MatchThreadElement(MatchThreadElement previous, VariantGraph.Vertex vertex, int vertexRank, SuffixTree.Cursor cursor) { - this.previous = previous; - this.vertex = vertex; - this.vertexRank = vertexRank; - this.cursor = cursor; - } + MatchThreadElement(MatchThreadElement previous, VariantGraph.Vertex vertex, int vertexRank, SuffixTree.Cursor cursor) { + this.previous = previous; + this.vertex = vertex; + this.vertexRank = vertexRank; + this.cursor = cursor; + } - MatchThreadElement advance(VariantGraph.Vertex vertex, int vertexRank) { - final Set tokens = vertex.tokens(); - if (!tokens.isEmpty()) { - final SuffixTree.Cursor next = cursor.move(Iterables.get(tokens, 0)); - if (next != null) { - return new MatchThreadElement(this, vertex, vertexRank, next); + MatchThreadElement advance(VariantGraph.Vertex vertex, int vertexRank) { + final Set tokens = vertex.tokens(); + if (!tokens.isEmpty()) { + final SuffixTree.Cursor next = cursor.move(tokens.stream().findFirst().get()); + if (next != null) { + return new MatchThreadElement(this, vertex, vertexRank, next); + } + } + return null; } - } - return null; - } - List thread() { - final LinkedList thread = Lists.newLinkedList(); - MatchThreadElement current = this; - while 
(current.vertex != null) { - thread.addFirst(current); - current = current.previous; - } - return thread; - } + List thread() { + final LinkedList thread = new LinkedList<>(); + MatchThreadElement current = this; + while (current.vertex != null) { + thread.addFirst(current); + current = current.previous; + } + return thread; + } - @Override - public String toString() { - return "[" + Joiner.on(", ").join(vertexRank, vertex, cursor.matchedClass()) + "]"; + @Override + public String toString() { + return "[" + Arrays.asList(vertexRank, vertex, cursor.matchedClass()).stream().map(Object::toString).collect(Collectors.joining(", ")) + "]"; + } } - } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/medite/MediteAlgorithm.java b/collatex-core/src/main/java/eu/interedition/collatex/medite/MediteAlgorithm.java index ad3ed02d8..bf5a02750 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/medite/MediteAlgorithm.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/medite/MediteAlgorithm.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -19,86 +19,85 @@ package eu.interedition.collatex.medite; -import com.google.common.base.Function; -import com.google.common.collect.Iterables; -import com.google.common.collect.Range; -import com.google.common.collect.Sets; import eu.interedition.collatex.CollationAlgorithm; import eu.interedition.collatex.Token; import eu.interedition.collatex.VariantGraph; -import eu.interedition.collatex.util.IntegerRangeSet; import eu.interedition.collatex.util.VariantGraphRanking; import eu.interedition.collatex.util.VertexMatch; +import java.util.BitSet; import java.util.Comparator; import java.util.SortedSet; import java.util.TreeSet; +import java.util.function.Function; +import java.util.stream.IntStream; +import java.util.stream.StreamSupport; /** - * @author Gregor Middell + * @author Gregor Middell */ public class MediteAlgorithm extends CollationAlgorithm.Base { - private final Comparator comparator; - private final Function, Integer> matchEvaluator; + private final Comparator comparator; + private final Function, Integer> matchEvaluator; - public MediteAlgorithm(Comparator comparator, Function, Integer> matchEvaluator) { - this.comparator = comparator; - this.matchEvaluator = matchEvaluator; - } + public MediteAlgorithm(Comparator comparator, Function, Integer> matchEvaluator) { + this.comparator = comparator; + this.matchEvaluator = matchEvaluator; + } - @Override - public void collate(VariantGraph graph, Iterable witness) { - final VariantGraph.Vertex[][] vertices = VariantGraphRanking.of(graph).asArray(); - final Token[] tokens = Iterables.toArray(witness, Token.class); + @Override + public void collate(VariantGraph graph, Iterable witness) { + final VariantGraph.Vertex[][] vertices = VariantGraphRanking.of(graph).asArray(); + final Token[] tokens = StreamSupport.stream(witness.spliterator(), false).toArray(Token[]::new); - final SuffixTree suffixTree = SuffixTree.build(comparator, tokens); - final MatchEvaluatorWrapper matchEvaluator = new 
MatchEvaluatorWrapper(this.matchEvaluator, tokens); + final SuffixTree suffixTree = SuffixTree.build(comparator, tokens); + final MatchEvaluatorWrapper matchEvaluator = new MatchEvaluatorWrapper(this.matchEvaluator, tokens); - final Matches matchCandidates = Matches.between(vertices, suffixTree, matchEvaluator); - final SortedSet> matches = Sets.newTreeSet(VertexMatch.setComparator()); + final Matches matchCandidates = Matches.between(vertices, suffixTree, matchEvaluator); + final SortedSet> matches = new TreeSet<>(VertexMatch.setComparator()); - while (true) { - final SortedSet> maximalUniqueMatches = matchCandidates.findMaximalUniqueMatches(); - if (maximalUniqueMatches.isEmpty()) { - break; - } + while (true) { + final SortedSet> maximalUniqueMatches = matchCandidates.findMaximalUniqueMatches(); + if (maximalUniqueMatches.isEmpty()) { + break; + } - final IntegerRangeSet rankFilter = new IntegerRangeSet(); - final IntegerRangeSet tokenFilter = new IntegerRangeSet(); + final BitSet rankFilter = new BitSet(); + final BitSet tokenFilter = new BitSet(); - for (SortedSet phrase : AlignmentDecisionGraph.filter(maximalUniqueMatches, matchEvaluator)) { - final VertexMatch.WithTokenIndex firstMatch = phrase.first(); - final VertexMatch.WithTokenIndex lastMatch = phrase.last(); + for (SortedSet phrase : AlignmentDecisionGraph.filter(maximalUniqueMatches, matchEvaluator)) { + final VertexMatch.WithTokenIndex firstMatch = phrase.first(); + final VertexMatch.WithTokenIndex lastMatch = phrase.last(); - matches.add(phrase); - rankFilter.add(Range.closed(firstMatch.vertexRank, lastMatch.vertexRank)); - tokenFilter.add(Range.closed(firstMatch.token, lastMatch.token)); - } + matches.add(phrase); + IntStream.range(firstMatch.vertexRank, lastMatch.vertexRank + 1).forEach(rankFilter::set); + IntStream.range(firstMatch.token, lastMatch.token + 1).forEach(tokenFilter::set); + } - Iterables.removeIf(matchCandidates, VertexMatch.filter(rankFilter, tokenFilter)); - } + 
matchCandidates.removeIf(VertexMatch.filter(rankFilter, tokenFilter)); + } - merge(graph, vertices, tokens, matches); - } + merge(graph, vertices, tokens, matches); + } - static class MatchEvaluatorWrapper implements Function, Integer> { + static class MatchEvaluatorWrapper implements Function, Integer> { - private final Function, Integer> wrapped; - private final Function tokenResolver; + private final Function, Integer> wrapped; + private final Function tokenResolver; - MatchEvaluatorWrapper(final Function, Integer> wrapped, final Token[] tokens) { - this.wrapped = wrapped; - this.tokenResolver = VertexMatch.tokenResolver(tokens); - } + MatchEvaluatorWrapper(final Function, Integer> wrapped, final Token[] tokens) { + this.wrapped = wrapped; + this.tokenResolver = VertexMatch.tokenResolver(tokens); + } - @Override - public Integer apply(SortedSet input) { - final SortedSet tokenPhrase = new TreeSet(); - for (VertexMatch.WithTokenIndex match : input) { - tokenPhrase.add(tokenResolver.apply(match)); - } - return wrapped.apply(tokenPhrase); + @Override + public Integer apply(SortedSet input) { + final SortedSet tokenPhrase = new TreeSet<>(); + for (VertexMatch.WithTokenIndex match : input) { + tokenPhrase.add(tokenResolver.apply(match)); + } + return wrapped.apply(tokenPhrase); + } } - } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/medite/SuffixTree.java b/collatex-core/src/main/java/eu/interedition/collatex/medite/SuffixTree.java index 7c560daa4..df1d6096b 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/medite/SuffixTree.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/medite/SuffixTree.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -19,12 +19,6 @@ package eu.interedition.collatex.medite; -import com.google.common.base.Joiner; -import com.google.common.base.Strings; -import com.google.common.collect.AbstractIterator; -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; - import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; @@ -34,256 +28,249 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.IntStream; /** - * @author Gregor Middell + * @author Gregor Middell */ class SuffixTree { - final Comparator comparator; - final Comparator sourceComparator; - final T[] source; - final Node root; - - static SuffixTree build(Comparator comparator, T... source) { - return new SuffixTree(comparator, source).build(); - } - - private SuffixTree(Comparator comparator, T... source) { - this.comparator = comparator; - this.sourceComparator = new SentinelAwareComparator(comparator); - this.source = source; - this.root = new Node(); - } - - public Cursor cursor() { - return new Cursor(); - } - - public Iterable match(final Iterable str) { - return new Iterable() { - @Override - public Iterator iterator() { - return new AbstractIterator() { - - Cursor cursor = cursor(); - final Iterator it = str.iterator(); - - @Override - protected EquivalenceClass computeNext() { - if (it.hasNext()) { - cursor = cursor.move(it.next()); - return (cursor == null ? endOfData() : cursor.matchedClass()); - } - return endOfData(); - } - }; - } - }; - } - + final Comparator comparator; + final Comparator sourceComparator; + final T[] source; + final Node root; - private SuffixTree build() { - for (int suffixStart = 0; suffixStart <= source.length; suffixStart++) { - root.addSuffix(suffixStart); + @SafeVarargs + static SuffixTree build(Comparator comparator, T... 
source) { + return new SuffixTree<>(comparator, source).build(); } - compactNodes(root); - return this; - } - - private void compactNodes(Node node) { - for (Node child : node.children) { - while (child.children.size() == 1) { - final Node firstGrandChild = child.children.iterator().next(); - child.incomingLabel.add(firstGrandChild.incomingLabel.getFirst()); - child.children = firstGrandChild.children; - for (Node formerGrandchild : child.children) { - formerGrandchild.parent = child; - } - } - compactNodes(child); + + @SafeVarargs + private SuffixTree(Comparator comparator, T... source) { + this.comparator = comparator; + this.sourceComparator = new SentinelAwareComparator(comparator); + this.source = source; + this.root = new Node(); } - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder(); - final Deque nodes = new ArrayDeque(Collections.singleton(root)); - while (!nodes.isEmpty()) { - final Node node = nodes.remove(); - sb.append(Strings.repeat("\t", node.depth())).append(node).append("\n"); - for (Node child : node.children) { - nodes.addFirst(child); - } + + public Cursor cursor() { + return new Cursor(); } - return sb.toString(); - } - /** - * @author Gregor Middell - */ - class Node { + public Iterable match(final Iterable str) { + return () -> new Iterator() { - final LinkedList incomingLabel; + final Iterator it = str.iterator(); + Optional cursor = Optional.ofNullable(it.hasNext() ? cursor().move(it.next()) : null); - Node parent; - List children = new ArrayList(); + @Override + public boolean hasNext() { + return cursor.isPresent(); + } - public Node(Node parent, int firstIndex) { - this.parent = parent; - this.incomingLabel = Lists.newLinkedList(Collections.singleton(new EquivalenceClass(firstIndex))); - } + @Override + public EquivalenceClass next() { + final EquivalenceClass next = cursor.get().matchedClass(); + cursor = Optional.ofNullable(it.hasNext() ? 
cursor.get().move(it.next()) : null); + return next; + } - public Node() { - this.parent = null; - this.incomingLabel = null; + }; } - public int depth() { - int depth = 0; - for (Node parent = this.parent; parent != null; parent = parent.parent) { - depth++; - } - return depth; - } - - public void addSuffix(int start) { - addSuffix(this, start); + private SuffixTree build() { + for (int suffixStart = 0; suffixStart <= source.length; suffixStart++) { + root.addSuffix(suffixStart); + } + compactNodes(root); + return this; } - private Node addSuffix(Node node, int start) { - for (Node child : node.children) { - EquivalenceClass childClass = child.incomingLabel.getFirst(); - if (childClass.isMember(start)) { - childClass.add(start); - start++; - if (start == (source.length + 1)) { - return child; - } - return addSuffix(child, start); + private void compactNodes(Node node) { + for (Node child : node.children) { + while (child.children.size() == 1) { + final Node firstGrandChild = child.children.iterator().next(); + child.incomingLabel.add(firstGrandChild.incomingLabel.getFirst()); + child.children = firstGrandChild.children; + for (Node formerGrandchild : child.children) { + formerGrandchild.parent = child; + } + } + compactNodes(child); } - } - while (start <= source.length) { - Node child = new Node(node, start); - node.children.add(child); - node = child; - start++; - } - return node; } @Override public String toString() { - return Iterables.toString(incomingLabel == null ? 
Collections.emptySet() : incomingLabel); + final StringBuilder sb = new StringBuilder(); + final Deque nodes = new ArrayDeque<>(Collections.singleton(root)); + while (!nodes.isEmpty()) { + final Node node = nodes.remove(); + sb.append(IntStream.range(0, node.depth()).mapToObj(i -> "\t").collect(Collectors.joining())).append(node).append("\n"); + node.children.forEach(nodes::addFirst); + } + return sb.toString(); } - } - class EquivalenceClass implements Comparable { + /** + * @author Gregor Middell + */ + class Node { - int[] members = new int[2]; - int length = 0; + final LinkedList incomingLabel; - EquivalenceClass(int first) { - members[length++] = first; - } + Node parent; + List children = new ArrayList<>(); - void add(int member) { - if (length == members.length) { - members = Arrays.copyOf(members, members.length * 2); - } - members[length++] = member; - } + public Node(Node parent, int firstIndex) { + this.parent = parent; + this.incomingLabel = new LinkedList<>(Collections.singleton(new EquivalenceClass(firstIndex))); + } - boolean isMember(int index) { - return sourceComparator.compare(index, members[0]) == 0; - } + public Node() { + this.parent = null; + this.incomingLabel = null; + } - public boolean isMember(T symbol) { - return (members[0] == source.length ? 
false : comparator.compare(symbol, source[members[0]]) == 0); - } - @Override - public boolean equals(Object obj) { - if (obj != null && obj instanceof SuffixTree.EquivalenceClass) { - return members[0] == ((SuffixTree.EquivalenceClass)obj).members[0]; - } - return super.equals(obj); - } + public int depth() { + int depth = 0; + for (Node parent = this.parent; parent != null; parent = parent.parent) { + depth++; + } + return depth; + } - @Override - public int hashCode() { - return members[0]; - } + public void addSuffix(int start) { + addSuffix(this, start); + } - @Override - public int compareTo(EquivalenceClass o) { - return (members[0] - o.members[0]); - } + private Node addSuffix(Node node, int start) { + for (Node child : node.children) { + EquivalenceClass childClass = child.incomingLabel.getFirst(); + if (childClass.isMember(start)) { + childClass.add(start); + start++; + if (start == (source.length + 1)) { + return child; + } + return addSuffix(child, start); + } + } + while (start <= source.length) { + Node child = new Node(node, start); + node.children.add(child); + node = child; + start++; + } + return node; + } - @Override - public String toString() { - return "{" + Joiner.on(", ").join(new AbstractIterator() { - private int mc = 0; @Override - protected String computeNext() { - if (mc == length) { - return endOfData(); - } - - final int member = members[mc++]; - return "<[" + member + "] " + (member == source.length ? 
"$" : source[member].toString()) + ">"; + public String toString() { + return Optional.ofNullable(incomingLabel).map(label -> label.stream().map(Object::toString).collect(Collectors.joining(", "))).orElse(""); } - }) + "}"; } - } + class EquivalenceClass implements Comparable { - class SentinelAwareComparator implements Comparator { + int[] members = new int[2]; + int length = 0; - final Comparator comparator; + EquivalenceClass(int first) { + members[length++] = first; + } - SentinelAwareComparator(Comparator comparator) { - this.comparator = comparator; - } + void add(int member) { + if (length == members.length) { + members = Arrays.copyOf(members, members.length * 2); + } + members[length++] = member; + } - @Override - public int compare(Integer o1, Integer o2) { - if (o1 == source.length || o2 == source.length) { - return (o2 - o1); - } - return comparator.compare(source[o1], source[o2]); - } - } + boolean isMember(int index) { + return sourceComparator.compare(index, members[0]) == 0; + } - public class Cursor { - final Node node; - final int offset; + public boolean isMember(T symbol) { + return (members[0] != source.length && comparator.compare(symbol, source[members[0]]) == 0); + } - Cursor() { - this(root, 0); - } + @Override + public boolean equals(Object obj) { + if (obj != null && obj instanceof SuffixTree.EquivalenceClass) { + return members[0] == ((SuffixTree.EquivalenceClass) obj).members[0]; + } + return super.equals(obj); + } + + @Override + public int hashCode() { + return members[0]; + } + + @Override + public int compareTo(EquivalenceClass o) { + return (members[0] - o.members[0]); + } + + @Override + public String toString() { + return String.format("{%s}", Arrays.stream(members, 0, length) + .mapToObj(member -> "<[" + member + "] " + (member == source.length ? 
"$" : source[member].toString()) + ">") + .collect(Collectors.joining(", "))); + } - Cursor(Node node, int offset) { - this.node = node; - this.offset = offset; } - public Cursor move(T symbol) { - if (node.incomingLabel == null || (offset + 1) == node.incomingLabel.size()) { - for (Node child : node.children) { - final Cursor next = new Cursor(child, 0); - if (next.matchedClass().isMember(symbol)) { - return next; - } + class SentinelAwareComparator implements Comparator { + + final Comparator comparator; + + SentinelAwareComparator(Comparator comparator) { + this.comparator = comparator; + } + + @Override + public int compare(Integer o1, Integer o2) { + if (o1 == source.length || o2 == source.length) { + return (o2 - o1); + } + return comparator.compare(source[o1], source[o2]); } - return null; - } - return (node.incomingLabel.get(offset + 1).isMember(symbol) ? new Cursor(node, offset + 1) : null); } - EquivalenceClass matchedClass() { - return node.incomingLabel.get(offset); + public class Cursor { + final Node node; + final int offset; + + Cursor() { + this(root, 0); + } + + Cursor(Node node, int offset) { + this.node = node; + this.offset = offset; + } + + public Cursor move(T symbol) { + if (node.incomingLabel == null || (offset + 1) == node.incomingLabel.size()) { + for (Node child : node.children) { + final Cursor next = new Cursor(child, 0); + if (next.matchedClass().isMember(symbol)) { + return next; + } + } + return null; + } + return (node.incomingLabel.get(offset + 1).isMember(symbol) ? 
new Cursor(node, offset + 1) : null); + } + + EquivalenceClass matchedClass() { + return node.incomingLabel.get(offset); + } } - } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/medite/package-info.java b/collatex-core/src/main/java/eu/interedition/collatex/medite/package-info.java index 12379cd47..ff28965f0 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/medite/package-info.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/medite/package-info.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -19,7 +19,7 @@ /** * Implementation of a collation algorithm inspired by the work of Jean-Gabriel Ganascia and Julien Bourdaillet - * on MEDITE. + * on MEDITE. * * @see eu.interedition.collatex.medite.MediteAlgorithm * diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunsch/NeedlemanWunschAlgorithm.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunsch/NeedlemanWunschAlgorithm.java index e5aaab797..cac6315a6 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunsch/NeedlemanWunschAlgorithm.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunsch/NeedlemanWunschAlgorithm.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -19,118 +19,113 @@ package eu.interedition.collatex.needlemanwunsch; -import com.google.common.collect.Iterables; -import com.google.common.collect.Maps; import eu.interedition.collatex.CollationAlgorithm; import eu.interedition.collatex.Token; import eu.interedition.collatex.VariantGraph; import eu.interedition.collatex.util.VariantGraphRanking; +import java.util.Arrays; import java.util.Comparator; +import java.util.HashMap; import java.util.Map; import java.util.Set; +import java.util.stream.StreamSupport; /** - * @author Gregor Middell + * @author Gregor Middell */ public class NeedlemanWunschAlgorithm extends CollationAlgorithm.Base { - private final Comparator comparator; - private final NeedlemanWunschScorer scorer = new NeedlemanWunschScorer() { + private final Comparator comparator; + private final NeedlemanWunschScorer scorer = new NeedlemanWunschScorer() { - @Override - public float score(VariantGraph.Vertex[] a, Token b) { - for (VariantGraph.Vertex vertex : a) { - final Set tokens = vertex.tokens(); - if (!tokens.isEmpty() && comparator.compare(Iterables.getFirst(tokens, null), b) == 0) { - return 1; + @Override + public float score(VariantGraph.Vertex[] a, Token b) { + return Arrays.stream(a).map(VariantGraph.Vertex::tokens).flatMap(Set::stream).anyMatch(t -> comparator.compare(t, b) == 0) ? 
1 : -1; + } + + @Override + public float gap() { + return -1; } - } - return -1; + }; + + public NeedlemanWunschAlgorithm(Comparator comparator) { + this.comparator = comparator; } @Override - public float gap() { - return -1; - } - }; - - public NeedlemanWunschAlgorithm(Comparator comparator) { - this.comparator = comparator; - } - - @Override - public void collate(VariantGraph against, Iterable witness) { - final VariantGraph.Vertex[][] ranks = VariantGraphRanking.of(against).asArray(); - final Token[] tokens = Iterables.toArray(witness, Token.class); - - final Map alignments = Maps.newHashMap(); - for (Map.Entry alignment : align(ranks, tokens, scorer).entrySet()) { - boolean aligned = false; - final Token token = alignment.getValue(); - for (VariantGraph.Vertex vertex : alignment.getKey()) { - for (Token vertexToken : vertex.tokens()) { - if (comparator.compare(vertexToken, token) == 0) { - alignments.put(token, vertex); - aligned = true; - break; - } - } - if (aligned) { - break; + public void collate(VariantGraph against, Iterable witness) { + final VariantGraph.Vertex[][] ranks = VariantGraphRanking.of(against).asArray(); + final Token[] tokens = StreamSupport.stream(witness.spliterator(), false).toArray(Token[]::new); + + final Map alignments = new HashMap<>(); + for (Map.Entry alignment : align(ranks, tokens, scorer).entrySet()) { + boolean aligned = false; + final Token token = alignment.getValue(); + for (VariantGraph.Vertex vertex : alignment.getKey()) { + for (Token vertexToken : vertex.tokens()) { + if (comparator.compare(vertexToken, token) == 0) { + alignments.put(token, vertex); + aligned = true; + break; + } + } + if (aligned) { + break; + } + } } - } + + merge(against, witness, alignments); } - merge(against, witness, alignments); - } + public static Map align(A[] a, B[] b, NeedlemanWunschScorer scorer) { - public static Map align(A[] a, B[] b, NeedlemanWunschScorer scorer) { + final Map alignments = new HashMap<>(); + final float[][] matrix = 
new float[a.length + 1][b.length + 1]; - final Map alignments = Maps.newHashMap(); - final float[][] matrix = new float[a.length + 1][b.length + 1]; + int ac = 0; + int bc = 0; + while (ac < a.length) { + matrix[ac++][0] = scorer.gap() * ac; + } + while (bc < b.length) { + matrix[0][bc++] = scorer.gap() * bc; + } - int ac = 0; - int bc = 0; - while (ac < a.length) { - matrix[ac++][0] = scorer.gap() * ac; - } - while (bc < b.length) { - matrix[0][bc++] = scorer.gap() * bc; - } + ac = 1; + for (A aElement : a) { + bc = 1; + for (B bElement : b) { + final float k = matrix[ac - 1][bc - 1] + scorer.score(aElement, bElement); + final float l = matrix[ac - 1][bc] + scorer.gap(); + final float m = matrix[ac][bc - 1] + scorer.gap(); + matrix[ac][bc++] = Math.max(Math.max(k, l), m); + } + ac++; + } - ac = 1; - for (A aElement : a) { - bc = 1; - for (B bElement : b) { - final float k = matrix[ac - 1][bc - 1] + scorer.score(aElement, bElement); - final float l = matrix[ac - 1][bc] + scorer.gap(); - final float m = matrix[ac][bc - 1] + scorer.gap(); - matrix[ac][bc++] = Math.max(Math.max(k, l), m); - } - ac++; - } + ac = a.length; + bc = b.length; + while (ac > 0 && bc > 0) { + final float score = matrix[ac][bc]; + final float scoreDiag = matrix[ac - 1][bc - 1]; + final float scoreUp = matrix[ac][bc - 1]; + final float scoreLeft = matrix[ac - 1][bc]; + + if (score == scoreDiag + scorer.score(a[ac - 1], b[bc - 1])) { + // match + alignments.put(a[ac - 1], b[bc - 1]); + ac--; + bc--; + } else if (score == scoreLeft + scorer.gap()) { + ac--; + } else if (score == scoreUp + scorer.gap()) { + bc--; + } + } - ac = a.length; - bc = b.length; - while (ac > 0 && bc > 0) { - final float score = matrix[ac][bc]; - final float scoreDiag = matrix[ac - 1][bc - 1]; - final float scoreUp = matrix[ac][bc - 1]; - final float scoreLeft = matrix[ac - 1][bc]; - - if (score == scoreDiag + scorer.score(a[ac - 1], b[bc - 1])) { - // match - alignments.put(a[ac - 1], b[bc - 1]); - ac--; - bc--; - } else 
if (score == scoreLeft + scorer.gap()) { - ac--; - } else if (score == scoreUp + scorer.gap()) { - bc--; - } + return alignments; } - - return alignments; - } } \ No newline at end of file diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunsch/NeedlemanWunschScorer.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunsch/NeedlemanWunschScorer.java index 3f81ed75d..a93e3b08f 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunsch/NeedlemanWunschScorer.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunsch/NeedlemanWunschScorer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -20,11 +20,11 @@ package eu.interedition.collatex.needlemanwunsch; /** - * @author Gregor Middell + * @author Gregor Middell */ public interface NeedlemanWunschScorer { - float score(A a, B b); + float score(A a, B b); - float gap(); + float gap(); } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunsch/package-info.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunsch/package-info.java index 0f5af1587..a2f103157 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunsch/package-info.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunsch/package-info.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -18,10 +18,9 @@ */ /** - * A version of the Needleman-Wunsch - * algorithm. - *

- * This algorithm strives for global alignment of witnesses and bases the alignment on a configurable scoring of matches vs. differences/gaps. + * A version of the Needleman-Wunsch algorithm. + * + * This algorithm strives for global alignment of witnesses and bases the alignment on a configurable scoring of matches vs. differences/gaps. * It does not try to detect transpositions. * * @see eu.interedition.collatex.needlemanwunsch.NeedlemanWunschAlgorithm diff --git a/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jGraphRelationships.java b/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jGraphRelationships.java deleted file mode 100644 index 5685fee9d..000000000 --- a/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jGraphRelationships.java +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . 
- */ - -package eu.interedition.collatex.neo4j; - -import org.neo4j.graphdb.RelationshipType; - -/** - * @author Gregor Middell - */ -public enum Neo4jGraphRelationships implements RelationshipType { - PATH, TRANSPOSITION; -} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jVariantGraph.java b/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jVariantGraph.java deleted file mode 100644 index b08b74db7..000000000 --- a/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jVariantGraph.java +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . 
- */ - -package eu.interedition.collatex.neo4j; - -import com.google.common.base.Function; -import com.google.common.base.Preconditions; -import com.google.common.collect.Iterables; -import com.google.common.collect.Sets; -import eu.interedition.collatex.Token; -import eu.interedition.collatex.VariantGraph; -import eu.interedition.collatex.Witness; -import eu.interedition.collatex.util.VariantGraphTraversal; -import org.neo4j.graphdb.GraphDatabaseService; -import org.neo4j.graphdb.Node; -import org.neo4j.graphdb.Relationship; - -import java.util.Collections; -import java.util.Set; -import java.util.logging.Level; -import java.util.logging.Logger; - -import static eu.interedition.collatex.neo4j.Neo4jGraphRelationships.PATH; -import static java.util.Collections.singleton; - -/** - * @author Gregor Middell - */ -public class Neo4jVariantGraph implements VariantGraph { - private static final Logger LOG = Logger.getLogger(Neo4jVariantGraph.class.getName()); - - final GraphDatabaseService database; - final Neo4jVariantGraphAdapter adapter; - - final Neo4jVariantGraphVertex start; - final Neo4jVariantGraphVertex end; - - public Neo4jVariantGraph(GraphDatabaseService database, Neo4jVariantGraphAdapter adapter) { - this(database, database.createNode(), database.createNode(), adapter); - connect(start, end, Collections.emptySet()); - } - - public Neo4jVariantGraph(GraphDatabaseService database, Node start, Node end, Neo4jVariantGraphAdapter adapter) { - this.database = database; - this.adapter = adapter; - this.start = (Neo4jVariantGraphVertex) vertexWrapper.apply(start); - this.end = (Neo4jVariantGraphVertex) vertexWrapper.apply(end); - } - - @Override - public Vertex getStart() { - return start; - } - - @Override - public Vertex getEnd() { - return end; - } - - @Override - public Set transpositions() { - final Set transpositions = Sets.newHashSet(); - for (Vertex v : vertices()) { - Iterables.addAll(transpositions, v.transpositions()); - } - return transpositions; - } - - 
@Override - public Iterable vertices() { - return vertices(null); - } - - @Override - public Iterable vertices(final Set witnesses) { - return VariantGraphTraversal.of(this, witnesses); - } - - @Override - public Iterable edges() { - return edges(null); - } - - @Override - public Iterable edges(final Set witnesses) { - return VariantGraphTraversal.of(this, witnesses).edges(); - } - - @Override - public Neo4jVariantGraphVertex add(Token token) { - if (LOG.isLoggable(Level.FINER)) { - LOG.log(Level.FINER, "Creating new vertex with {0}", token); - } - return new Neo4jVariantGraphVertex(this, singleton(token)); - } - - @Override - public Edge connect(VariantGraph.Vertex from, VariantGraph.Vertex to, Set witnesses) { - Preconditions.checkArgument(!from.equals(to)); - - if (from.equals(start)) { - final Edge startEndEdge = edgeBetween(start, end); - if (startEndEdge != null) { - if (to.equals(end)) { - witnesses = Sets.newHashSet(witnesses); - witnesses.addAll(startEndEdge.witnesses()); - } - startEndEdge.delete(); - } - } - - for (Edge e : from.outgoing()) { - if (to.equals(e.to())) { - return e.add(witnesses); - } - } - return new Neo4jVariantGraphEdge(this, (Neo4jVariantGraphVertex) from, (Neo4jVariantGraphVertex) to, witnesses); - } - - @Override - public Edge register(Witness witness) { - return connect(start, end, Collections.singleton(witness)); - } - - @Override - public Transposition transpose(Set vertices) { - Preconditions.checkArgument(!vertices.isEmpty()); - for (Transposition transposition : vertices.iterator().next().transpositions()) { - if (Sets.newHashSet(transposition).equals(vertices)) { - return transposition; - } - } - return new Neo4jVariantGraphTransposition(this, vertices); - } - - @Override - public Edge edgeBetween(Vertex a, Vertex b) { - final Node aNode = ((Neo4jVariantGraphVertex)a).getNode(); - final Node bNode = ((Neo4jVariantGraphVertex)b).getNode(); - for (Relationship r : aNode.getRelationships(PATH)) { - if 
(r.getOtherNode(aNode).equals(bNode)) { - return new Neo4jVariantGraphEdge(this, r); - } - } - return null; - } - - @Override - public Set witnesses() { - final Set witnesses = Sets.newHashSet(); - for (Edge e : start.outgoing()) { - witnesses.addAll(e.witnesses()); - } - return witnesses; - } - - @Override - public boolean equals(Object obj) { - if (obj != null && obj instanceof Neo4jVariantGraph) { - return start.equals(((Neo4jVariantGraph) obj).start); - } - return super.equals(obj); - } - - @Override - public int hashCode() { - return start.hashCode(); - } - - @Override - public String toString() { - return Iterables.toString(witnesses()); - } - - final Function vertexWrapper = new Function() { - @Override - public VariantGraph.Vertex apply(Node input) { - return new Neo4jVariantGraphVertex(Neo4jVariantGraph.this, input); - } - }; - - final Function edgeWrapper = new Function() { - @Override - public VariantGraph.Edge apply(Relationship input) { - return new Neo4jVariantGraphEdge(Neo4jVariantGraph.this, input); - } - }; - - final Function transpositionWrapper = new Function() { - @Override - public VariantGraph.Transposition apply(Node input) { - return new Neo4jVariantGraphTransposition(Neo4jVariantGraph.this, input); - } - }; -} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jVariantGraphAdapter.java b/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jVariantGraphAdapter.java deleted file mode 100644 index 2daff43ca..000000000 --- a/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jVariantGraphAdapter.java +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. 
- * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . - */ - -package eu.interedition.collatex.neo4j; - -import eu.interedition.collatex.Token; -import eu.interedition.collatex.Witness; - -import java.util.Set; - -/** - * @author Gregor Middell - */ -public interface Neo4jVariantGraphAdapter { - - Set getTokens(Neo4jVariantGraphVertex vertex, Set witnesses); - - void setTokens(Neo4jVariantGraphVertex vertex, Set tokens); - - Set getWitnesses(Neo4jVariantGraphEdge edge); - - void setWitnesses(Neo4jVariantGraphEdge edge, Set witnesses); - -} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jVariantGraphEdge.java b/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jVariantGraphEdge.java deleted file mode 100644 index 8c03d0a3d..000000000 --- a/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jVariantGraphEdge.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . 
- */ - -package eu.interedition.collatex.neo4j; - -import com.google.common.base.Predicate; -import com.google.common.collect.Sets; -import eu.interedition.collatex.VariantGraph; -import eu.interedition.collatex.Witness; -import org.neo4j.graphdb.Relationship; - -import java.util.Set; - -/** - * @author Gregor Middell - */ -public class Neo4jVariantGraphEdge implements VariantGraph.Edge { - protected final Neo4jVariantGraph graph; - protected final Relationship relationship; - - public Neo4jVariantGraphEdge(Neo4jVariantGraph graph, Relationship relationship) { - this.graph = graph; - this.relationship = relationship; - } - - public Neo4jVariantGraphEdge(Neo4jVariantGraph graph, Neo4jVariantGraphVertex from, Neo4jVariantGraphVertex to, Set witnesses) { - this(graph, from.getNode().createRelationshipTo(to.getNode(), Neo4jGraphRelationships.PATH)); - graph.adapter.setWitnesses(this, witnesses); - } - - public boolean traversableWith(Set witnesses) { - if (witnesses == null || witnesses.isEmpty()) { - return true; - } - final Set edgeWitnesses = witnesses(); - for (Witness witness : witnesses) { - if (edgeWitnesses.contains(witness)) { - return true; - } - } - return false; - } - - @Override - public VariantGraph.Edge add(Set witnesses) { - graph.adapter.setWitnesses(this, Sets.union(witnesses(), witnesses)); - return this; - } - - @Override - public Set witnesses() { - return graph.adapter.getWitnesses(this); - } - - public static Predicate createTraversableFilter(final Set witnesses) { - return new Predicate() { - - @Override - public boolean apply(VariantGraph.Edge input) { - return ((Neo4jVariantGraphEdge) input).traversableWith(witnesses); - } - }; - } - - @Override - public VariantGraph graph() { - return graph; - } - - @Override - public VariantGraph.Vertex from() { - return graph.vertexWrapper.apply(relationship.getStartNode()); - } - - @Override - public VariantGraph.Vertex to() { - return graph.vertexWrapper.apply(relationship.getEndNode()); - } - - @Override 
- public void delete() { - relationship.delete(); - } - - @Override - public int hashCode() { - return relationship.hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (obj != null && obj instanceof VariantGraph.Edge) { - return relationship.equals(((Neo4jVariantGraphEdge) obj).relationship); - } - return super.equals(obj); - } - - @Override - public String toString() { - return new StringBuilder(from().toString()).append(" -> ").append(to().toString()).toString(); - } -} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jVariantGraphTransposition.java b/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jVariantGraphTransposition.java deleted file mode 100644 index 486fd4c85..000000000 --- a/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jVariantGraphTransposition.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . 
- */ - -package eu.interedition.collatex.neo4j; - -import com.google.common.base.Function; -import com.google.common.base.Preconditions; -import com.google.common.collect.Iterables; -import com.google.common.collect.Iterators; -import eu.interedition.collatex.VariantGraph; -import org.neo4j.graphdb.Node; -import org.neo4j.graphdb.Relationship; - -import com.google.common.base.Objects; - -import javax.annotation.Nullable; -import java.util.Iterator; -import java.util.Set; - -/** - * @author Gregor Middell - */ -public class Neo4jVariantGraphTransposition implements VariantGraph.Transposition { - - private final Neo4jVariantGraph graph; - private final Node node; - - public Neo4jVariantGraphTransposition(Neo4jVariantGraph graph, Node node) { - this.graph = graph; - this.node = node; - } - - public Neo4jVariantGraphTransposition(Neo4jVariantGraph graph, Set vertices) { - this(graph, graph.database.createNode()); - for (Neo4jVariantGraphVertex vertex : Iterables.filter(vertices, Neo4jVariantGraphVertex.class)) { - this.node.createRelationshipTo(vertex.node, Neo4jGraphRelationships.TRANSPOSITION); - } - } - - @Override - public Iterator iterator() { - return Iterators.transform(node.getRelationships(Neo4jGraphRelationships.TRANSPOSITION).iterator(), new Function() { - @Override - public VariantGraph.Vertex apply(@Nullable Relationship relationship) { - return graph.vertexWrapper.apply(relationship.getEndNode()); - } - }); - } - - @Override - public void delete() { - for (Relationship r : node.getRelationships(Neo4jGraphRelationships.TRANSPOSITION)) { - r.delete(); - } - node.delete(); - } - - @Override - public int hashCode() { - return node.hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (obj != null && obj instanceof Neo4jVariantGraphTransposition) { - return node.equals(((Neo4jVariantGraphTransposition) obj).node); - } - return super.equals(obj); - } - - @Override - public String toString() { - return 
Objects.toStringHelper(this).addValue(node).toString(); - } -} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jVariantGraphVertex.java b/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jVariantGraphVertex.java deleted file mode 100644 index 73a9f20b0..000000000 --- a/collatex-core/src/main/java/eu/interedition/collatex/neo4j/Neo4jVariantGraphVertex.java +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . 
- */ - -package eu.interedition.collatex.neo4j; - -import static com.google.common.collect.Iterables.*; -import static org.neo4j.graphdb.Direction.*; - -import java.util.Set; - -import javax.annotation.Nullable; - -import eu.interedition.collatex.VariantGraph; -import org.neo4j.graphdb.Node; -import org.neo4j.graphdb.Relationship; - -import com.google.common.base.Function; -import com.google.common.collect.Iterables; -import com.google.common.collect.Sets; - -import eu.interedition.collatex.Token; -import eu.interedition.collatex.Witness; - -/** - * @author Gregor Middell - */ -public class Neo4jVariantGraphVertex implements VariantGraph.Vertex { - protected final Neo4jVariantGraph graph; - protected final Node node; - - public Neo4jVariantGraphVertex(Neo4jVariantGraph graph, Node node) { - this.graph = graph; - this.node = node; - } - - public Neo4jVariantGraphVertex(Neo4jVariantGraph graph, Set tokens) { - this(graph, graph.database.createNode()); - setTokens(tokens); - } - - @Override - public Iterable incoming() { - return incoming(null); - } - - @Override - public Iterable incoming(Set witnesses) { - return Iterables.filter(transform(node.getRelationships(Neo4jGraphRelationships.PATH, INCOMING), graph.edgeWrapper), Neo4jVariantGraphEdge.createTraversableFilter(witnesses)); - } - - @Override - public Iterable outgoing() { - return outgoing(null); - } - - @Override - public Iterable outgoing(Set witnesses) { - return Iterables.filter(transform(node.getRelationships(Neo4jGraphRelationships.PATH, OUTGOING), graph.edgeWrapper), Neo4jVariantGraphEdge.createTraversableFilter(witnesses)); - } - - @Override - public Iterable transpositions() { - return transform(node.getRelationships(Neo4jGraphRelationships.TRANSPOSITION), new Function() { - @Override - public VariantGraph.Transposition apply(@Nullable Relationship relationship) { - return graph.transpositionWrapper.apply(relationship.getStartNode()); - } - }); - } - - @Override - public Set tokens() { - return 
tokens(null); - } - - @Override - public Set tokens(Set witnesses) { - return graph.adapter.getTokens(this, witnesses); - } - - @Override - public Set witnesses() { - final Set witnesses = Sets.newHashSet(); - for (Token token : tokens()) { - witnesses.add(token.getWitness()); - } - return witnesses; - } - - @Override - public void add(Iterable tokens) { - final Set tokenSet = Sets.newHashSet(tokens()); - Iterables.addAll(tokenSet, tokens); - setTokens(tokenSet); - } - - public void setTokens(Set tokens) { - graph.adapter.setTokens(this, tokens); - } - - @Override - public String toString() { - return Iterables.toString(tokens()); - } - - @Override - public VariantGraph graph() { - return graph; - } - - public Node getNode() { - return node; - } - - @Override - public void delete() { - node.delete(); - } - - @Override - public int hashCode() { - return node.hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (obj != null && obj instanceof Neo4jVariantGraphVertex) { - return node.equals(((Neo4jVariantGraphVertex) obj).node); - } - return super.equals(obj); - } -} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/neo4j/package-info.java b/collatex-core/src/main/java/eu/interedition/collatex/neo4j/package-info.java deleted file mode 100644 index d3ac49706..000000000 --- a/collatex-core/src/main/java/eu/interedition/collatex/neo4j/package-info.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. - * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . - */ - -/** - * Persistent implementation of variant graphs based on the Neo4j Graph - * Database. - * - */ -package eu.interedition.collatex.neo4j; \ No newline at end of file diff --git a/collatex-core/src/main/java/eu/interedition/collatex/package-info.java b/collatex-core/src/main/java/eu/interedition/collatex/package-info.java index 6f4ceb757..043a3d988 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/package-info.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/package-info.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * diff --git a/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleCollation.java b/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleCollation.java index fcfdd6b12..ed1da5ebe 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleCollation.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleCollation.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -26,39 +26,35 @@ public class SimpleCollation { - private final List witnesses; - private final CollationAlgorithm algorithm; - private final boolean joined; + private final List witnesses; + private final CollationAlgorithm algorithm; + private final boolean joined; - public SimpleCollation(List witnesses, CollationAlgorithm algorithm, boolean joined) { - this.witnesses = witnesses; - this.algorithm = algorithm; - this.joined = joined; - } - - public List getWitnesses() { - return witnesses; - } + public SimpleCollation(List witnesses, CollationAlgorithm algorithm, boolean joined) { + this.witnesses = witnesses; + this.algorithm = algorithm; + this.joined = joined; + } - public CollationAlgorithm getAlgorithm() { - return algorithm; - } + public List getWitnesses() { + return witnesses; + } - public boolean isJoined() { - return joined; - } + public CollationAlgorithm getAlgorithm() { + return algorithm; + } - public VariantGraph collate(VariantGraph graph) { - for (SimpleWitness witness : witnesses) { - if (witness.getTokens().isEmpty()) { - graph.register(witness); - } else { - algorithm.collate(graph, witness); - } + public boolean isJoined() { + return joined; } - if (joined) { - VariantGraph.JOIN.apply(graph); + + public VariantGraph collate(VariantGraph graph) { + for (SimpleWitness witness : witnesses) { + algorithm.collate(graph, witness); + } + if (joined) { + VariantGraph.JOIN.apply(graph); + } + return graph; } - return graph; - } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/simple/SimplePatternTokenizer.java b/collatex-core/src/main/java/eu/interedition/collatex/simple/SimplePatternTokenizer.java index f30e9aedd..20e1c63e3 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/simple/SimplePatternTokenizer.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/simple/SimplePatternTokenizer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. 
+ * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -19,47 +19,36 @@ package eu.interedition.collatex.simple; -import com.google.common.base.Function; -import com.google.common.collect.Lists; - -import javax.annotation.Nullable; +import java.util.LinkedList; import java.util.List; +import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Stream; /** - * @author Gregor Middell + * @author Gregor Middell * @author Ronald Haentjens Dekker */ -public class SimplePatternTokenizer implements Function> { - - private final Pattern pattern; - - public SimplePatternTokenizer(Pattern pattern) { - this.pattern = pattern; - } - - @Override - public Iterable apply(@Nullable String input) { - final Matcher matcher = pattern.matcher(input); - final List tokens = Lists.newLinkedList(); - while (matcher.find()) { - tokens.add(input.substring(matcher.start(), matcher.end())); +public class SimplePatternTokenizer { + + static final String PUNCT = Pattern.quote(".?!,;:"); + + static Function> tokenizer(Pattern pattern) { + return input -> { + final Matcher matcher = pattern.matcher(input); + final List tokens = new LinkedList<>(); + while (matcher.find()) { + tokens.add(input.substring(matcher.start(), matcher.end())); + } + return tokens.stream(); + }; } - return tokens; - } - public static final SimplePatternTokenizer BY_WHITESPACE = new SimplePatternTokenizer( - Pattern.compile("\\s*?\\S+\\s*]") - ); + public static final Function> BY_WHITESPACE = tokenizer(Pattern.compile("\\s*?\\S+\\s*]")); + + public static final Function> BY_WS_AND_PUNCT = tokenizer(Pattern.compile("[\\s" + PUNCT + "]*?[^\\s" + PUNCT + "]+[\\s" + PUNCT + "]*")); - static final String PUNCT = Pattern.quote(".?!,;:"); + public static final Function> BY_WS_OR_PUNCT = tokenizer(Pattern.compile("[" + PUNCT + "]+[\\s]*|[^" + PUNCT + "\\s]+[\\s]*")); - public static final SimplePatternTokenizer 
BY_WS_AND_PUNCT = new SimplePatternTokenizer( - Pattern.compile("[\\s" + PUNCT + "]*?[^\\s" + PUNCT + "]+[\\s" + PUNCT + "]*") - ); - - public static final SimplePatternTokenizer BY_WS_OR_PUNCT = new SimplePatternTokenizer( - Pattern.compile("[" + PUNCT + "]+[\\s]*|[^" + PUNCT + "\\s]+[\\s]*") - ); } \ No newline at end of file diff --git a/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleToken.java b/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleToken.java index cbb06bb61..2b9e53a43 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleToken.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleToken.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -19,65 +19,63 @@ package eu.interedition.collatex.simple; -import com.google.common.base.Function; -import com.google.common.collect.Iterables; import eu.interedition.collatex.Token; import eu.interedition.collatex.Witness; import eu.interedition.collatex.util.VertexMatch; -import javax.annotation.Nullable; import java.util.SortedSet; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; public class SimpleToken implements Token, Comparable { - private final SimpleWitness witness; - private final String content; - private final String normalized; + private final SimpleWitness witness; + private final String content; + private final String normalized; - public SimpleToken(SimpleWitness witness, String content, String normalized) { - this.witness = witness; - this.content = content; - this.normalized = normalized; - } - - public String getContent() { - return content; - } + public SimpleToken(SimpleWitness witness, String content, String normalized) { + this.witness = witness; + this.content = content; + this.normalized = normalized; + } - @Override 
- public Witness getWitness() { - return witness; - } + public String getContent() { + return content; + } - public String getNormalized() { - return normalized; - } + @Override + public Witness getWitness() { + return witness; + } - @Override - public String toString() { - return new StringBuilder(witness.toString()).append(":").append(witness.getTokens().indexOf(this)).append(":'").append(normalized).append("'").toString(); - } + public String getNormalized() { + return normalized; + } - public static String toString(Iterable tokens) { - final StringBuilder normalized = new StringBuilder(); - for (SimpleToken token : Iterables.filter(tokens, SimpleToken.class)) { - normalized.append(token.getContent()); + @Override + public String toString() { + return new StringBuilder(witness.toString()).append(":").append(witness.getTokens().indexOf(this)).append(":'").append(normalized).append("'").toString(); } - return normalized.toString().trim(); - } - @Override - public int compareTo(SimpleToken o) { - return witness.compare(this, o); - } + public static String toString(Iterable tokens) { + return StreamSupport.stream(tokens.spliterator(), false) + .filter(t -> SimpleToken.class.isAssignableFrom(t.getClass())) + .map(t -> (SimpleToken) t) + .map(SimpleToken::getContent) + .collect(Collectors.joining()) + .trim(); + } - public static final Function, Integer> TOKEN_MATCH_EVALUATOR = new Function, Integer>() { @Override - public Integer apply(@Nullable SortedSet input) { - int value = 0; - for (VertexMatch.WithToken match : input) { - value += ((SimpleToken) match.token).getContent().length(); - } - return value; + public int compareTo(SimpleToken o) { + return witness.compare(this, o); } - }; + + public static final Function, Integer> TOKEN_MATCH_EVALUATOR = input -> { + int value = 0; + for (VertexMatch.WithToken match : input) { + value += ((SimpleToken) match.token).getContent().length(); + } + return value; + }; } diff --git 
a/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleTokenNormalizers.java b/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleTokenNormalizers.java index 4e489b5be..081f59888 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleTokenNormalizers.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleTokenNormalizers.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -19,56 +19,45 @@ package eu.interedition.collatex.simple; -import com.google.common.base.Function; -import com.google.common.base.Functions; - -import javax.annotation.Nullable; +import java.util.function.Function; /** - * @author Gregor Middell + * @author Gregor Middell * @author Ronald Haentjens Dekker */ public class SimpleTokenNormalizers { - public static final Function LOWER_CASE = new Function() { - @Override - public String apply(@Nullable String input) { - return input.toLowerCase(); - } - }; - - public static final Function TRIM_WS = new Function() { - @Override - public String apply(@Nullable String input) { - return input.trim(); + public static final Function LOWER_CASE = String::toLowerCase; + + public static final Function TRIM_WS = String::trim; + + public static final Function TRIM_WS_PUNCT = input -> { + int start = 0; + int end = input.length() - 1; + while (start <= end && isWhitespaceOrPunctuation(input.charAt(start))) { + start++; + } + while (end >= start && isWhitespaceOrPunctuation(input.charAt(end))) { + end--; + } + return input.substring(start, end + 1); + }; + + public static boolean isWhitespaceOrPunctuation(char c) { + if (Character.isWhitespace(c)) { + return true; + } + switch (Character.getType(c)) { + case Character.START_PUNCTUATION: + case Character.END_PUNCTUATION: + case Character.OTHER_PUNCTUATION: + return true; + default: + return false; + } } - }; - - 
public static final Function TRIM_WS_PUNCT = new Function() { - @Override - public String apply(@Nullable String input) { - int start = 0; - int end = input.length() - 1; - while (start <= end && isWhitespaceOrPunctuation(input.charAt(start))) { - start++; - } - while (end >= start && isWhitespaceOrPunctuation(input.charAt(end))) { - end--; - } - return input.substring(start, end + 1); - } - - boolean isWhitespaceOrPunctuation(char c) { - if (Character.isWhitespace(c)) { - return true; - } - final int type = Character.getType(c); - return (Character.START_PUNCTUATION == type || Character.END_PUNCTUATION == type || Character.OTHER_PUNCTUATION == type); - } - }; + public static final Function LC_TRIM_WS_PUNCT = LOWER_CASE.andThen(TRIM_WS_PUNCT); - public static final Function LC_TRIM_WS_PUNCT = Functions.compose(LOWER_CASE, TRIM_WS_PUNCT); - - public static final Function LC_TRIM_WS = Functions.compose(LOWER_CASE, TRIM_WS); + public static final Function LC_TRIM_WS = LOWER_CASE.andThen(TRIM_WS); } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleVariantGraphSerializer.java b/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleVariantGraphSerializer.java index bc1ee5065..6828424d7 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleVariantGraphSerializer.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleVariantGraphSerializer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -19,17 +19,6 @@ package eu.interedition.collatex.simple; -import com.google.common.base.Function; -import com.google.common.base.Objects; -import com.google.common.base.Throwables; -import com.google.common.collect.Iterables; -import com.google.common.collect.LinkedHashMultimap; -import com.google.common.collect.Maps; -import com.google.common.collect.Ordering; -import com.google.common.collect.SetMultimap; -import com.google.common.collect.Sets; -import com.google.common.collect.SortedSetMultimap; -import com.google.common.collect.TreeMultimap; import eu.interedition.collatex.Token; import eu.interedition.collatex.VariantGraph; import eu.interedition.collatex.Witness; @@ -37,419 +26,420 @@ import eu.interedition.collatex.util.ParallelSegmentationApparatus; import eu.interedition.collatex.util.VariantGraphRanking; -import javax.annotation.Nullable; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamWriter; import java.io.IOException; import java.io.PrintWriter; import java.io.Writer; +import java.util.Arrays; import java.util.Collections; import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; -import java.util.List; +import java.util.LinkedHashMap; import java.util.Map; import java.util.Set; import java.util.SortedMap; +import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.function.Function; import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; /** - * @author Gregor Middell + * @author Gregor Middell */ public class SimpleVariantGraphSerializer { - /** - * CollateX custom namespace. - */ - protected static final String COLLATEX_NS = "http://interedition.eu/collatex/ns/1.0"; - - /** - * The TEI P5 namespace. 
- */ - protected static final String TEI_NS = "http://www.tei-c.org/ns/1.0"; - - private final VariantGraph graph; - private final Function, String> tokensToString; - private final Map vertexIds = Maps.newHashMap(); - private final Map transpositionIds = Maps.newHashMap(); - private VariantGraphRanking ranking; - - public SimpleVariantGraphSerializer(VariantGraph graph) { - this(graph, SIMPLE_TOKEN_TO_STRING); - } - - public SimpleVariantGraphSerializer(VariantGraph graph, Function, String> tokensToString) { - this.graph = graph; - this.tokensToString = tokensToString; - } - - public void toTEI(final XMLStreamWriter xml) throws XMLStreamException { - try { - ParallelSegmentationApparatus.generate(ranking(), new ParallelSegmentationApparatus.GeneratorCallback() { - @Override - public void start() { - try { - xml.writeStartElement("cx", "apparatus", COLLATEX_NS); - xml.writeNamespace("cx", COLLATEX_NS); - xml.writeNamespace("", TEI_NS); - } catch (XMLStreamException e) { - throw Throwables.propagate(e); - } - } + /** + * CollateX custom namespace. + */ + protected static final String COLLATEX_NS = "http://interedition.eu/collatex/ns/1.0"; + + /** + * The TEI P5 namespace. 
+ */ + protected static final String TEI_NS = "http://www.tei-c.org/ns/1.0"; + + private final VariantGraph graph; + private final Function, String> tokensToString; + private final Map vertexIds = new HashMap<>(); + private VariantGraphRanking ranking; + + public SimpleVariantGraphSerializer(VariantGraph graph) { + this(graph, SIMPLE_TOKEN_TO_STRING); + } - @Override - public void segment(SortedMap> contents) { - final SetMultimap segments = LinkedHashMultimap.create(); - for (Map.Entry> cell : contents.entrySet()) { - //NOTE: we don't want trailing whitespace before an end tag - segments.put(tokensToString.apply(cell.getValue()).trim(), cell.getKey()); - } - - final Set segmentContents = segments.keySet(); - try { - if (segmentContents.size() == 1) { - xml.writeCharacters(Iterables.getOnlyElement(segmentContents)); - } else { - xml.writeStartElement("", "app", TEI_NS); - for (String segment : segmentContents) { - final StringBuilder witnesses = new StringBuilder(); - for (Witness witness : segments.get(segment)) { - witnesses.append(witness.getSigil()).append(" "); - } - if (segment.length() == 0) { - xml.writeEmptyElement("", "rdg", TEI_NS); - } else { - xml.writeStartElement("", "rdg", TEI_NS); + public SimpleVariantGraphSerializer(VariantGraph graph, Function, String> tokensToString) { + this.graph = graph; + this.tokensToString = tokensToString; + } + + public void toTEI(final XMLStreamWriter xml) throws XMLStreamException { + try { + ParallelSegmentationApparatus.generate(ranking(), new ParallelSegmentationApparatus.GeneratorCallback() { + @Override + public void start() { + try { + xml.writeStartElement("cx", "apparatus", COLLATEX_NS); + xml.writeNamespace("cx", COLLATEX_NS); + xml.writeNamespace("", TEI_NS); + } catch (XMLStreamException e) { + throw new RuntimeException(e); + } } - xml.writeAttribute("wit", witnesses.toString().trim()); + @Override + public void segment(SortedMap> contents) { + final Map> segments = new LinkedHashMap<>(); + 
contents.forEach((witness, tokens) -> segments.computeIfAbsent(tokensToString.apply(tokens).trim(), k -> new HashSet<>()).add(witness)); + + final Set segmentContents = segments.keySet(); + try { + if (segmentContents.size() == 1) { + xml.writeCharacters(segmentContents.stream().findFirst().get()); + } else { + xml.writeStartElement("", "app", TEI_NS); + for (String segment : segmentContents) { + final StringBuilder witnesses = new StringBuilder(); + for (Witness witness : segments.get(segment)) { + witnesses.append(witness.getSigil()).append(" "); + } + if (segment.length() == 0) { + xml.writeEmptyElement("", "rdg", TEI_NS); + } else { + xml.writeStartElement("", "rdg", TEI_NS); + } + + xml.writeAttribute("wit", witnesses.toString().trim()); + + if (segment.length() > 0) { + xml.writeCharacters(segment); + xml.writeEndElement(); + } + } + xml.writeEndElement(); + } + } catch (XMLStreamException e) { + throw new RuntimeException(e); + } + } - if (segment.length() > 0) { - xml.writeCharacters(segment); - xml.writeEndElement(); + @Override + public void end() { + try { + xml.writeEndElement(); + } catch (XMLStreamException e) { + throw new RuntimeException(e); + } } - } - xml.writeEndElement(); + }); + } catch (RuntimeException re) { + Throwable rootCause = re; + for (Throwable cause = re; cause != null; cause = cause.getCause()) { + rootCause = cause; } - } catch (XMLStreamException e) { - throw Throwables.propagate(e); - } - } - - @Override - public void end() { - try { - xml.writeEndElement(); - } catch (XMLStreamException e) { - throw Throwables.propagate(e); - } + if (rootCause instanceof XMLStreamException) { + throw (XMLStreamException) rootCause; + } + throw re; } - }); - } catch (Throwable t) { - Throwables.propagateIfInstanceOf(Throwables.getRootCause(t), XMLStreamException.class); - throw Throwables.propagate(t); } - } - public void toCsv(final Writer out) throws IOException { - try { - ParallelSegmentationApparatus.generate(ranking(), new 
ParallelSegmentationApparatus.GeneratorCallback() { - @Override - public void start() { - try { - final List witnessList = Ordering.from(Witness.SIGIL_COMPARATOR).immutableSortedCopy(graph.witnesses()); - for (Iterator it = witnessList.iterator(); it.hasNext(); ) { - out.write(escapeCsvField(it.next().getSigil())); - if (it.hasNext()) { - out.write(","); - } - } - out.write("\r\n"); - } catch (IOException e) { - throw Throwables.propagate(e); - } - } + public void toCsv(final Writer out) throws IOException { + try { + ParallelSegmentationApparatus.generate(ranking(), new ParallelSegmentationApparatus.GeneratorCallback() { + @Override + public void start() { + try { + for (Iterator it = graph.witnesses().stream().sorted(Witness.SIGIL_COMPARATOR).iterator(); it.hasNext(); ) { + out.write(escapeCsvField(it.next().getSigil())); + if (it.hasNext()) { + out.write(","); + } + } + out.write("\r\n"); + } catch (IOException e) { + throw new RuntimeException(e); + } + } - @Override - public void segment(SortedMap> contents) { - try { - for (Iterator witnessIt = contents.keySet().iterator(); witnessIt.hasNext();) { - out.write(escapeCsvField(tokensToString.apply(Objects.firstNonNull(contents.get(witnessIt.next()), Collections.emptySet())))); - if (witnessIt.hasNext()) { - out.write(","); - } + @Override + public void segment(SortedMap> contents) { + try { + for (Iterator witnessIt = contents.keySet().iterator(); witnessIt.hasNext(); ) { + out.write(escapeCsvField(tokensToString.apply(contents.getOrDefault(witnessIt.next(), Collections.emptySet())))); + if (witnessIt.hasNext()) { + out.write(","); + } + } + out.write("\r\n"); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public void end() { + } + }); + } catch (Throwable t) { + for (Throwable cause = t; cause != null; cause = cause.getCause()) { + if (cause instanceof IOException) { + throw (IOException) cause; + } } - out.write("\r\n"); - } catch (IOException e) { - throw 
Throwables.propagate(e); - } + if (t instanceof RuntimeException) { + throw (RuntimeException) t; + } + throw new RuntimeException(t); } + } - @Override - public void end() { - } - }); - } catch (Throwable t) { - Throwables.propagateIfInstanceOf(Throwables.getRootCause(t), IOException.class); - throw Throwables.propagate(t); + static final Pattern CSV_SPECIAL_CHARS = Pattern.compile("[\r\n\",]"); + + static String escapeCsvField(String content) { + return (CSV_SPECIAL_CHARS.matcher(content).find() ? ("\"" + content.replaceAll("\"", "\"\"") + "\"") : content); } - } - static final Pattern CSV_SPECIAL_CHARS = Pattern.compile("[\r\n\",]"); + public void toDot(Writer writer) { + final PrintWriter out = new PrintWriter(writer); + final String indent = " "; + final String connector = " -> "; - static String escapeCsvField(String content) { - return (CSV_SPECIAL_CHARS.matcher(content).find() ? ("\"" + content.replaceAll("\"", "\"\"") + "\"") : content); - } + out.println("digraph G {"); - public void toDot(Writer writer) { - final PrintWriter out = new PrintWriter(writer); - final String indent = " "; - final String connector = " -> "; + for (VariantGraph.Vertex v : graph.vertices()) { + out.print(indent + id(v)); + out.print(" [label = \"" + toDotLabel(v) + "\"]"); + out.println(";"); + } - out.println("digraph G {"); + for (VariantGraph.Vertex v : graph.vertices()) { + for (Map.Entry> e : v.outgoing().entrySet()) { + out.print(indent + id(v) + connector + id(e.getKey())); + out.print(" [label = \"" + toDotLabel(e.getValue()) + "\"]"); + out.println(";"); + } + } - for (VariantGraph.Vertex v : graph.vertices()) { - out.print(indent + id(v)); - out.print(" [label = \"" + toDotLabel(v) + "\"]"); - out.println(";"); - } + for (Tuple transposedTuple : transposedTuples()) { + final String leftId = id(transposedTuple.left); + final String rightId = id(transposedTuple.right); + out.print(indent + leftId + connector + rightId); + out.print(" [ color = \"lightgray\", style = 
\"dashed\" arrowhead = \"none\", arrowtail = \"none\" ]"); + out.println(";"); + } - for (VariantGraph.Edge e : graph.edges()) { - out.print(indent + id(e.from()) + connector + id(e.to())); - out.print(" [label = \"" + toDotLabel(e) + "\"]"); + out.print(indent + id(graph.getStart()) + connector + id(graph.getEnd())); + out.print(" [color = \"white\"]"); out.println(";"); - } - for (Tuple transposedTuple : transposedTuples()) { - final String leftId = id(transposedTuple.left); - final String rightId = id(transposedTuple.right); - out.print(indent + leftId + connector + rightId); - out.print(" [ color = \"lightgray\", style = \"dashed\" arrowhead = \"none\", arrowtail = \"none\" ]"); - out.println(";"); - } + out.println("}"); - out.print(indent + id(graph.getStart()) + connector + id(graph.getEnd())); - out.print(" [color = \"white\"]"); - out.println(";"); + out.flush(); + } - out.println("}"); + private String id(VariantGraph.Vertex vertex) { + return ("v" + numericId(vertex)); + } - out.flush(); - } + private int numericId(VariantGraph.Vertex vertex) { + Integer id = vertexIds.get(vertex); + if (id == null) { + id = vertexIds.size(); + vertexIds.put(vertex, id); + } + return id; + } - private String id(VariantGraph.Vertex vertex) { - return ("v" + numericId(vertex)); - } + String toDotLabel(Set e) { + return escapeDotLabel(e.stream().map(Witness::getSigil).distinct().sorted().collect(Collectors.joining(", "))); + } - private int numericId(VariantGraph.Vertex vertex) { - Integer id = vertexIds.get(vertex); - if (id == null) { - id = vertexIds.size(); - vertexIds.put(vertex, id); + String toDotLabel(VariantGraph.Vertex v) { + return escapeDotLabel(vertexToString.apply(v)); } - return id; - } - - private String id(VariantGraph.Transposition transposition) { - Integer id = transpositionIds.get(transposition); - if (id == null) { - id = transpositionIds.size(); - transpositionIds.put(transposition, id); + + static String escapeDotLabel(String string) { + return 
string.replaceAll("\"", "\\\\\"").replaceAll("[\n\r]+", "\u00B6"); } - return ("t" + id); - } - String toDotLabel(VariantGraph.Edge e) { - return escapeDotLabel(Witness.TO_SIGILS.apply(e)); - } + VariantGraphRanking ranking() { + if (ranking == null) { + ranking = VariantGraphRanking.of(graph); + } + return ranking; + } - String toDotLabel(VariantGraph.Vertex v) { - return escapeDotLabel(vertexToString.apply(v)); - } + Set> transposedTuples() { + final Set> tuples = new HashSet<>(); + final Comparator vertexOrdering = ranking().comparator(); - static String escapeDotLabel(String string) { - return string.replaceAll("\"", "\\\\\"").replaceAll("[\n\r]+", "\u00B6"); - } + for (Set transposition : graph.transpositions()) { + final SortedMap> verticesByWitness = new TreeMap<>(Witness.SIGIL_COMPARATOR); + for (VariantGraph.Vertex vertex : transposition) { + for (Witness witness : vertex.witnesses()) { + verticesByWitness.computeIfAbsent(witness, w -> new TreeSet<>(vertexOrdering)).add(vertex); + } + } - VariantGraphRanking ranking() { - if (ranking == null) { - ranking = VariantGraphRanking.of(graph); - } - return ranking; - } - - Set> transposedTuples() { - final Set> tuples = Sets.newHashSet(); - final Ordering vertexOrdering = Ordering.from(ranking()).compound(new Comparator() { - @Override - public int compare(VariantGraph.Vertex o1, VariantGraph.Vertex o2) { - return Ordering.arbitrary().compare(o1, o2); - } - }); - - for (VariantGraph.Transposition transposition : graph.transpositions()) { - final SortedSetMultimap verticesByWitness = TreeMultimap.create(Witness.SIGIL_COMPARATOR, vertexOrdering); - for (VariantGraph.Vertex vertex : transposition) { - for (Witness witness : vertex.witnesses()) { - verticesByWitness.put(witness, vertex); - } - } - - Witness prev = null; - for (Witness witness : verticesByWitness.keySet()) { - if (prev != null) { - final Iterator prevIt = verticesByWitness.get(prev).iterator(); - final Iterator nextIt = 
verticesByWitness.get(witness).iterator(); - while (prevIt.hasNext() && nextIt.hasNext()) { - final VariantGraph.Vertex prevVertex = prevIt.next(); - final VariantGraph.Vertex nextVertex = nextIt.next(); - if (!prevVertex.equals(nextVertex)) { - tuples.add(new Tuple(prevVertex, nextVertex)); + Witness prev = null; + for (Witness witness : verticesByWitness.keySet()) { + if (prev != null) { + final Iterator prevIt = verticesByWitness.get(prev).iterator(); + final Iterator nextIt = verticesByWitness.get(witness).iterator(); + while (prevIt.hasNext() && nextIt.hasNext()) { + final VariantGraph.Vertex prevVertex = prevIt.next(); + final VariantGraph.Vertex nextVertex = nextIt.next(); + if (!prevVertex.equals(nextVertex)) { + tuples.add(new Tuple<>(prevVertex, nextVertex)); + } + } + } + prev = witness; } - } } - prev = witness; - } + + return tuples; } - return tuples; - } + public void toGraphML(XMLStreamWriter xml) throws XMLStreamException { + xml.writeStartElement("", GRAPHML_TAG, GRAPHML_NS); + xml.writeNamespace("", GRAPHML_NS); + xml.writeAttribute(XMLNSXSI_ATT, GRAPHML_XMLNSXSI); + xml.writeAttribute(XSISL_ATT, GRAPHML_XSISL); - public void toGraphML(XMLStreamWriter xml) throws XMLStreamException { - xml.writeStartElement("", GRAPHML_TAG, GRAPHML_NS); - xml.writeNamespace("", GRAPHML_NS); - xml.writeAttribute(XMLNSXSI_ATT, GRAPHML_XMLNSXSI); - xml.writeAttribute(XSISL_ATT, GRAPHML_XSISL); + for (GraphMLProperty p : GraphMLProperty.values()) { + p.declare(xml); + } - for (GraphMLProperty p : GraphMLProperty.values()) { - p.declare(xml); - } + xml.writeStartElement(GRAPHML_NS, GRAPH_TAG); + xml.writeAttribute(ID_ATT, GRAPH_ID); + xml.writeAttribute(EDGEDEFAULT_ATT, EDGEDEFAULT_DEFAULT_VALUE); + xml.writeAttribute(PARSENODEIDS_ATT, PARSENODEIDS_DEFAULT_VALUE); + xml.writeAttribute(PARSEEDGEIDS_ATT, PARSEEDGEIDS_DEFAULT_VALUE); + xml.writeAttribute(PARSEORDER_ATT, PARSEORDER_DEFAULT_VALUE); + + final VariantGraphRanking ranking = ranking(); + for 
(VariantGraph.Vertex vertex : graph.vertices()) { + final int id = numericId(vertex); + xml.writeStartElement(GRAPHML_NS, NODE_TAG); + xml.writeAttribute(ID_ATT, "n" + id); + GraphMLProperty.NODE_NUMBER.write(Integer.toString(id), xml); + GraphMLProperty.NODE_RANK.write(Integer.toString(ranking.apply(vertex)), xml); + GraphMLProperty.NODE_TOKEN.write(vertexToString.apply(vertex), xml); + xml.writeEndElement(); + } - xml.writeStartElement(GRAPHML_NS, GRAPH_TAG); - xml.writeAttribute(ID_ATT, GRAPH_ID); - xml.writeAttribute(EDGEDEFAULT_ATT, EDGEDEFAULT_DEFAULT_VALUE); - xml.writeAttribute(PARSENODEIDS_ATT, PARSENODEIDS_DEFAULT_VALUE); - xml.writeAttribute(PARSEEDGEIDS_ATT, PARSEEDGEIDS_DEFAULT_VALUE); - xml.writeAttribute(PARSEORDER_ATT, PARSEORDER_DEFAULT_VALUE); - - final VariantGraphRanking ranking = ranking(); - for (VariantGraph.Vertex vertex : graph.vertices()) { - final int id = numericId(vertex); - xml.writeStartElement(GRAPHML_NS, NODE_TAG); - xml.writeAttribute(ID_ATT, "n" + id); - GraphMLProperty.NODE_NUMBER.write(Integer.toString(id), xml); - GraphMLProperty.NODE_RANK.write(Integer.toString(ranking.apply(vertex)), xml); - GraphMLProperty.NODE_TOKEN.write(vertexToString.apply(vertex), xml); - xml.writeEndElement(); - } + int edgeNumber = 0; + for (VariantGraph.Vertex v : graph.vertices()) { + for (Map.Entry> edge : v.outgoing().entrySet()) { + xml.writeStartElement(GRAPHML_NS, EDGE_TAG); + xml.writeAttribute(ID_ATT, "e" + edgeNumber); + xml.writeAttribute(SOURCE_ATT, "n" + numericId(v)); + xml.writeAttribute(TARGET_ATT, "n" + numericId(edge.getKey())); + GraphMLProperty.EDGE_NUMBER.write(Integer.toString(edgeNumber++), xml); + GraphMLProperty.EDGE_TYPE.write(EDGE_TYPE_PATH, xml); + GraphMLProperty.EDGE_WITNESSES.write(edge.getValue().stream().map(Witness::getSigil).distinct().sorted().collect(Collectors.joining(", ")), xml); + xml.writeEndElement(); + } + } - int edgeNumber = 0; - for (VariantGraph.Edge edge : graph.edges()) { - 
xml.writeStartElement(GRAPHML_NS, EDGE_TAG); - xml.writeAttribute(ID_ATT, "e" + edgeNumber); - xml.writeAttribute(SOURCE_ATT, "n" + numericId(edge.from())); - xml.writeAttribute(TARGET_ATT, "n" + numericId(edge.to())); - GraphMLProperty.EDGE_NUMBER.write(Integer.toString(edgeNumber++), xml); - GraphMLProperty.EDGE_TYPE.write(EDGE_TYPE_PATH, xml); - GraphMLProperty.EDGE_WITNESSES.write(Witness.TO_SIGILS.apply(edge), xml); - xml.writeEndElement(); - } + for (Tuple transposedTuple : transposedTuples()) { + xml.writeStartElement(GRAPHML_NS, EDGE_TAG); + xml.writeAttribute(ID_ATT, "e" + edgeNumber); + xml.writeAttribute(SOURCE_ATT, "n" + numericId(transposedTuple.left)); + xml.writeAttribute(TARGET_ATT, "n" + numericId(transposedTuple.right)); + GraphMLProperty.EDGE_NUMBER.write(Integer.toString(edgeNumber++), xml); + GraphMLProperty.EDGE_TYPE.write(EDGE_TYPE_TRANSPOSITION, xml); + xml.writeEndElement(); + } - for (Tuple transposedTuple : transposedTuples()) { - xml.writeStartElement(GRAPHML_NS, EDGE_TAG); - xml.writeAttribute(ID_ATT, "e" + edgeNumber); - xml.writeAttribute(SOURCE_ATT, "n" + numericId(transposedTuple.left)); - xml.writeAttribute(TARGET_ATT, "n" + numericId(transposedTuple.right)); - GraphMLProperty.EDGE_NUMBER.write(Integer.toString(edgeNumber++), xml); - GraphMLProperty.EDGE_TYPE.write(EDGE_TYPE_TRANSPOSITION, xml); - xml.writeEndElement(); - } + xml.writeEndElement(); - xml.writeEndElement(); - - xml.writeEndElement(); - } - - private static final String NODE_TAG = "node"; - private static final String TARGET_ATT = "target"; - private static final String SOURCE_ATT = "source"; - private static final String EDGE_TAG = "edge"; - private static final String EDGE_TYPE_PATH = "path"; - private static final String EDGE_TYPE_TRANSPOSITION = "transposition"; - private static final String EDGEDEFAULT_DEFAULT_VALUE = "directed"; - private static final String EDGEDEFAULT_ATT = "edgedefault"; - private static final String GRAPH_ID = "g0"; - private static final 
String GRAPH_TAG = "graph"; - private static final String GRAPHML_NS = "http://graphml.graphdrawing.org/xmlns"; - private static final String GRAPHML_TAG = "graphml"; - private static final String XMLNSXSI_ATT = "xmlns:xsi"; - private static final String XSISL_ATT = "xsi:schemaLocation"; - private static final String GRAPHML_XMLNSXSI = "http://www.w3.org/2001/XMLSchema-instance"; - private static final String GRAPHML_XSISL = "http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd"; - private static final String PARSENODEIDS_ATT = "parse.nodeids"; - private static final String PARSENODEIDS_DEFAULT_VALUE = "canonical"; - private static final String PARSEEDGEIDS_ATT = "parse.edgeids"; - private static final String PARSEEDGEIDS_DEFAULT_VALUE = "canonical"; - private static final String PARSEORDER_ATT = "parse.order"; - private static final String PARSEORDER_DEFAULT_VALUE = "nodesfirst"; - - private static final String ATTR_TYPE_ATT = "attr.type"; - private static final String ATTR_NAME_ATT = "attr.name"; - private static final String FOR_ATT = "for"; - private static final String ID_ATT = "id"; - private static final String KEY_TAG = "key"; - private static final String DATA_TAG = "data"; - - private enum GraphMLProperty { - NODE_NUMBER(NODE_TAG, "number", "int"), // - NODE_TOKEN(NODE_TAG, "tokens", "string"), // - NODE_RANK(NODE_TAG, "rank", "int"), // - EDGE_NUMBER(EDGE_TAG, "number", "int"), // - EDGE_TYPE(EDGE_TAG, "type", "string"), // - EDGE_WITNESSES(EDGE_TAG, "witnesses", "string"); - - private String name; - private String forElement; - private String type; - - private GraphMLProperty(String forElement, String name, String type) { - this.name = name; - this.forElement = forElement; - this.type = type; + xml.writeEndElement(); } - public void write(String data, XMLStreamWriter xml) throws XMLStreamException { - xml.writeStartElement(GRAPHML_NS, DATA_TAG); - xml.writeAttribute(KEY_TAG, "d" + ordinal()); - 
xml.writeCharacters(data); - xml.writeEndElement(); - } + private static final String NODE_TAG = "node"; + private static final String TARGET_ATT = "target"; + private static final String SOURCE_ATT = "source"; + private static final String EDGE_TAG = "edge"; + private static final String EDGE_TYPE_PATH = "path"; + private static final String EDGE_TYPE_TRANSPOSITION = "transposition"; + private static final String EDGEDEFAULT_DEFAULT_VALUE = "directed"; + private static final String EDGEDEFAULT_ATT = "edgedefault"; + private static final String GRAPH_ID = "g0"; + private static final String GRAPH_TAG = "graph"; + private static final String GRAPHML_NS = "http://graphml.graphdrawing.org/xmlns"; + private static final String GRAPHML_TAG = "graphml"; + private static final String XMLNSXSI_ATT = "xmlns:xsi"; + private static final String XSISL_ATT = "xsi:schemaLocation"; + private static final String GRAPHML_XMLNSXSI = "http://www.w3.org/2001/XMLSchema-instance"; + private static final String GRAPHML_XSISL = "http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd"; + private static final String PARSENODEIDS_ATT = "parse.nodeids"; + private static final String PARSENODEIDS_DEFAULT_VALUE = "canonical"; + private static final String PARSEEDGEIDS_ATT = "parse.edgeids"; + private static final String PARSEEDGEIDS_DEFAULT_VALUE = "canonical"; + private static final String PARSEORDER_ATT = "parse.order"; + private static final String PARSEORDER_DEFAULT_VALUE = "nodesfirst"; + + private static final String ATTR_TYPE_ATT = "attr.type"; + private static final String ATTR_NAME_ATT = "attr.name"; + private static final String FOR_ATT = "for"; + private static final String ID_ATT = "id"; + private static final String KEY_TAG = "key"; + private static final String DATA_TAG = "data"; + + private enum GraphMLProperty { + NODE_NUMBER(NODE_TAG, "number", "int"), // + NODE_TOKEN(NODE_TAG, "tokens", "string"), // + NODE_RANK(NODE_TAG, "rank", "int"), // 
+ EDGE_NUMBER(EDGE_TAG, "number", "int"), // + EDGE_TYPE(EDGE_TAG, "type", "string"), // + EDGE_WITNESSES(EDGE_TAG, "witnesses", "string"); + + private String name; + private String forElement; + private String type; + + private GraphMLProperty(String forElement, String name, String type) { + this.name = name; + this.forElement = forElement; + this.type = type; + } - public void declare(XMLStreamWriter xml) throws XMLStreamException { - xml.writeEmptyElement(GRAPHML_NS, KEY_TAG); - xml.writeAttribute(ID_ATT, "d" + ordinal()); - xml.writeAttribute(FOR_ATT, forElement); - xml.writeAttribute(ATTR_NAME_ATT, name); - xml.writeAttribute(ATTR_TYPE_ATT, type); - } - } + public void write(String data, XMLStreamWriter xml) throws XMLStreamException { + xml.writeStartElement(GRAPHML_NS, DATA_TAG); + xml.writeAttribute(KEY_TAG, "d" + ordinal()); + xml.writeCharacters(data); + xml.writeEndElement(); + } - final Function vertexToString = new Function() { - @Override - public String apply(@Nullable VariantGraph.Vertex input) { - final Witness witness = Iterables.getFirst(input.witnesses(), null); - return (witness == null ? 
"" : tokensToString.apply(input.tokens(Collections.singleton(witness)))); - } - }; - - static final Function, String> SIMPLE_TOKEN_TO_STRING = new Function, String>() { - public String apply(@Nullable Iterable input) { - final List tokens = Ordering.natural().immutableSortedCopy( - Iterables.filter(input, SimpleToken.class) - ); - final StringBuilder sb = new StringBuilder(); - for (SimpleToken token : tokens) { - sb.append(token.getContent()); - } - return sb.toString(); + public void declare(XMLStreamWriter xml) throws XMLStreamException { + xml.writeEmptyElement(GRAPHML_NS, KEY_TAG); + xml.writeAttribute(ID_ATT, "d" + ordinal()); + xml.writeAttribute(FOR_ATT, forElement); + xml.writeAttribute(ATTR_NAME_ATT, name); + xml.writeAttribute(ATTR_TYPE_ATT, type); + } } - }; + + final Function vertexToString = new Function() { + @Override + public String apply(VariantGraph.Vertex input) { + return input.witnesses().stream().findFirst() + .map(witness -> tokensToString.apply(Arrays.asList(input.tokens().stream().filter(t -> witness.equals(t.getWitness())).toArray(Token[]::new)))) + .orElse(""); + } + }; + + static final Function, String> SIMPLE_TOKEN_TO_STRING = input -> StreamSupport.stream(input.spliterator(), false) + .filter(t -> SimpleToken.class.isAssignableFrom(t.getClass())) + .map(t -> (SimpleToken) t) + .sorted() + .map(SimpleToken::getContent) + .collect(Collectors.joining()); } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleWitness.java b/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleWitness.java index d6532f258..b2270b8a7 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleWitness.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleWitness.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -19,90 +19,86 @@ package eu.interedition.collatex.simple; -import com.google.common.base.Function; -import com.google.common.base.Preconditions; -import com.google.common.collect.Iterables; -import com.google.common.collect.Iterators; -import com.google.common.collect.Lists; import eu.interedition.collatex.Token; import eu.interedition.collatex.Witness; import java.util.ArrayList; +import java.util.Collections; import java.util.Comparator; import java.util.Iterator; import java.util.List; +import java.util.function.Function; import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.Stream; public class SimpleWitness implements Iterable, Witness, Comparator { - private final String sigil; - private final List tokens = new ArrayList(); - - public SimpleWitness(String sigil) { - this.sigil = sigil; - } - - public SimpleWitness(String sigil, String content) { - this(sigil, content, SimplePatternTokenizer.BY_WS_OR_PUNCT, SimpleTokenNormalizers.LC_TRIM_WS); - } - - public SimpleWitness(String sigil, - String content, - Function> tokenizer, - Function normalizer) { - this(sigil); - setTokenContents(tokenizer.apply(content), normalizer); - } - - public List getTokens() { - return tokens; - } - - public void setTokenContents(Iterable tokenContents, Function normalizer) { - final List tokens = Lists.newArrayListWithExpectedSize(Iterables.size(tokenContents)); - for (String content : tokenContents) { - tokens.add(new SimpleToken(this, content, normalizer.apply(content))); + private final String sigil; + private final List tokens = new ArrayList<>(); + + public SimpleWitness(String sigil) { + this.sigil = sigil; + } + + public SimpleWitness(String sigil, String content) { + this(sigil, content, SimplePatternTokenizer.BY_WS_OR_PUNCT, SimpleTokenNormalizers.LC_TRIM_WS); + } + + public SimpleWitness(String sigil, + String content, + Function> tokenizer, + Function normalizer) { + this(sigil); + setTokenContents(tokenizer.apply(content), 
normalizer); + } + + public List getTokens() { + return tokens; + } + + public void setTokenContents(Stream tokenContents, Function normalizer) { + setTokens(tokenContents.map(content -> new SimpleToken(SimpleWitness.this, content, normalizer.apply(content))).collect(Collectors.toList())); } - setTokens(tokens); - } - - public void setTokens(List tokens) { - this.tokens.clear(); - this.tokens.addAll(tokens); - } - - @Override - public String getSigil() { - return sigil; - } - - @Override - public Iterator iterator() { - return Iterators.unmodifiableIterator(tokens.iterator()); - } - - @Override - public String toString() { - return getSigil(); - } - - @Override - public int compare(SimpleToken o1, SimpleToken o2) { - final int o1Index = tokens.indexOf(o1); - final int o2Index = tokens.indexOf(o2); - Preconditions.checkArgument(o1Index >= 0, o1); - Preconditions.checkArgument(o2Index >= 0, o2); - return (o1Index - o2Index); - } - - public static final Pattern PUNCT = Pattern.compile("\\p{Punct}"); - - public static final Function TOKEN_NORMALIZER = new Function() { + + public void setTokens(List tokens) { + this.tokens.clear(); + this.tokens.addAll(tokens); + } + @Override - public String apply(String input) { - final String normalized = PUNCT.matcher(input.trim().toLowerCase()).replaceAll(""); - return (normalized == null || normalized.length() == 0 ? 
input : normalized); + public String getSigil() { + return sigil; } - }; + + @Override + public Iterator iterator() { + return Collections.unmodifiableList(tokens).iterator(); + } + + @Override + public String toString() { + return getSigil(); + } + + @Override + public int compare(SimpleToken o1, SimpleToken o2) { + final int o1Index = tokens.indexOf(o1); + final int o2Index = tokens.indexOf(o2); + if (o1Index < 0) { + throw new IllegalArgumentException(o1.toString()); + } + if (o2Index < 0) { + throw new IllegalArgumentException(); + } + return (o1Index - o2Index); + } + + public static final Pattern PUNCT = Pattern.compile("\\p{Punct}"); + + public static final Function TOKEN_NORMALIZER = input -> { + final String normalized = PUNCT.matcher(input.trim().toLowerCase()).replaceAll(""); + return (normalized == null || normalized.length() == 0 ? input : normalized); + }; } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleWitnessTeiBuilder.java b/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleWitnessTeiBuilder.java index a79d4c880..ed97e53cb 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleWitnessTeiBuilder.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/simple/SimpleWitnessTeiBuilder.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -19,94 +19,92 @@ package eu.interedition.collatex.simple; -import java.io.InputStream; -import java.util.List; - import javax.xml.namespace.QName; import javax.xml.stream.XMLEventReader; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamException; import javax.xml.stream.events.XMLEvent; - -import com.google.common.collect.Lists; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; // we are going to use stax // previous version was build on DOM, which is not particularly well suited to parsing // large TEI files, with segments and expansions public class SimpleWitnessTeiBuilder { - private static QName w = new QName("http://www.tei-c.org/ns/1.0", "w"); - private static QName seg = new QName("http://www.tei-c.org/ns/1.0", "seg"); - private static QName p = new QName("http://www.tei-c.org/ns/1.0", "p"); + private static QName w = new QName("http://www.tei-c.org/ns/1.0", "w"); + private static QName seg = new QName("http://www.tei-c.org/ns/1.0", "seg"); + private static QName p = new QName("http://www.tei-c.org/ns/1.0", "p"); - public SimpleWitness read(InputStream input) throws XMLStreamException { - SimpleWitness witness = new SimpleWitness("id"); - List tokenContents = Lists.newArrayList(); - XMLInputFactory factory = XMLInputFactory.newInstance(); - XMLEventReader reader = factory.createXMLEventReader(input); - XMLEvent event = null; - int numberOfParagraphs = 0; - while ((event = reader.peek()) != null && numberOfParagraphs < 10) { - // System.out.println(event.toString()); - if (event.isStartElement() && event.asStartElement().getName().equals(w)) { - tokenContents.add(handleWElement(reader)); - } else if (event.isStartElement() && event.asStartElement().getName().equals(seg)) { - tokenContents.add(handleSegElement(reader)); - } else if (event.isStartElement() && event.asStartElement().getName().equals(p)) { - reader.next(); - numberOfParagraphs++; - } else { - reader.next(); - } + public SimpleWitness 
read(InputStream input) throws XMLStreamException { + SimpleWitness witness = new SimpleWitness("id"); + List tokenContents = new ArrayList<>(); + XMLInputFactory factory = XMLInputFactory.newInstance(); + XMLEventReader reader = factory.createXMLEventReader(input); + XMLEvent event = null; + int numberOfParagraphs = 0; + while ((event = reader.peek()) != null && numberOfParagraphs < 10) { + // System.out.println(event.toString()); + if (event.isStartElement() && event.asStartElement().getName().equals(w)) { + tokenContents.add(handleWElement(reader)); + } else if (event.isStartElement() && event.asStartElement().getName().equals(seg)) { + tokenContents.add(handleSegElement(reader)); + } else if (event.isStartElement() && event.asStartElement().getName().equals(p)) { + reader.next(); + numberOfParagraphs++; + } else { + reader.next(); + } + } + witness.setTokenContents(tokenContents.stream(), SimpleTokenNormalizers.LC_TRIM_WS_PUNCT); + return witness; } - witness.setTokenContents(tokenContents, SimpleTokenNormalizers.LC_TRIM_WS_PUNCT); - return witness; - } - private static String handleWElement(XMLEventReader reader) throws XMLStreamException { - XMLEvent event = reader.nextEvent(); - // Do what you need to do with the start element, e.g. initialize - // data structures - // System.out.println("W tag is triggered here!"); - StringBuffer textBuffer = new StringBuffer(); - while ((event = reader.peek()) != null) { - if (event.isEndElement() && event.asEndElement().getName().equals(w)) { - // Do what you need to do at the end, e.g. add data - // collected from sub elements, etc. - event = reader.nextEvent(); - break; - } else { - // Do what you need to do for start or child elements, e.g. 
- // dispatch to another handler function - event = reader.nextEvent(); - textBuffer.append(event.toString()); - // System.out.println("Text :"+event.toString()); - } + private static String handleWElement(XMLEventReader reader) throws XMLStreamException { + XMLEvent event = reader.nextEvent(); + // Do what you need to do with the start element, e.g. initialize + // data structures + // System.out.println("W tag is triggered here!"); + StringBuffer textBuffer = new StringBuffer(); + while ((event = reader.peek()) != null) { + if (event.isEndElement() && event.asEndElement().getName().equals(w)) { + // Do what you need to do at the end, e.g. add data + // collected from sub elements, etc. + event = reader.nextEvent(); + break; + } else { + // Do what you need to do for start or child elements, e.g. + // dispatch to another handler function + event = reader.nextEvent(); + textBuffer.append(event.toString()); + // System.out.println("Text :"+event.toString()); + } + } + return textBuffer.toString(); } - return textBuffer.toString(); - } - private static String handleSegElement(XMLEventReader reader) throws XMLStreamException { - XMLEvent event = reader.nextEvent(); - // Do what you need to do with the start element, e.g. initialize - // data structures - // System.out.println("Seg tag is triggered here!"); - StringBuffer textBuffer = new StringBuffer(); - while ((event = reader.peek()) != null) { - if (event.isEndElement() && event.asEndElement().getName().equals(seg)) { - // Do what you need to do at the end, e.g. add data - // collected from sub elements, etc. - event = reader.nextEvent(); - break; - } else { - // Do what you need to do for start or child elements, e.g. 
- // dispatch to another handler function - event = reader.nextEvent(); - if (event.getEventType() == XMLEvent.CHARACTERS) { - textBuffer.append(event.toString().trim()); + private static String handleSegElement(XMLEventReader reader) throws XMLStreamException { + XMLEvent event = reader.nextEvent(); + // Do what you need to do with the start element, e.g. initialize + // data structures + // System.out.println("Seg tag is triggered here!"); + StringBuffer textBuffer = new StringBuffer(); + while ((event = reader.peek()) != null) { + if (event.isEndElement() && event.asEndElement().getName().equals(seg)) { + // Do what you need to do at the end, e.g. add data + // collected from sub elements, etc. + event = reader.nextEvent(); + break; + } else { + // Do what you need to do for start or child elements, e.g. + // dispatch to another handler function + event = reader.nextEvent(); + if (event.getEventType() == XMLEvent.CHARACTERS) { + textBuffer.append(event.toString().trim()); + } + } } - } + return textBuffer.toString(); } - return textBuffer.toString(); - } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/simple/package-info.java b/collatex-core/src/main/java/eu/interedition/collatex/simple/package-info.java index 10a15f836..6ef94f387 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/simple/package-info.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/simple/package-info.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -19,12 +19,12 @@ /** * Default implementations for core interfaces like witnesses and tokens. - *

- * Classes in this package make fundamental assumptions about the nature of text version to be collated, e.g. that they + * + *

Classes in this package make fundamental assumptions about the nature of text version to be collated, e.g. that they * can be tokenized by whitespace, that tokens might be case insensitive, that punctuation might not matter or that - * XML input adheres to a particular schema. - *

- * Users are advised to implement {@link eu.interedition.collatex.Token} and {@link eu.interedition.collatex.Witness} - * themselves and adjust their implementations to the use case at hand where those assumptions do not hold. + * XML input adheres to a particular schema.

+ * + *

Users are advised to implement {@link eu.interedition.collatex.Token} and {@link eu.interedition.collatex.Witness} + * themselves and adjust their implementations to the use case at hand where those assumptions do not hold.

*/ package eu.interedition.collatex.simple; \ No newline at end of file diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/Algorithm.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/Algorithm.java index b4ded56a9..59ec02eef 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/Algorithm.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/Algorithm.java @@ -7,49 +7,61 @@ * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) */ -public enum Algorithm -{ - /** Karkkainen-Sanders. */ +public enum Algorithm { + /** + * Karkkainen-Sanders. + */ SKEW("Kärkkäinen-Sanders"), - /** Karkkainen-Sanders, with decorators allowing arbitrary input. */ + /** + * Karkkainen-Sanders, with decorators allowing arbitrary input. + */ SKEW_D("Kärkkäinen-Sanders (decorated for arbitrary input symbols)"), - /** Yuta Mori's divsufsort algorithm. */ + /** + * Yuta Mori's divsufsort algorithm. + */ DIVSUFSORT("Mori's algorithm"), - /** Yuta Mori's implementation of SA-IS. */ + /** + * Yuta Mori's implementation of SA-IS. + */ SAIS("SA-IS algorithm"), - /** Klaus-Bernd Schürmann's bucket pointer refinement algorithm */ + /** + * Klaus-Bernd Schürmann's bucket pointer refinement algorithm + */ BPR("Klaus-Bernd Schürmann's bpr algorithm"), - /** Deep-Shallow algorithm by Manzini and Ferragina. */ + /** + * Deep-Shallow algorithm by Manzini and Ferragina. + */ DEEP_SHALLOW("Manzini-Ferragina"), - /** "Larrson-Sadakane qsufsort algorithm */ + /** + * "Larrson-Sadakane qsufsort algorithm + */ QSUFSORT("Larrson-Sadakane qsufsort algorithm"); - /** Full name of the algorithm. */ + /** + * Full name of the algorithm. 
+ */ private final String name; /* - * + * */ - private Algorithm(String name) - { + private Algorithm(String name) { this.name = name; } /** * @return Same as {@link #getInstance()}, but returns the algorithm instance - * decorated to work with any range or distribution of input symbols - * (respecting each algorithm's constraints). + * decorated to work with any range or distribution of input symbols + * (respecting each algorithm's constraints). */ - public ISuffixArrayBuilder getDecoratedInstance() - { - switch (this) - { + public ISuffixArrayBuilder getDecoratedInstance() { + switch (this) { case SKEW: return new DensePositiveDecorator(new ExtraTrailingCellsDecorator( getInstance(), SuffixArrays.MAX_EXTRA_TRAILING_SPACE)); @@ -62,10 +74,8 @@ public ISuffixArrayBuilder getDecoratedInstance() /** * @return Create and return an algorithm instance. */ - public ISuffixArrayBuilder getInstance() - { - switch (this) - { + public ISuffixArrayBuilder getInstance() { + switch (this) { case SKEW: return new Skew(); @@ -94,13 +104,11 @@ public ISuffixArrayBuilder getInstance() * instance will overwrite input. *

* If not, create default instance - * + * * @return Create and return low memory consuming instance. */ - public ISuffixArrayBuilder getMemoryConservingInstance() - { - switch (this) - { + public ISuffixArrayBuilder getMemoryConservingInstance() { + switch (this) { case QSUFSORT: return new QSufSort(false); case BPR: @@ -115,8 +123,7 @@ public ISuffixArrayBuilder getMemoryConservingInstance() /** * Return the full name of the algorithm. */ - public String getName() - { + public String getName() { return name; } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/BPR.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/BPR.java index 731f0eb01..1b78aba81 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/BPR.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/BPR.java @@ -6,7 +6,7 @@ *

* A straightforward reimplementation of the bucket pointer refinement algorithm given in: * - * Klaus-Bernd Schürmann, Suffix Arrays in Theory and Practice, Faculty of Technology of + * Klaus-Bernd Schürmann, Suffix Arrays in Theory and Practice, Faculty of Technology of * Bielefeld University, Germany, 2007 * *

@@ -19,40 +19,33 @@ * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) */ -public class BPR implements ISuffixArrayBuilder -{ - private final static class Alphabet - { +public class BPR implements ISuffixArrayBuilder { + private final static class Alphabet { int size; - int [] charArray; - int [] alphaMapping; - int [] charFreq; + int[] charArray; + int[] alphaMapping; + int[] charFreq; - Alphabet(int [] thisString, int stringLength) - { + Alphabet(int[] thisString, int stringLength) { int tmpChar; size = 0; - alphaMapping = new int [KBS_MAX_ALPHABET_SIZE]; - charFreq = new int [KBS_MAX_ALPHABET_SIZE]; - for (int i = 0; i < stringLength; i++) - { + alphaMapping = new int[KBS_MAX_ALPHABET_SIZE]; + charFreq = new int[KBS_MAX_ALPHABET_SIZE]; + for (int i = 0; i < stringLength; i++) { tmpChar = thisString[i]; Tools.assertAlways(tmpChar >= 0, "Input must be positive"); - if (charFreq[tmpChar] == 0) - { + if (charFreq[tmpChar] == 0) { size++; } charFreq[tmpChar]++; } - charArray = new int [size + 1]; + charArray = new int[size + 1]; charArray[size] = 0; int k = 0; - for (int i = 0; i < KBS_MAX_ALPHABET_SIZE; i++) - { + for (int i = 0; i < KBS_MAX_ALPHABET_SIZE; i++) { alphaMapping[i] = -1; - if (charFreq[i] > 0) - { + if (charFreq[i] > 0) { charArray[k] = i; alphaMapping[i] = k; k++; @@ -73,21 +66,19 @@ private final static class Alphabet */ private final boolean preserveInput; - private int [] seq; + private int[] seq; private int length; private Alphabet alphabet; - private int [] suffixArray; - private int [] sufPtrMap; + private int[] suffixArray; + private int[] sufPtrMap; private int start; - public BPR() - { + public BPR() { this(true); } - public BPR(boolean preserveInput) - { + public BPR(boolean preserveInput) { this.preserveInput = preserveInput; } @@ -101,28 +92,24 @@ public BPR(boolean preserveInput) *

  • non-negative (≥0) symbols in the input
  • *
  • symbols limited by {@link #KBS_MAX_ALPHABET_SIZE} (< * KBS_MAX_ALPHABET_SIZE)
  • - *
  • length >= 2
  • + *
  • length ≥ 2
  • * *

    */ @Override - public int [] buildSuffixArray(int [] input, int start, int length) - { + public int[] buildSuffixArray(int[] input, int start, int length) { Tools.assertAlways(input != null, "input must not be null"); Tools.assertAlways(input.length >= start + length + KBS_STRING_EXTENSION_SIZE, "input is too short"); Tools.assertAlways(length >= 2, "input length must be >= 2"); this.start = start; - if (preserveInput) - { + if (preserveInput) { - seq = new int [length + KBS_STRING_EXTENSION_SIZE]; + seq = new int[length + KBS_STRING_EXTENSION_SIZE]; this.start = 0; System.arraycopy(input, start, seq, 0, length); - } - else - { + } else { seq = input; } @@ -131,24 +118,15 @@ public BPR(boolean preserveInput) int alphaSize = alphabet.size; int q; - if (alphaSize <= 9) - { + if (alphaSize <= 9) { q = 7; - } - else if (9 < alphaSize && alphaSize <= 13) - { + } else if (9 < alphaSize && alphaSize <= 13) { q = 6; - } - else if (13 < alphaSize && alphaSize <= 21) - { + } else if (13 < alphaSize && alphaSize <= 21) { q = 5; - } - else if (21 < alphaSize && alphaSize <= 46) - { + } else if (21 < alphaSize && alphaSize <= 46) { q = 4; - } - else - { + } else { q = 3; } @@ -157,11 +135,10 @@ else if (21 < alphaSize && alphaSize <= 46) } /** - * + * */ - private void kbs_buildDstepUsePrePlusCopyFreqOrder_SuffixArray(int q) - { - int [] buckets = determine_Buckets_Sarray_Sptrmap(q); + private void kbs_buildDstepUsePrePlusCopyFreqOrder_SuffixArray(int q) { + int[] buckets = determine_Buckets_Sarray_Sptrmap(q); /* Sorting of all buckets */ int mappedCharPtr = 0; @@ -169,38 +146,33 @@ private void kbs_buildDstepUsePrePlusCopyFreqOrder_SuffixArray(int q) int bucketsInLevel3Bucket = kbs_power_Ulong(alphabetSize, q - 3); int bucketsInLevel2Bucket = bucketsInLevel3Bucket * alphabetSize; int bucketsInLevel1Bucket = bucketsInLevel2Bucket * alphabetSize; - int [] alphaOrder = getCharWeightedOrder_Alphabet(buckets, bucketsInLevel2Bucket); - int [] isNotSortedLevel1Char = new int 
[alphabetSize]; + int[] alphaOrder = getCharWeightedOrder_Alphabet(buckets, bucketsInLevel2Bucket); + int[] isNotSortedLevel1Char = new int[alphabetSize]; Arrays.fill(isNotSortedLevel1Char, 1); /* Sort all level-1 buckets */ - int [] leftPtrList = new int [alphabetSize]; - int [] rightPtrList = new int [alphabetSize]; - int [] leftPtrList2 = new int [alphabetSize * alphabetSize]; - int [] rightPtrList2 = new int [alphabetSize * alphabetSize]; + int[] leftPtrList = new int[alphabetSize]; + int[] rightPtrList = new int[alphabetSize]; + int[] leftPtrList2 = new int[alphabetSize * alphabetSize]; + int[] rightPtrList2 = new int[alphabetSize * alphabetSize]; int i; int j; int c1 = 0; - for (i = 0; i < alphabetSize; i++) - { + for (i = 0; i < alphabetSize; i++) { c1 = alphaOrder[i]; /* sort buckets cd to cz */ - for (j = i + 1; j < alphabetSize; j++) - { + for (j = i + 1; j < alphabetSize; j++) { int c2 = alphaOrder[j]; int l; - for (l = i; l < alphabetSize; l++) - { + for (l = i; l < alphabetSize; l++) { int c3 = alphaOrder[l]; int tmpUlong = c1 * bucketsInLevel1Bucket + c2 * bucketsInLevel2Bucket + c3 * bucketsInLevel3Bucket; int k; - for (k = tmpUlong; k < tmpUlong + bucketsInLevel3Bucket; k++) - { + for (k = tmpUlong; k < tmpUlong + bucketsInLevel3Bucket; k++) { int leftPtr = buckets[k]; int rightPtr = buckets[k + 1] - 1; - if (rightPtr - leftPtr > 0) - { + if (rightPtr - leftPtr > 0) { if (rightPtr - leftPtr < INSSORT_LIMIT) insSortUpdateRecurse_SaBucket( leftPtr, rightPtr, q, q); else partitionUpdateRecurse_SaBucket(leftPtr, rightPtr, q, q); @@ -210,30 +182,25 @@ private void kbs_buildDstepUsePrePlusCopyFreqOrder_SuffixArray(int q) } /* copy left buckets of cx */ - for (j = i; j < alphabetSize; j++) - { + for (j = i; j < alphabetSize; j++) { int cp1 = alphaOrder[j]; leftPtrList[cp1] = buckets[cp1 * bucketsInLevel1Bucket + c1 * bucketsInLevel2Bucket]; int k; - for (k = i + 1; k < alphabetSize; k++) - { + for (k = i + 1; k < alphabetSize; k++) { int cp2 = 
alphaOrder[k]; leftPtrList2[cp2 * alphabetSize + cp1] = buckets[cp2 * bucketsInLevel1Bucket + cp1 * bucketsInLevel2Bucket + c1 * bucketsInLevel3Bucket]; } } - if (c1 == 0) - { + if (c1 == 0) { int cp1 = seq[start + mappedCharPtr + length - 1]; int cp2 = seq[start + mappedCharPtr + length - 2]; - if (isNotSortedLevel1Char[cp1] != 0) - { + if (isNotSortedLevel1Char[cp1] != 0) { leftPtrList[cp1]++; leftPtrList2[cp1 * alphabetSize]++; - if (isNotSortedLevel1Char[cp2] != 0 && cp2 != c1) - { + if (isNotSortedLevel1Char[cp2] != 0 && cp2 != c1) { suffixArray[leftPtrList2[cp2 * alphabetSize + cp1]] = length - 2; sufPtrMap[length - 2] = leftPtrList2[cp2 * alphabetSize + cp1]; leftPtrList2[cp2 * alphabetSize + cp1]++; @@ -242,16 +209,13 @@ private void kbs_buildDstepUsePrePlusCopyFreqOrder_SuffixArray(int q) } int leftPtr = buckets[c1 * bucketsInLevel1Bucket]; - while (leftPtr < leftPtrList[c1]) - { + while (leftPtr < leftPtrList[c1]) { int cp1; int tmpUlong = suffixArray[leftPtr]; if (tmpUlong != 0 && isNotSortedLevel1Char[cp1 = seq[start + mappedCharPtr + tmpUlong - - 1]] != 0) - { - if (isNotSortedLevel1Char[seq[start + mappedCharPtr + tmpUlong + 1]] != 0) - { + - 1]] != 0) { + if (isNotSortedLevel1Char[seq[start + mappedCharPtr + tmpUlong + 1]] != 0) { int tmpUlongPtr = leftPtrList[cp1]; sufPtrMap[tmpUlong - 1] = tmpUlongPtr; suffixArray[tmpUlongPtr] = tmpUlong - 1; @@ -260,8 +224,7 @@ private void kbs_buildDstepUsePrePlusCopyFreqOrder_SuffixArray(int q) int cp2; if (tmpUlong > 1 && isNotSortedLevel1Char[cp2 = seq[start + mappedCharPtr - + tmpUlong - 2]] != 0 && cp2 != c1) - { + + tmpUlong - 2]] != 0 && cp2 != c1) { int tmpUlongPtr = leftPtrList2[cp2 * alphabetSize + cp1]++; sufPtrMap[tmpUlong - 2] = tmpUlongPtr; suffixArray[tmpUlongPtr] = tmpUlong - 2; @@ -271,14 +234,12 @@ private void kbs_buildDstepUsePrePlusCopyFreqOrder_SuffixArray(int q) } /* copy right buckets of cx */ - for (j = i; j < alphabetSize; j++) - { + for (j = i; j < alphabetSize; j++) { int cp1 = 
alphaOrder[j]; int k; rightPtrList[cp1] = buckets[cp1 * bucketsInLevel1Bucket + (c1 + 1) * bucketsInLevel2Bucket]; - for (k = i + 1; k < alphabetSize; k++) - { + for (k = i + 1; k < alphabetSize; k++) { int cp2 = alphaOrder[k]; rightPtrList2[cp2 * alphabetSize + cp1] = buckets[cp2 * bucketsInLevel1Bucket + cp1 * bucketsInLevel2Bucket + (c1 + 1) @@ -286,18 +247,15 @@ private void kbs_buildDstepUsePrePlusCopyFreqOrder_SuffixArray(int q) } } int rightPtr = buckets[(c1 + 1) * bucketsInLevel1Bucket]; - while (leftPtr < rightPtr) - { + while (leftPtr < rightPtr) { int cp1; rightPtr--; int tmpUlong = suffixArray[rightPtr]; if (tmpUlong != 0 && isNotSortedLevel1Char[cp1 = seq[start + mappedCharPtr + tmpUlong - - 1]] != 0) - { + - 1]] != 0) { rightPtrList[cp1]--; - if (isNotSortedLevel1Char[seq[start + mappedCharPtr + tmpUlong + 1]] != 0) - { + if (isNotSortedLevel1Char[seq[start + mappedCharPtr + tmpUlong + 1]] != 0) { int tmpUlongPtr = rightPtrList[cp1]; sufPtrMap[tmpUlong - 1] = tmpUlongPtr; suffixArray[tmpUlongPtr] = tmpUlong - 1; @@ -305,8 +263,7 @@ private void kbs_buildDstepUsePrePlusCopyFreqOrder_SuffixArray(int q) int cp2; if (tmpUlong > 1 && isNotSortedLevel1Char[cp2 = seq[start + mappedCharPtr - + tmpUlong - 2]] != 0 && cp2 != c1) - { + + tmpUlong - 2]] != 0 && cp2 != c1) { int tmpUlongPtr = --rightPtrList2[cp2 * alphabetSize + cp1]; sufPtrMap[tmpUlong - 2] = tmpUlongPtr; suffixArray[tmpUlongPtr] = tmpUlong - 2; @@ -321,25 +278,22 @@ private void kbs_buildDstepUsePrePlusCopyFreqOrder_SuffixArray(int q) /** * Stably sorts a bucket at a refinement level regarding sort keys that are bucket * pointers in sufPtrMap with offset. - * - * @param leftPtr points to the leftmost suffix of the current bucket. + * + * @param leftPtr points to the leftmost suffix of the current bucket. * @param rightPtr points to the rightmost suffix of the current bucket. - * @param offset is the length of the common prefix of the suffixes (a multiple of q). 
- * @param q is the initial prefix length used for the bucket sort. It also determines - * the increase of offset. + * @param offset is the length of the common prefix of the suffixes (a multiple of q). + * @param q is the initial prefix length used for the bucket sort. It also determines + * the increase of offset. */ private void insSortUpdateRecurse_SaBucket(int leftPtr, int rightPtr, int offset, - int q) - { + int q) { int rightTmpPtr = leftPtr + 1; - while (rightTmpPtr <= rightPtr) - { + while (rightTmpPtr <= rightPtr) { int tempValue = suffixArray[rightTmpPtr]; int tempHashValue = sufPtrMap[suffixArray[rightTmpPtr] + offset]; int leftTmpPtr = rightTmpPtr; while (leftTmpPtr > leftPtr - && sufPtrMap[suffixArray[leftTmpPtr - 1] + offset] > tempHashValue) - { + && sufPtrMap[suffixArray[leftTmpPtr - 1] + offset] > tempHashValue) { suffixArray[leftTmpPtr] = suffixArray[leftTmpPtr - 1]; leftTmpPtr--; } @@ -352,16 +306,15 @@ private void insSortUpdateRecurse_SaBucket(int leftPtr, int rightPtr, int offset /** * The function determines the subbuckets after refining this bucket and recursively * calls the refinement function for the subbuckets. - * - * @param leftPtr points to the leftmost suffix of the current bucket. + * + * @param leftPtr points to the leftmost suffix of the current bucket. * @param rightPtr points to the rightmost suffix of the current bucket. - * @param offset is the length of the common prefix of the suffixes (a multiple of q). - * @param q is the initial prefix length used for the bucket sort. It also determines - * the increase of offset. + * @param offset is the length of the common prefix of the suffixes (a multiple of q). + * @param q is the initial prefix length used for the bucket sort. It also determines + * the increase of offset. */ private void updatePtrAndRefineBuckets_SaBucket(int leftPtr, int rightPtr, - int offset, int q) - { + int offset, int q) { /* * for all buckets with resp. 
pointer > rightPtr determine buckets via setting * sufPtrMap @@ -370,10 +323,8 @@ private void updatePtrAndRefineBuckets_SaBucket(int leftPtr, int rightPtr, int rightIntervalPtr = rightPtr; int tmpPtr; while (leftPtr <= leftIntervalPtr - && rightPtr < (tmpPtr = sufPtrMap[suffixArray[leftIntervalPtr] + offset])) - { - do - { + && rightPtr < (tmpPtr = sufPtrMap[suffixArray[leftIntervalPtr] + offset])) { + do { sufPtrMap[suffixArray[leftIntervalPtr]] = rightIntervalPtr; leftIntervalPtr--; } @@ -394,8 +345,7 @@ private void updatePtrAndRefineBuckets_SaBucket(int leftPtr, int rightPtr, rightIntervalPtr = leftIntervalPtr; while (leftPtr <= leftIntervalPtr && leftPtr <= sufPtrMap[suffixArray[leftIntervalPtr] + offset] - && sufPtrMap[suffixArray[leftIntervalPtr] + offset] <= rightPtr) - { + && sufPtrMap[suffixArray[leftIntervalPtr] + offset] <= rightPtr) { sufPtrMap[suffixArray[leftIntervalPtr]] = rightIntervalPtr; leftIntervalPtr--; } @@ -412,11 +362,9 @@ private void updatePtrAndRefineBuckets_SaBucket(int leftPtr, int rightPtr, int middleRightPtr = rightIntervalPtr; int middleLeftPtr = leftIntervalPtr; rightIntervalPtr = leftIntervalPtr; - while (leftPtr <= leftIntervalPtr) - { + while (leftPtr <= leftIntervalPtr) { int tmpPtr2 = sufPtrMap[suffixArray[leftIntervalPtr] + offset]; - do - { + do { sufPtrMap[suffixArray[leftIntervalPtr]] = rightIntervalPtr; leftIntervalPtr--; } @@ -426,25 +374,20 @@ private void updatePtrAndRefineBuckets_SaBucket(int leftPtr, int rightPtr, } int newOffset = offset + q; - if (sufPtrMap[suffixArray[leftPtr]] == rightPtr) - { + if (sufPtrMap[suffixArray[leftPtr]] == rightPtr) { newOffset = computeDiffDepthBucket_SaBucket(leftPtr, rightPtr, newOffset, q); } int leftTmpPtr = leftPtr; - while (leftTmpPtr < middleLeftPtr) - { + while (leftTmpPtr < middleLeftPtr) { int rightTmpPtr = sufPtrMap[suffixArray[leftTmpPtr]]; int tmpLong = rightTmpPtr - leftTmpPtr; - if (tmpLong > 0) - { - if (tmpLong == 1) - { + if (tmpLong > 0) { + if (tmpLong == 1) { 
computeBucketSize2_SaBucket(leftTmpPtr, rightTmpPtr, newOffset, q); leftTmpPtr = rightTmpPtr + 1; continue; } - if (tmpLong == 2) - { + if (tmpLong == 2) { computeBucketSize3_SaBucket(leftTmpPtr, rightTmpPtr, newOffset, q); leftTmpPtr = rightTmpPtr + 1; continue; @@ -454,42 +397,31 @@ private void updatePtrAndRefineBuckets_SaBucket(int leftPtr, int rightPtr, leftTmpPtr = rightTmpPtr + 1; } /* for buckets refering to this bucket, the offset can be doubled */ - if (middleRightPtr > middleLeftPtr + 1) - { - if (middleRightPtr - middleLeftPtr == 2) - { + if (middleRightPtr > middleLeftPtr + 1) { + if (middleRightPtr - middleLeftPtr == 2) { computeBucketSize2_SaBucket(middleLeftPtr + 1, middleRightPtr, Math.max( 2 * offset, newOffset), q); - } - else - { - if (middleRightPtr - middleLeftPtr == 3) - { + } else { + if (middleRightPtr - middleLeftPtr == 3) { computeBucketSize3_SaBucket(middleLeftPtr + 1, middleRightPtr, Math .max(2 * offset, newOffset), q); - } - else - { + } else { insSortUpdateRecurse_SaBucket(middleLeftPtr + 1, middleRightPtr, Math .max(2 * offset, newOffset), q); } } } leftTmpPtr = middleRightPtr + 1; - while (leftTmpPtr < rightPtr) - { + while (leftTmpPtr < rightPtr) { int rightTmpPtr = sufPtrMap[suffixArray[leftTmpPtr]]; int tmpLong = rightTmpPtr - leftTmpPtr; - if (tmpLong > 0) - { - if (tmpLong == 1) - { + if (tmpLong > 0) { + if (tmpLong == 1) { computeBucketSize2_SaBucket(leftTmpPtr, rightTmpPtr, newOffset, q); leftTmpPtr = rightTmpPtr + 1; continue; } - if (tmpLong == 2) - { + if (tmpLong == 2) { computeBucketSize3_SaBucket(leftTmpPtr, rightTmpPtr, newOffset, q); leftTmpPtr = rightTmpPtr + 1; continue; @@ -503,57 +435,49 @@ private void updatePtrAndRefineBuckets_SaBucket(int leftPtr, int rightPtr, /** * Completely sorts buckets of size 3. - * - * @param leftPtr points to the leftmost suffix of the current bucket. + * + * @param leftPtr points to the leftmost suffix of the current bucket. 
* @param rightPtr points to the rightmost suffix of the current bucket. - * @param q is the initial prefix length used for the bucket sort. It also determines - * the increase of offset. - * @param offset is the length of the common prefix of the suffixes rounded down to a - * multiple of q. + * @param q is the initial prefix length used for the bucket sort. It also determines + * the increase of offset. + * @param offset is the length of the common prefix of the suffixes rounded down to a + * multiple of q. */ - private void computeBucketSize3_SaBucket(int leftPtr, int rightPtr, int offset, int q) - { + private void computeBucketSize3_SaBucket(int leftPtr, int rightPtr, int offset, int q) { int newOffset = offset; while (sufPtrMap[suffixArray[leftPtr] + newOffset] == sufPtrMap[suffixArray[leftPtr + 1] + newOffset] && sufPtrMap[suffixArray[leftPtr + 1] + newOffset] == sufPtrMap[suffixArray[rightPtr] - + newOffset]) - { + + newOffset]) { newOffset += q; } if (sufPtrMap[suffixArray[leftPtr] + newOffset] > sufPtrMap[suffixArray[rightPtr] - + newOffset]) - { + + newOffset]) { int swapTmp = suffixArray[leftPtr]; suffixArray[leftPtr] = suffixArray[rightPtr]; suffixArray[rightPtr] = swapTmp; } if (sufPtrMap[suffixArray[leftPtr] + newOffset] > sufPtrMap[suffixArray[leftPtr + 1] - + newOffset]) - { + + newOffset]) { int swapTmp = suffixArray[leftPtr]; suffixArray[leftPtr] = suffixArray[leftPtr + 1]; suffixArray[leftPtr + 1] = swapTmp; } if (sufPtrMap[suffixArray[leftPtr + 1] + newOffset] > sufPtrMap[suffixArray[rightPtr] - + newOffset]) - { + + newOffset]) { int swapTmp = suffixArray[rightPtr]; suffixArray[rightPtr] = suffixArray[leftPtr + 1]; suffixArray[leftPtr + 1] = swapTmp; } if (sufPtrMap[suffixArray[leftPtr] + newOffset] == sufPtrMap[suffixArray[leftPtr + 1] - + newOffset]) - { + + newOffset]) { int suffix1 = suffixArray[leftPtr] + newOffset + q; int suffix2 = suffixArray[leftPtr + 1] + newOffset + q; - while (sufPtrMap[suffix1] == sufPtrMap[suffix2]) - { + while 
(sufPtrMap[suffix1] == sufPtrMap[suffix2]) { suffix1 += q; suffix2 += q; } - if (sufPtrMap[suffix1] > sufPtrMap[suffix2]) - { + if (sufPtrMap[suffix1] > sufPtrMap[suffix2]) { int tmpSwap = suffixArray[leftPtr]; suffixArray[leftPtr] = suffixArray[leftPtr + 1]; suffixArray[leftPtr + 1] = tmpSwap; @@ -564,18 +488,15 @@ private void computeBucketSize3_SaBucket(int leftPtr, int rightPtr, int offset, return; } if (sufPtrMap[suffixArray[leftPtr + 1] + newOffset] == sufPtrMap[suffixArray[rightPtr] - + newOffset]) - { + + newOffset]) { sufPtrMap[suffixArray[leftPtr]] = leftPtr; int suffix1 = suffixArray[leftPtr + 1] + newOffset + q; int suffix2 = suffixArray[rightPtr] + newOffset + q; - while (sufPtrMap[suffix1] == sufPtrMap[suffix2]) - { + while (sufPtrMap[suffix1] == sufPtrMap[suffix2]) { suffix1 += q; suffix2 += q; } - if (sufPtrMap[suffix1] > sufPtrMap[suffix2]) - { + if (sufPtrMap[suffix1] > sufPtrMap[suffix2]) { int tmpSwap = suffixArray[rightPtr]; suffixArray[rightPtr] = suffixArray[leftPtr + 1]; suffixArray[leftPtr + 1] = tmpSwap; @@ -591,25 +512,22 @@ private void computeBucketSize3_SaBucket(int leftPtr, int rightPtr, int offset, /** * Completely sorts buckets of size 2. - * - * @param leftPtr points to the leftmost suffix of the current bucket. + * + * @param leftPtr points to the leftmost suffix of the current bucket. * @param rightPtr points to the rightmost suffix of the current bucket. - * @param offset is the length of the common prefix of the suffixes rounded down to a - * multiple of q. - * @param q is the initial prefix length used for the bucket sort. It also determines - * the increase of offset. + * @param offset is the length of the common prefix of the suffixes rounded down to a + * multiple of q. + * @param q is the initial prefix length used for the bucket sort. It also determines + * the increase of offset. 
*/ - private void computeBucketSize2_SaBucket(int leftPtr, int rightPtr, int offset, int q) - { + private void computeBucketSize2_SaBucket(int leftPtr, int rightPtr, int offset, int q) { int suffix1 = suffixArray[leftPtr] + offset; int suffix2 = suffixArray[rightPtr] + offset; - while (sufPtrMap[suffix1] == sufPtrMap[suffix2]) - { + while (sufPtrMap[suffix1] == sufPtrMap[suffix2]) { suffix1 += q; suffix2 += q; } - if (sufPtrMap[suffix1] > sufPtrMap[suffix2]) - { + if (sufPtrMap[suffix1] > sufPtrMap[suffix2]) { int tmpSwap = suffixArray[leftPtr]; suffixArray[leftPtr] = suffixArray[rightPtr]; suffixArray[rightPtr] = tmpSwap; @@ -621,28 +539,24 @@ private void computeBucketSize2_SaBucket(int leftPtr, int rightPtr, int offset, /** * Computes about the LCP of all suffixes in this bucket. It will be the newoffset. - * - * @param leftPtr points to the leftmost suffix of the current bucket. + * + * @param leftPtr points to the leftmost suffix of the current bucket. * @param rightPtr points to the rightmost suffix of the current bucket. - * @param offset is the length of the common prefix of the suffixes rounded down to a - * multiple of q. - * @param q is the initial prefix length used for the bucket sort. It also determines - * the increase of offset. + * @param offset is the length of the common prefix of the suffixes rounded down to a + * multiple of q. + * @param q is the initial prefix length used for the bucket sort. It also determines + * the increase of offset. * @return the LCP of suffixes in this bucket (newoffset). 
*/ private int computeDiffDepthBucket_SaBucket(int leftPtr, int rightPtr, int offset, - int q) - { + int q) { int lcp = offset; - while (true) - { + while (true) { int runPtr = leftPtr; int a = suffixArray[rightPtr]; int tmpPtr = sufPtrMap[a + lcp]; - while (runPtr < rightPtr) - { - if (sufPtrMap[suffixArray[runPtr] + lcp] != tmpPtr) - { + while (runPtr < rightPtr) { + if (sufPtrMap[suffixArray[runPtr] + lcp] != tmpPtr) { return lcp; } runPtr++; @@ -655,28 +569,25 @@ private int computeDiffDepthBucket_SaBucket(int leftPtr, int rightPtr, int offse * Ternary partitioning of buckets with Lomuto's scheme. Subbuckets of size 2 and 3 * are directly sorted and partitions smaller than a given threshold are sorted by * insertion sort. - * - * @param leftPtr points to the leftmost position of the current bucket. + * + * @param leftPtr points to the leftmost position of the current bucket. * @param rightPtr points to the rightmost position of the current bucket. - * @param offset is the length of the common prefix of the suffixes (a multiple of q). - * @param q is the initial prefix length used for the bucket sort. It also determines - * the increase of offset. + * @param offset is the length of the common prefix of the suffixes (a multiple of q). + * @param q is the initial prefix length used for the bucket sort. It also determines + * the increase of offset. */ private void partitionUpdateRecurse_SaBucket(int leftPtr, int rightPtr, int offset, - int q) - { + int q) { int pivot; int tmpSize = rightPtr - leftPtr; - if (tmpSize < 10000) - { + if (tmpSize < 10000) { tmpSize = tmpSize / 4; pivot = sufPtrMap[suffixArray[leftPtr + tmpSize] + offset]; int pivotb = sufPtrMap[suffixArray[leftPtr + 2 * tmpSize] + offset]; int pivotc = sufPtrMap[suffixArray[rightPtr - tmpSize] + offset]; int medNumber = medianOfThreeUlong(pivot, pivotb, pivotc); int pivotPtr = leftPtr + tmpSize; - if (medNumber > 0) - { + if (medNumber > 0) { pivotPtr = (medNumber == 1) ? 
(leftPtr + 2 * tmpSize) : (rightPtr - tmpSize); pivot = (medNumber == 1) ? pivotb : pivotc; @@ -684,25 +595,20 @@ private void partitionUpdateRecurse_SaBucket(int leftPtr, int rightPtr, int offs int swapTmp = suffixArray[pivotPtr]; suffixArray[pivotPtr] = suffixArray[leftPtr]; suffixArray[leftPtr] = swapTmp; - } - else - { - int [] keyPtrList = new int [9]; + } else { + int[] keyPtrList = new int[9]; tmpSize = tmpSize / 10; int i; - for (i = 0; i < 9; i++) - { + for (i = 0; i < 9; i++) { keyPtrList[i] = leftPtr + (i + 1) * tmpSize; } /* insertion sort */ - for (i = 1; i < 9; i++) - { + for (i = 1; i < 9; i++) { int tempValue = keyPtrList[i]; int tempHashValue = sufPtrMap[suffixArray[tempValue] + offset]; int j = i - 1; while (j >= 0 - && sufPtrMap[suffixArray[keyPtrList[j]] + offset] > tempHashValue) - { + && sufPtrMap[suffixArray[keyPtrList[j]] + offset] > tempHashValue) { keyPtrList[j + 1] = keyPtrList[j]; j--; } @@ -716,28 +622,23 @@ private void partitionUpdateRecurse_SaBucket(int leftPtr, int rightPtr, int offs int pivotRightPtr = leftPtr + 1; while (pivotRightPtr <= rightPtr - && sufPtrMap[suffixArray[pivotRightPtr] + offset] == pivot) - { + && sufPtrMap[suffixArray[pivotRightPtr] + offset] == pivot) { ++pivotRightPtr; } int smallerPivotPtr = pivotRightPtr; while (smallerPivotPtr <= rightPtr - && sufPtrMap[suffixArray[smallerPivotPtr] + offset] < pivot) - { + && sufPtrMap[suffixArray[smallerPivotPtr] + offset] < pivot) { smallerPivotPtr++; } int frontPtr = smallerPivotPtr - 1; - while (frontPtr++ < rightPtr) - { + while (frontPtr++ < rightPtr) { int sortkey = sufPtrMap[suffixArray[frontPtr] + offset]; - if (sortkey <= pivot) - { + if (sortkey <= pivot) { int swapTmp = suffixArray[frontPtr]; suffixArray[frontPtr] = suffixArray[smallerPivotPtr]; suffixArray[smallerPivotPtr] = swapTmp; - if (sortkey == pivot) - { + if (sortkey == pivot) { suffixArray[smallerPivotPtr] = suffixArray[pivotRightPtr]; suffixArray[pivotRightPtr++] = swapTmp; } @@ -746,25 +647,18 @@ 
private void partitionUpdateRecurse_SaBucket(int leftPtr, int rightPtr, int offs } /* vector swap the pivot elements */ int numberSmaller = smallerPivotPtr - pivotRightPtr; - if (numberSmaller > 0) - { + if (numberSmaller > 0) { int swapsize = Math.min((pivotRightPtr - leftPtr), numberSmaller); int pivotRightTmpPtr = leftPtr + swapsize - 1; vectorSwap(leftPtr, pivotRightTmpPtr, smallerPivotPtr - 1); /* recursively sort < partition */ - if (numberSmaller == 1) - { + if (numberSmaller == 1) { sufPtrMap[suffixArray[leftPtr]] = leftPtr; - } - else - { - if (numberSmaller == 2) - { + } else { + if (numberSmaller == 2) { computeBucketSize2_SaBucket(leftPtr, leftPtr + 1, offset, q); - } - else - { + } else { if (numberSmaller == 3) computeBucketSize3_SaBucket(leftPtr, leftPtr + 2, offset, q); else partitionUpdateRecurse_SaBucket(leftPtr, leftPtr + numberSmaller @@ -776,47 +670,34 @@ else partitionUpdateRecurse_SaBucket(leftPtr, leftPtr + numberSmaller /* update pivots and recursively sort = partition */ int leftTmpPtr = leftPtr + numberSmaller; smallerPivotPtr--; - if (leftTmpPtr == smallerPivotPtr) - { + if (leftTmpPtr == smallerPivotPtr) { sufPtrMap[suffixArray[leftTmpPtr]] = leftTmpPtr; if (leftTmpPtr == rightPtr) return; - } - else - { + } else { int newOffset = (pivot == rightPtr) ? 
(2 * offset) : offset + q; - if (leftTmpPtr + 1 == smallerPivotPtr) - { + if (leftTmpPtr + 1 == smallerPivotPtr) { computeBucketSize2_SaBucket(leftTmpPtr, smallerPivotPtr, newOffset, q); if (rightPtr == smallerPivotPtr) return; - } - else - { - if (leftTmpPtr + 2 == smallerPivotPtr) - { + } else { + if (leftTmpPtr + 2 == smallerPivotPtr) { computeBucketSize3_SaBucket(leftTmpPtr, smallerPivotPtr, newOffset, q); if (rightPtr == smallerPivotPtr) return; - } - else - { - if (rightPtr == smallerPivotPtr) - { + } else { + if (rightPtr == smallerPivotPtr) { newOffset = computeDiffDepthBucket_SaBucket(leftPtr + numberSmaller, rightPtr, newOffset, q); partitionUpdateRecurse_SaBucket(leftTmpPtr, rightPtr, newOffset, q); return; } - while (leftTmpPtr <= smallerPivotPtr) - { + while (leftTmpPtr <= smallerPivotPtr) { sufPtrMap[suffixArray[leftTmpPtr]] = smallerPivotPtr; leftTmpPtr++; } - if (smallerPivotPtr < leftPtr + numberSmaller + INSSORT_LIMIT) - { + if (smallerPivotPtr < leftPtr + numberSmaller + INSSORT_LIMIT) { insSortUpdateRecurse_SaBucket(leftPtr + numberSmaller, smallerPivotPtr, newOffset, q); - } - else partitionUpdateRecurse_SaBucket(leftPtr + numberSmaller, + } else partitionUpdateRecurse_SaBucket(leftPtr + numberSmaller, smallerPivotPtr, newOffset, q); } } @@ -824,18 +705,15 @@ else partitionUpdateRecurse_SaBucket(leftPtr + numberSmaller, /* recursively sort > partition */ ++smallerPivotPtr; - if (smallerPivotPtr == rightPtr) - { + if (smallerPivotPtr == rightPtr) { sufPtrMap[suffixArray[rightPtr]] = rightPtr; return; } - if (smallerPivotPtr + 1 == rightPtr) - { + if (smallerPivotPtr + 1 == rightPtr) { computeBucketSize2_SaBucket(smallerPivotPtr, rightPtr, offset, q); return; } - if (smallerPivotPtr + 2 == rightPtr) - { + if (smallerPivotPtr + 2 == rightPtr) { computeBucketSize3_SaBucket(smallerPivotPtr, rightPtr, offset, q); return; } @@ -844,15 +722,13 @@ else partitionUpdateRecurse_SaBucket(leftPtr + numberSmaller, } /** - * @param leftPtr points to the 
leftmost suffix of the first swap space. - * @param rightPtr points to the rightmost suffix of the first swap space. + * @param leftPtr points to the leftmost suffix of the first swap space. + * @param rightPtr points to the rightmost suffix of the first swap space. * @param swapEndPtr points to the leftmost suffix of the second swap space. */ - private void vectorSwap(int leftPtr, int rightPtr, int swapEndPtr) - { + private void vectorSwap(int leftPtr, int rightPtr, int swapEndPtr) { int swapTmp = suffixArray[swapEndPtr]; - while (leftPtr < rightPtr) - { + while (leftPtr < rightPtr) { suffixArray[swapEndPtr] = suffixArray[rightPtr]; swapEndPtr--; suffixArray[rightPtr] = suffixArray[swapEndPtr]; @@ -866,36 +742,31 @@ private void vectorSwap(int leftPtr, int rightPtr, int swapEndPtr) /** * Sorts the alphabet concerning some weight concerning cc bucket size and alphabet * frequency Only works for mapped string with alphabet [0,alphaSize] - * - * @param buckets - the bucket table + * + * @param buckets - the bucket table * @param bucketsInLevel2Bucket - number of subbuckets of level-2 buckets * @return the order of the alphabet according to the weight on buckets with same - * first and second character + * first and second character */ - private int [] getCharWeightedOrder_Alphabet(int [] buckets, int bucketsInLevel2Bucket) - { + private int[] getCharWeightedOrder_Alphabet(int[] buckets, int bucketsInLevel2Bucket) { int alphabetSize = alphabet.size; - int [] charWeight = new int [alphabetSize]; + int[] charWeight = new int[alphabetSize]; int tmpBucketFactor = bucketsInLevel2Bucket * (alphabetSize + 1); int i; - for (i = 0; i < alphabetSize; i++) - { + for (i = 0; i < alphabetSize; i++) { charWeight[i] = alphabet.charFreq[i]; charWeight[i] -= buckets[i * tmpBucketFactor + bucketsInLevel2Bucket] - buckets[i * tmpBucketFactor]; } - int [] targetCharArray = new int [alphabetSize + 1]; - for (i = 0; i < alphabetSize; i++) - { + int[] targetCharArray = new int[alphabetSize 
+ 1]; + for (i = 0; i < alphabetSize; i++) { targetCharArray[i] = i; } - for (i = 1; i < alphabet.size; i++) - { + for (i = 1; i < alphabet.size; i++) { int tmpWeight = charWeight[i]; int j = i; - while (j > 0 && tmpWeight < charWeight[targetCharArray[j - 1]]) - { + while (j > 0 && tmpWeight < charWeight[targetCharArray[j - 1]]) { targetCharArray[j] = targetCharArray[j - 1]; j--; } @@ -907,19 +778,15 @@ private void vectorSwap(int leftPtr, int rightPtr, int swapEndPtr) /** * Constructs all buckets w.r.t. q-gram size q, the up to prefix q sorted suffix * array, and the bucket-pointer table. - * + * * @param q size of q-gram. * @return Buckets containing pointers into the suffix array. */ - private int [] determine_Buckets_Sarray_Sptrmap(int q) - { + private int[] determine_Buckets_Sarray_Sptrmap(int q) { - if (kbs_getExp_Ulong(2, alphabet.size) >= 0) - { + if (kbs_getExp_Ulong(2, alphabet.size) >= 0) { return determinePower2Alpha_Buckets_Sarray_Sptrmap(q); - } - else - { + } else { return determineAll_Buckets_Sarray_Sptrmap(q); } } @@ -927,16 +794,15 @@ private void vectorSwap(int leftPtr, int rightPtr, int swapEndPtr) /** * Constructs all buckets w.r.t. q-gram size q, the up to prefix q sorted suffix * array, and the bucket-pointer table. - * + * * @param q size of q-gram. * @return Buckets containing pointers into the suffix array. 
* @see #determine_Buckets_Sarray_Sptrmap */ - private int [] determineAll_Buckets_Sarray_Sptrmap(int q) - { - int [] buckets = determineAll_Buckets_Sarray(q); + private int[] determineAll_Buckets_Sarray_Sptrmap(int q) { + int[] buckets = determineAll_Buckets_Sarray(q); int strLen = length; - sufPtrMap = new int [strLen + 2 * q + 1]; + sufPtrMap = new int[strLen + 2 * q + 1]; /* computation of first hashvalue */ int alphabetSize = alphabet.size; @@ -944,16 +810,14 @@ private void vectorSwap(int leftPtr, int rightPtr, int swapEndPtr) int tempPower = 1; int hashCode = 0; int i; - for (i = q - 1; i >= 0; i--) - { + for (i = q - 1; i >= 0; i--) { hashCode += seq[start + mappedUcharArray + i] * tempPower; tempPower *= alphabetSize; } int tempModulo = kbs_power_Ulong(alphabetSize, q - 1); mappedUcharArray += q; int j; - for (j = 0; j < strLen - 1; j++) - { + for (j = 0; j < strLen - 1; j++) { sufPtrMap[j] = (buckets[hashCode + 1]) - 1; hashCode -= (seq[start + mappedUcharArray - q]) * tempModulo; hashCode *= alphabetSize; @@ -963,8 +827,7 @@ private void vectorSwap(int leftPtr, int rightPtr, int swapEndPtr) sufPtrMap[j] = buckets[hashCode]; /* set the values in sufPtrMap[strLen..strLen+2*d] to [-1, -2, ..., -2*d] */ int beginPtr = -1; - for (j = strLen; j <= strLen + 2 * q; j++) - { + for (j = strLen; j <= strLen + 2 * q; j++) { sufPtrMap[j] = beginPtr--; } return buckets; @@ -974,34 +837,30 @@ private void vectorSwap(int leftPtr, int rightPtr, int swapEndPtr) * Constructs all buckets w.r.t. q-gram size and the up to prefix q sorted suffix * array. Call determine_Buckets_Sarray(const Kbs_Ustring *const ustring, register * const Kbs_Ulong q, Kbs_Ulong **suffixArrayPtr) instead - * + * * @param q size of q-gram. * @return Buckets containing pointers into the suffix array. 
* @see #determine_Buckets_Sarray_Sptrmap(int) */ - private int [] determineAll_Buckets_Sarray(int q) - { + private int[] determineAll_Buckets_Sarray(int q) { int strLen = length; int alphabetSize = alphabet.size; int numberBuckets = kbs_power_Ulong(alphabetSize, q); - int [] buckets = new int [numberBuckets + 1]; - for (int i = 0; i < q; i++) - { + int[] buckets = new int[numberBuckets + 1]; + for (int i = 0; i < q; i++) { seq[start + length + i] = alphabet.charArray[0]; } - for (int i = 0; i < KBS_STRING_EXTENSION_SIZE - q; i++) - { + for (int i = 0; i < KBS_STRING_EXTENSION_SIZE - q; i++) { seq[start + length + i + q] = 0; } /* computation of first hashvalue */ - int [] alphaMap = alphabet.alphaMapping; + int[] alphaMap = alphabet.alphaMapping; int mappedUcharArray = 0; int hashCode = 0; int tempPower = 1; int i; - for (i = q - 1; i >= 0; i--) - { + for (i = q - 1; i >= 0; i--) { hashCode += (seq[start + mappedUcharArray + i] = alphaMap[seq[start + mappedUcharArray + i]]) * tempPower; @@ -1013,8 +872,7 @@ private void vectorSwap(int leftPtr, int rightPtr, int swapEndPtr) mappedUcharArray += q; buckets[hashCode]++; int j; - for (j = 1; j < strLen; j++) - { + for (j = 1; j < strLen; j++) { hashCode -= (seq[start + mappedUcharArray - q]) * tempModulo; hashCode *= alphabetSize; hashCode += seq[start + mappedUcharArray] = alphaMap[seq[start @@ -1023,31 +881,27 @@ private void vectorSwap(int leftPtr, int rightPtr, int swapEndPtr) buckets[hashCode]++; } /* update the alphabet for mapped string */ - for (j = 0; j < alphabetSize; j++) - { + for (j = 0; j < alphabetSize; j++) { alphabet.charFreq[j] = alphabet.charFreq[alphabet.charArray[j]]; alphabet.charArray[j] = j; alphaMap[j] = j; } - for (; j < KBS_MAX_ALPHABET_SIZE; j++) - { + for (; j < KBS_MAX_ALPHABET_SIZE; j++) { alphaMap[j] = -1; } - this.suffixArray = new int [strLen + 1]; + this.suffixArray = new int[strLen + 1]; /* computation of the bucket pointers, pointers into the suffix array */ - for (j = 1; j <= 
numberBuckets; j++) - { + for (j = 1; j <= numberBuckets; j++) { buckets[j] = buckets[j - 1] + buckets[j]; } /* computation of the suffix array (buckets that are copied later are left out) */ - int [] charRank = getCharWeightedRank_Alphabet(buckets, q); + int[] charRank = getCharWeightedRank_Alphabet(buckets, q); mappedUcharArray = q; hashCode = firstHashCode; - for (j = 0; j < strLen - 1; j++) - { + for (j = 0; j < strLen - 1; j++) { int c1; buckets[hashCode]--; if ((c1 = charRank[seq[start + mappedUcharArray - q]]) < charRank[seq[start @@ -1069,26 +923,23 @@ private void vectorSwap(int leftPtr, int rightPtr, int swapEndPtr) /** * Constructs all buckets w.r.t. q-gram size q, the up to prefix length q sorted * suffix array, and the bucket-pointer table. - * + * * @param q size of q-gram. * @return Buckets containing pointers into the suffix array. * @see #determine_Buckets_Sarray_Sptrmap */ - private int [] determinePower2Alpha_Buckets_Sarray_Sptrmap(int q) - { + private int[] determinePower2Alpha_Buckets_Sarray_Sptrmap(int q) { int strLen = length; int exp2 = kbs_getExp_Ulong(2, alphabet.size); - if (exp2 < 0) - { + if (exp2 < 0) { throw new RuntimeException("value out of bounds"); } - int [] buckets = determinePower2Alpha_Buckets_Sarray(q); - this.sufPtrMap = new int [strLen + 2 * q + 1]; + int[] buckets = determinePower2Alpha_Buckets_Sarray(q); + this.sufPtrMap = new int[strLen + 2 * q + 1]; int mappedUcharArray = 0; int hashCode = 0; int j; - for (j = 0; j < q; j++) - { + for (j = 0; j < q; j++) { hashCode = hashCode << exp2; hashCode += seq[start + mappedUcharArray + j]; } @@ -1097,8 +948,7 @@ private void vectorSwap(int leftPtr, int rightPtr, int swapEndPtr) tempModulo = tempModulo << (exp2 * (q - 1)); tempModulo = ~tempModulo; mappedUcharArray += q; - for (j = 0; j < strLen - 1; j++) - { + for (j = 0; j < strLen - 1; j++) { sufPtrMap[j] = (buckets[hashCode + 1]) - 1; hashCode = hashCode & tempModulo; hashCode = hashCode << exp2; @@ -1107,36 +957,29 @@ 
private void vectorSwap(int leftPtr, int rightPtr, int swapEndPtr) } sufPtrMap[j] = buckets[hashCode]; int beginPtr = -1; - for (j = strLen; j <= strLen + 2 * q; j++) - { + for (j = strLen; j <= strLen + 2 * q; j++) { sufPtrMap[j] = beginPtr--; } return buckets; } - private int kbs_power_Ulong(int base, int exp) - { + private int kbs_power_Ulong(int base, int exp) { int p; - if (exp == 0) - { + if (exp == 0) { return 1; } - if (exp == 1) - { + if (exp == 1) { return base; } - if (base == 4) - { - if (exp > 15) - { + if (base == 4) { + if (exp > 15) { throw new RuntimeException(); } return 4 << (2 * (exp - 1)); } p = 1; - for (; exp > 0; --exp) - { + for (; exp > 0; --exp) { p = p * base; } return p; @@ -1146,29 +989,25 @@ private int kbs_power_Ulong(int base, int exp) * Constructs all buckets w.r.t. q-gram size q and the up to prefix q sorted suffix * array. Precondition: ustring->alphabet->alphaSize = 2^x for some x; otherwise, call * determine_Buckets_Sarray. - * + * * @param q size of q-gram. * @return Buckets containing pointers into the suffix array. 
* @see #determine_Buckets_Sarray_Sptrmap(int) */ - private int [] determinePower2Alpha_Buckets_Sarray(int q) - { + private int[] determinePower2Alpha_Buckets_Sarray(int q) { int exp2 = kbs_getExp_Ulong(2, alphabet.size); int strLen = length; int mappedUcharArray = 0; - for (int i = 0; i < q; i++) - { + for (int i = 0; i < q; i++) { seq[start + length + i] = alphabet.charArray[0]; } - for (int i = length + q; i < length + KBS_STRING_EXTENSION_SIZE - q; i++) - { + for (int i = length + q; i < length + KBS_STRING_EXTENSION_SIZE - q; i++) { seq[start + i] = 0; } int numberBuckets = kbs_power_Ulong(alphabet.size, q); - int [] buckets = new int [numberBuckets + 1]; + int[] buckets = new int[numberBuckets + 1]; int hashCode = 0; - for (int j = 0; j < q; j++) - { + for (int j = 0; j < q; j++) { hashCode = hashCode << exp2; hashCode += (seq[start + mappedUcharArray + j] = alphabet.alphaMapping[seq[start + mappedUcharArray + j]]); @@ -1182,44 +1021,39 @@ private int kbs_power_Ulong(int base, int exp) mappedUcharArray += q; buckets[hashCode]++; - for (int j = 1; j < strLen; j++) - { + for (int j = 1; j < strLen; j++) { hashCode = hashCode & tempModulo; hashCode = hashCode << exp2; hashCode = hashCode | (seq[start + mappedUcharArray] = alphabet.alphaMapping[seq[start - + mappedUcharArray]]); + + mappedUcharArray]]); mappedUcharArray++; buckets[hashCode]++; } /* update the alphabet for mapped string */ int j; - for (j = 0; j < alphabet.size; j++) - { + for (j = 0; j < alphabet.size; j++) { alphabet.charFreq[j] = alphabet.charFreq[alphabet.charArray[j]]; alphabet.charArray[j] = j; alphabet.alphaMapping[j] = j; } - for (; j < KBS_MAX_ALPHABET_SIZE; j++) - { + for (; j < KBS_MAX_ALPHABET_SIZE; j++) { alphabet.alphaMapping[j] = -1; } - this.suffixArray = new int [strLen + 1]; + this.suffixArray = new int[strLen + 1]; /* computation of the bucket pointers, pointers into the suffix array */ - for (j = 1; j <= numberBuckets; j++) - { + for (j = 1; j <= numberBuckets; j++) { buckets[j] 
= buckets[j - 1] + buckets[j]; } /* computation of the suffix array */ - int [] charRank = getCharWeightedRank_Alphabet(buckets, q); + int[] charRank = getCharWeightedRank_Alphabet(buckets, q); mappedUcharArray = q; hashCode = firstHashCode; - for (j = 0; j < strLen - 1; j++) - { + for (j = 0; j < strLen - 1; j++) { int c1; buckets[hashCode]--; if ((c1 = charRank[seq[start + mappedUcharArray - q]]) < charRank[seq[start @@ -1241,23 +1075,21 @@ private int kbs_power_Ulong(int base, int exp) /** * Sorts the alphabet regarding some weight according to cc bucket size and alphabet * frequency Only works for mapped string with alphabet [0,alphaSize] - * + * * @param buckets - the bucket table - * @param q - the initial q-gram size + * @param q - the initial q-gram size * @return the rank of each character */ - private int [] getCharWeightedRank_Alphabet(int [] buckets, int q) - { + private int[] getCharWeightedRank_Alphabet(int[] buckets, int q) { int alphabetSize = alphabet.size; - int [] charWeight = new int [alphabetSize]; + int[] charWeight = new int[alphabetSize]; int bucketsInLevel2Bucket = kbs_power_Ulong(alphabetSize, q - 2); int tmpBucketFactor = bucketsInLevel2Bucket * (alphabetSize + 1); int i; charWeight[0] = alphabet.charFreq[0]; charWeight[0] -= buckets[bucketsInLevel2Bucket - 1]; - for (i = 1; i < alphabetSize - 1; i++) - { + for (i = 1; i < alphabetSize - 1; i++) { charWeight[i] = alphabet.charFreq[i]; charWeight[i] -= buckets[i * tmpBucketFactor + bucketsInLevel2Bucket - 1] - buckets[i * tmpBucketFactor - 1]; @@ -1267,49 +1099,40 @@ private int kbs_power_Ulong(int base, int exp) + bucketsInLevel2Bucket - 1] - buckets[(alphabetSize - 1) * tmpBucketFactor - 1]; - int [] targetCharArray = new int [alphabetSize]; - for (i = 0; i < alphabetSize; i++) - { + int[] targetCharArray = new int[alphabetSize]; + for (i = 0; i < alphabetSize; i++) { targetCharArray[i] = i; } /* insertion sort by charWeight */ - for (i = 1; i < alphabet.size; i++) - { + for (i = 1; i < 
alphabet.size; i++) { int tmpWeight = charWeight[i]; int j = i; - while (j > 0 && tmpWeight < charWeight[targetCharArray[j - 1]]) - { + while (j > 0 && tmpWeight < charWeight[targetCharArray[j - 1]]) { targetCharArray[j] = targetCharArray[j - 1]; j--; } targetCharArray[j] = i; } - int [] charRank = new int [alphabetSize + 1]; - for (i = 0; i < alphabetSize; i++) - { + int[] charRank = new int[alphabetSize + 1]; + for (i = 0; i < alphabetSize; i++) { charRank[targetCharArray[i]] = i; } return charRank; } /** - * + * */ - private int kbs_getExp_Ulong(int base, int value) - { + private int kbs_getExp_Ulong(int base, int value) { int exp = 0; int tmpValue = 1; - while (tmpValue < value) - { + while (tmpValue < value) { tmpValue *= base; exp++; } - if (tmpValue == value) - { + if (tmpValue == value) { return exp; - } - else - { + } else { return -1; } @@ -1321,14 +1144,11 @@ private int kbs_getExp_Ulong(int base, int value) * @param c third key * @return 0 if a is the median, 1 if b is the median, 2 if c is the median. */ - private int medianOfThreeUlong(int a, int b, int c) - { - if (a == b || a == c) - { + private int medianOfThreeUlong(int a, int b, int c) { + if (a == b || a == c) { return 0; } - if (b == c) - { + if (b == c) { return 2; } return a < b ? (b < c ? 1 : (a < c ? 2 : 0)) : (b > c ? 1 : (a < c ? 0 : 2)); diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/CharSequenceAdapter.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/CharSequenceAdapter.java index f548eaf41..6dbeffc8e 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/CharSequenceAdapter.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/CharSequenceAdapter.java @@ -2,47 +2,42 @@ /** * An adapter for constructing suffix arrays on character sequences. 
- * - * @see SuffixArrays#create(CharSequence) - * @see SuffixArrays#create(CharSequence, ISuffixArrayBuilder) * * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) + * @see SuffixArrays#create(CharSequence) + * @see SuffixArrays#create(CharSequence, ISuffixArrayBuilder) */ -final class CharSequenceAdapter -{ +final class CharSequenceAdapter { private final ISuffixArrayBuilder delegate; /** * Last mapped input in {@link #buildSuffixArray(CharSequence)}. */ - int [] input; + int[] input; /** * Construct an adapter with a given underlying suffix array construction strategy. * The suffix array builder should accept non-negative characters, with a possibly * large alphabet size. - * + * * @see DensePositiveDecorator */ - public CharSequenceAdapter(ISuffixArrayBuilder builder) - { + public CharSequenceAdapter(ISuffixArrayBuilder builder) { this.delegate = builder; } /** * Construct a suffix array for a given character sequence. */ - public int [] buildSuffixArray(CharSequence sequence) - { + public int[] buildSuffixArray(CharSequence sequence) { /* * Allocate slightly more space, some suffix construction strategies need it and * we don't want to waste space for multiple symbol mappings. 
*/ - this.input = new int [sequence.length() + SuffixArrays.MAX_EXTRA_TRAILING_SPACE]; - for (int i = sequence.length() - 1; i >= 0; i--) - { + this.input = new int[sequence.length() + SuffixArrays.MAX_EXTRA_TRAILING_SPACE]; + for (int i = sequence.length() - 1; i >= 0; i--) { input[i] = sequence.charAt(i); } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DeepShallow.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DeepShallow.java index 9a90d3cdb..ede4c188c 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DeepShallow.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DeepShallow.java @@ -17,200 +17,200 @@ * @author Dawid Weiss (Carrot Search) */ public class DeepShallow implements ISuffixArrayBuilder { - private static class SplitGroupResult { - final int equal; - final int lower; - - public SplitGroupResult(int equal, int lower) { - this.equal = equal; - this.lower = lower; - } - } - - private static class Node { - int skip; - int key; - Node right; - // original author uses down as a pointer to another Node, but sometimes he stores - // int values in it. Because of that, we have two following variables (luckily we - // could do so :)). - Node down; - int downInt; - } - - /** - * TODO: What is this magic constant? Do not make it public and do not reuse it anywhere where it isn't needed - * (especially not in the tests). If this algorithm has special considerations, we can run algorithm-specific tests - * with an appropriate decorator. 
- */ - final static int OVERSHOOT = 575; - private final static int SETMASK = 1 << 30; - private final static int CLEARMASK = ~SETMASK; - private final static int MARKER = 1 << 31; - - /** - * recursion limit for mk quicksort: - */ - private final static int MK_QS_TRESH = 20; - - private final static int MAX_TRESH = 30; - - /** - * limit for shallow_sort - */ - private final static int SHALLOW_LIMIT = 550; - - /** - * maximum offset considered when searching a pseudo anchor - */ - private final static int MAX_PSEUDO_ANCHOR_OFFSET = 0; - - /** - * maximum ratio bucket_size/group_size accepted for pseudo anchor_sorting - */ - private final static int B2G_RATIO = 1000; - - /** - * Update anchor ranks when determining rank for pseudo-sorting - */ - private final static boolean UPDATE_ANCHOR_RANKS = false; - - /** - * blind sort is used for groups of size ≤ Text_size/Blind_sort_ratio - */ - private final static int BLIND_SORT_RATIO = 2000; - - private final static int STACK_SIZE = 100; - - private int[] text; - private int textSize; - private int[] suffixArray; - private int anchorDist; // distance between anchors - private int anchorNum; - private int[] anchorOffset; - private int[] anchorRank; - private final int[] ftab = new int[66049]; - private final int[] bucketRanked = new int[66049]; - private final int[] runningOrder = new int[257]; - private final int[] lcpAux = new int[1 + MAX_TRESH]; - private int lcp; - private int cmpLeft; - private int cmpDone; - private int aux; - private int auxWritten; - private int stackSize; - private Node[] stack; - private int start; - - /** - * If true, {@link #buildSuffixArray(int[], int, int)} uses a copy of the input so it is left intact. - */ - private final boolean preserveInput; - - public DeepShallow() { - preserveInput = true; - } - - public DeepShallow(boolean preserveInput) { - this.preserveInput = preserveInput; - } - - /** - * {@inheritDoc} - *

    - * Additional constraints enforced by Deep-Shallow algorithm: - *

      - *
    • non-negative (≥0) symbols in the input
    • - *
    • maximal symbol value < 256
    • - *
    • input.length ≥ start + length if {@link #preserveInput} is true
    • - *
    • input.length ≥ start + length + {@link #OVERSHOOT} if {@link #preserveInput} is false
    • - *
    • length >= 2
    • - *
    - */ - @Override - public int[] buildSuffixArray(int[] input, int start, int length) { - Tools.assertAlways(input.length >= start + length, "Input array is too short"); - - MinMax mm = Tools.minmax(input, start, length); - Tools.assertAlways(mm.min >= 0, "input must not be negative"); - Tools.assertAlways(mm.max < 256, "max alphabet size is 256"); - - lcp = 1; - stack = new Node[length]; - this.start = start; - if (preserveInput) { - this.start = 0; - text = new int[length + OVERSHOOT]; - System.arraycopy(input, start, text, 0, length); - } else { - Tools.assertAlways(input.length >= start + length + OVERSHOOT, - "Input array length must have a trailing space of at least " + OVERSHOOT - + " bytes."); - text = input; - } - - for (int i = length; i < length + OVERSHOOT; i++) { - text[this.start + i] = 0; - } - - textSize = length; - suffixArray = new int[length]; - - int i, j, ss, sb, k, c1, c2, numQSorted = 0; - boolean[] bigDone = new boolean[257]; - int[] copyStart = new int[257]; - int[] copyEnd = new int[257]; - - // ------ init array containing positions of anchors - if (anchorDist == 0) { - anchorNum = 0; - } else { - anchorNum = 2 + (length - 1) / anchorDist; // see comment for helped_sort() - anchorRank = new int[anchorNum]; - anchorOffset = new int[anchorNum]; - for (i = 0; i < anchorNum; i++) { - anchorRank[i] = -1; // pos of anchors is initially unknown - anchorOffset[i] = anchorDist; // maximum possible value - } - } - - // ---------- init ftab ------------------ - // at first, clear values in ftab - for (i = 0; i < 66049; i++) - ftab[i] = 0; - - c1 = text[this.start + 0]; - for (i = 1; i <= textSize; i++) { - c2 = text[this.start + i]; - ftab[(c1 << 8) + c2]++; - c1 = c2; - } - for (i = 1; i < 66049; i++) - ftab[i] += ftab[i - 1]; - - // -------- sort suffixes considering only the first two chars - c1 = text[this.start + 0]; - for (i = 0; i < textSize; i++) { - c2 = text[this.start + i + 1]; - j = (c1 << 8) + c2; - c1 = c2; - ftab[j]--; - 
suffixArray[ftab[j]] = i; - } + private static class SplitGroupResult { + final int equal; + final int lower; + + public SplitGroupResult(int equal, int lower) { + this.equal = equal; + this.lower = lower; + } + } + + private static class Node { + int skip; + int key; + Node right; + // original author uses down as a pointer to another Node, but sometimes he stores + // int values in it. Because of that, we have two following variables (luckily we + // could do so :)). + Node down; + int downInt; + } + + /** + * TODO: What is this magic constant? Do not make it public and do not reuse it anywhere where it isn't needed + * (especially not in the tests). If this algorithm has special considerations, we can run algorithm-specific tests + * with an appropriate decorator. + */ + final static int OVERSHOOT = 575; + private final static int SETMASK = 1 << 30; + private final static int CLEARMASK = ~SETMASK; + private final static int MARKER = 1 << 31; + + /** + * recursion limit for mk quicksort: + */ + private final static int MK_QS_TRESH = 20; + + private final static int MAX_TRESH = 30; + + /** + * limit for shallow_sort + */ + private final static int SHALLOW_LIMIT = 550; + + /** + * maximum offset considered when searching a pseudo anchor + */ + private final static int MAX_PSEUDO_ANCHOR_OFFSET = 0; + + /** + * maximum ratio bucket_size/group_size accepted for pseudo anchor_sorting + */ + private final static int B2G_RATIO = 1000; + + /** + * Update anchor ranks when determining rank for pseudo-sorting + */ + private final static boolean UPDATE_ANCHOR_RANKS = false; + + /** + * blind sort is used for groups of size ≤ Text_size/Blind_sort_ratio + */ + private final static int BLIND_SORT_RATIO = 2000; + + private final static int STACK_SIZE = 100; + + private int[] text; + private int textSize; + private int[] suffixArray; + private int anchorDist; // distance between anchors + private int anchorNum; + private int[] anchorOffset; + private int[] anchorRank; + private 
final int[] ftab = new int[66049]; + private final int[] bucketRanked = new int[66049]; + private final int[] runningOrder = new int[257]; + private final int[] lcpAux = new int[1 + MAX_TRESH]; + private int lcp; + private int cmpLeft; + private int cmpDone; + private int aux; + private int auxWritten; + private int stackSize; + private Node[] stack; + private int start; + + /** + * If true, {@link #buildSuffixArray(int[], int, int)} uses a copy of the input so it is left intact. + */ + private final boolean preserveInput; + + public DeepShallow() { + preserveInput = true; + } + + public DeepShallow(boolean preserveInput) { + this.preserveInput = preserveInput; + } + + /** + * {@inheritDoc} + *

    + * Additional constraints enforced by Deep-Shallow algorithm: + *

      + *
    • non-negative (≥0) symbols in the input
    • + *
    • maximal symbol value < 256
    • + *
    • input.lengthstart + length if {@link #preserveInput} is true
    • + *
    • input.lengthstart + length + {@link #OVERSHOOT} if {@link #preserveInput} is false
    • + *
    • length ≥ 2
    • + *
    + */ + @Override + public int[] buildSuffixArray(int[] input, int start, int length) { + Tools.assertAlways(input.length >= start + length, "Input array is too short"); + + MinMax mm = Tools.minmax(input, start, length); + Tools.assertAlways(mm.min >= 0, "input must not be negative"); + Tools.assertAlways(mm.max < 256, "max alphabet size is 256"); + + lcp = 1; + stack = new Node[length]; + this.start = start; + if (preserveInput) { + this.start = 0; + text = new int[length + OVERSHOOT]; + System.arraycopy(input, start, text, 0, length); + } else { + Tools.assertAlways(input.length >= start + length + OVERSHOOT, + "Input array length must have a trailing space of at least " + OVERSHOOT + + " bytes."); + text = input; + } + + for (int i = length; i < length + OVERSHOOT; i++) { + text[this.start + i] = 0; + } + + textSize = length; + suffixArray = new int[length]; + + int i, j, ss, sb, k, c1, c2, numQSorted = 0; + boolean[] bigDone = new boolean[257]; + int[] copyStart = new int[257]; + int[] copyEnd = new int[257]; + + // ------ init array containing positions of anchors + if (anchorDist == 0) { + anchorNum = 0; + } else { + anchorNum = 2 + (length - 1) / anchorDist; // see comment for helped_sort() + anchorRank = new int[anchorNum]; + anchorOffset = new int[anchorNum]; + for (i = 0; i < anchorNum; i++) { + anchorRank[i] = -1; // pos of anchors is initially unknown + anchorOffset[i] = anchorDist; // maximum possible value + } + } + + // ---------- init ftab ------------------ + // at first, clear values in ftab + for (i = 0; i < 66049; i++) + ftab[i] = 0; + + c1 = text[this.start + 0]; + for (i = 1; i <= textSize; i++) { + c2 = text[this.start + i]; + ftab[(c1 << 8) + c2]++; + c1 = c2; + } + for (i = 1; i < 66049; i++) + ftab[i] += ftab[i - 1]; + + // -------- sort suffixes considering only the first two chars + c1 = text[this.start + 0]; + for (i = 0; i < textSize; i++) { + c2 = text[this.start + i + 1]; + j = (c1 << 8) + c2; + c1 = c2; + ftab[j]--; + 
suffixArray[ftab[j]] = i; + } /* decide on the running order */ - calculateRunningOrder(); - for (i = 0; i < 257; i++) { - bigDone[i] = false; - } + calculateRunningOrder(); + for (i = 0; i < 257; i++) { + bigDone[i] = false; + } /* Really do the suffix sorting */ - for (i = 0; i <= 256; i++) { + for (i = 0; i <= 256; i++) { /*-- - Process big buckets, starting with the least full. + Process big buckets, starting with the least full. --*/ - ss = runningOrder[i]; + ss = runningOrder[i]; /*-- Complete the big bucket [ss] by sorting any unsorted small buckets [ss, j]. Hopefully @@ -218,1639 +218,1639 @@ public int[] buildSuffixArray(int[] input, int start, int length) { completed many of the small buckets [ss, j], so we don't have to sort them at all. --*/ - for (j = 0; j <= 256; j++) { - if (j != ss) { - sb = (ss << 8) + j; - if ((ftab[sb] & SETMASK) == 0) { - int lo = ftab[sb] & CLEARMASK; - int hi = (ftab[sb + 1] & CLEARMASK) - 1; - if (hi > lo) { - shallowSort(lo, hi - lo + 1); - numQSorted += (hi - lo + 1); - } - } - ftab[sb] |= SETMASK; - } - } - { - for (j = 0; j <= 256; j++) { - copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK; - copyEnd[j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1; - } - // take care of the virtual -1 char in position textSize+1 - if (ss == 0) { - k = textSize - 1; - c1 = text[this.start + k]; - if (!bigDone[c1]) - suffixArray[copyStart[c1]++] = k; - } - for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) { - k = suffixArray[j] - 1; - if (k < 0) - continue; - c1 = text[this.start + k]; - if (!bigDone[c1]) - suffixArray[copyStart[c1]++] = k; - } - for (j = (ftab[(ss + 1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) { - k = suffixArray[j] - 1; - if (k < 0) - continue; - c1 = text[this.start + k]; - if (!bigDone[c1]) - suffixArray[copyEnd[c1]--] = k; - } - } - for (j = 0; j <= 256; j++) - ftab[(j << 8) + ss] |= SETMASK; - bigDone[ss] = true; - }// endfor - - return suffixArray; - } - - /** - * This is the multikey quicksort from 
bentley-sedgewick modified so that it stops recursion when depth reaches - * {@link #SHALLOW_LIMIT} (that is when two or more suffixes have {@link #SHALLOW_LIMIT} chars in common). - */ - private void shallowSort(int a, int n) { - // call multikey quicksort - // skip 2 chars since suffixes come from the same bucket - shallowMkq32(a, n, 2); - - } - - /** - * recursive multikey quicksort from Bentley-Sedgewick. - *

    - * Stops when text_depth reaches {@link #SHALLOW_LIMIT} that is when we have found that the current set of strings - * have {@link #SHALLOW_LIMIT} chars in common - */ - private void shallowMkq32(int a, int n, int text_depth) { - - int partval, val; - int pa = 0, pb = 0, pc = 0, pd = 0, pl = 0, pm = 0, pn = 0;// pointers - int d, r; - int next_depth;// text pointer - boolean repeatFlag = true; - - // ---- On small arrays use insertions sort - if (n < MK_QS_TRESH) { - shallowInssortLcp(a, n, text_depth); - return; - } - - // ----------- choose pivot -------------- - while (repeatFlag) { - - repeatFlag = false; - pl = a; - pm = a + (n / 2); - pn = a + (n - 1); - if (n > 30) { // On big arrays, pseudomedian of 9 - d = (n / 8); - pl = med3(pl, pl + d, pl + 2 * d, text_depth); - pm = med3(pm - d, pm, pm + d, text_depth); - pn = med3(pn - 2 * d, pn - d, pn, text_depth); - } - pm = med3(pl, pm, pn, text_depth); - swap2(a, pm); - partval = ptr2char32(a, text_depth); - pa = pb = a + 1; - pc = pd = a + n - 1; - // -------- partition ----------------- - for (;;) { - while (pb <= pc && (val = ptr2char32(pb, text_depth)) <= partval) { - if (val == partval) { - swap2(pa, pb); - pa++; - } - pb++; - } - while (pb <= pc && (val = ptr2char32(pc, text_depth)) >= partval) { - if (val == partval) { - swap2(pc, pd); - pd--; - } - pc--; - } - if (pb > pc) - break; - swap2(pb, pc); - pb++; - pc--; - } - if (pa > pd) { - // all values were equal to partval: make it simpler - if ((next_depth = text_depth + 4) >= SHALLOW_LIMIT) { - helpedSort(a, n, next_depth); - return; - } else { - text_depth = next_depth; - repeatFlag = true; - } - } - - } - // partition a[] into the values smaller, equal, and larger that partval - pn = a + n; - r = min(pa - a, pb - pa); - vecswap2(a, pb - r, r); - r = min(pd - pc, pn - pd - 1); - vecswap2(pb, pn - r, r); - // --- sort smaller strings ------- - if ((r = pb - pa) > 1) - shallowMkq32(a, r, text_depth); - // --- sort strings starting with partval ----- 
- if ((next_depth = text_depth + 4) < SHALLOW_LIMIT) - shallowMkq32(a + r, pa - pd + n - 1, next_depth); - else - helpedSort(a + r, pa - pd + n - 1, next_depth); - if ((r = pd - pc) > 1) - shallowMkq32(a + n - r, r, text_depth); - - } - - private void vecswap2(int a, int b, int n) { - while (n-- > 0) { - int t = suffixArray[a]; - suffixArray[a++] = suffixArray[b]; - suffixArray[b++] = t; - } - } - - private static int min(int i, int j) { - return i < j ? i : j; - } - - /** - * this is the insertion sort routine called by multikey-quicksort for sorting small groups. During insertion sort - * the comparisons are done calling cmp_unrolled_shallow_lcp() and two strings are equal if the coincides for - * SHALLOW_LIMIT characters. After this first phase we sort groups of "equal_string" using helped_sort(). - *

    - */ - private void shallowInssortLcp(int a, int n, int text_depth) { - int i, j, j1, lcp_new, r, ai, lcpi; - int cmp_from_limit; - int text_depth_ai;// pointer - // --------- initialize ---------------- - - lcpAux[0] = -1; // set lcp[-1] = -1 - for (i = 0; i < n; i++) { - lcpAux[lcp + i] = 0; - } - cmp_from_limit = SHALLOW_LIMIT - text_depth; - - // ----- start insertion sort ----------- - for (i = 1; i < n; i++) { - ai = suffixArray[a + i]; - lcpi = 0; - text_depth_ai = ai + text_depth; - j = i; - j1 = j - 1; // j1 is a shorhand for j-1 - while (true) { - - // ------ compare ai with a[j-1] -------- - cmpLeft = cmp_from_limit - lcpi; - r = cmpUnrolledShallowLcp(lcpi + suffixArray[a + j1] + text_depth, lcpi - + text_depth_ai); - lcp_new = cmp_from_limit - cmpLeft; // lcp between ai and a[j1] - assert (r != 0 || lcp_new >= cmp_from_limit); - - if (r <= 0) { // we have a[j-1] <= ai - lcpAux[lcp + j1] = lcp_new; // ai will be written in a[j]; update - // lcp[j-1] - break; - } - - // --- we have a[j-1]>ai. a[j-1] and maybe other will be moved down - // --- use lcp to move down as many elements of a[] as possible - lcpi = lcp_new; - do { - suffixArray[a + j] = suffixArray[a + j1]; // move down a[j-1] - lcpAux[lcp + j] = lcpAux[lcp + j1]; // move down lcp[j-1] - j = j1; - j1--; // update j and j1=j-1 - } while (lcpi < lcpAux[lcp + j1]); // recall that lcp[-1]=-1 - - if (lcpi > lcpAux[lcp + j1]) - break; // ai will be written in position j - - // if we get here lcpi==lcp[j1]: we will compare them at next iteration - - } // end for(j=i ... - suffixArray[a + j] = ai; - lcpAux[lcp + j] = lcpi; - } // end for(i=1 ... - // ----- done with insertion sort. 
now sort groups of equal strings - for (i = 0; i < n - 1; i = j + 1) { - for (j = i; j < n; j++) - if (lcpAux[lcp + j] < cmp_from_limit) - break; - if (j - i > 0) - helpedSort(a + i, j - i + 1, SHALLOW_LIMIT); - } - } - - /** - * Function to compare two strings originating from the *b1 and *b2 The size of the unrolled loop must be at most - * equal to the costant CMP_OVERSHOOT defined in common.h When the function is called cmpLeft must contain the - * maximum number of comparisons the algorithm can do before returning 0 (equal strings) At exit cmpLeft has been - * decreased by the # of comparisons done - */ - private int cmpUnrolledShallowLcp(int b1, int b2) { - - int c1, c2; - - // execute blocks of 16 comparisons until a difference - // is found or we run out of the string - do { - // 1 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - return c1 - c2; - } - b1++; - b2++; - // 2 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpLeft -= 1; - return c1 - c2; - } - b1++; - b2++; - // 3 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpLeft -= 2; - return c1 - c2; - } - b1++; - b2++; - // 4 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpLeft -= 3; - return c1 - c2; - } - b1++; - b2++; - // 5 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpLeft -= 4; - return c1 - c2; - } - b1++; - b2++; - // 6 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpLeft -= 5; - return c1 - c2; - } - b1++; - b2++; - // 7 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpLeft -= 6; - return c1 - c2; - } - b1++; - b2++; - // 8 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpLeft -= 7; - return c1 - c2; - } - b1++; - b2++; - // 9 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpLeft -= 8; - return 
c1 - c2; - } - b1++; - b2++; - // 10 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpLeft -= 9; - return c1 - c2; - } - b1++; - b2++; - // 11 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpLeft -= 10; - return c1 - c2; - } - b1++; - b2++; - // 12 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpLeft -= 11; - return c1 - c2; - } - b1++; - b2++; - // 13 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpLeft -= 12; - return c1 - c2; - } - b1++; - b2++; - // 14 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpLeft -= 13; - return c1 - c2; - } - b1++; - b2++; - // 15 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpLeft -= 14; - return c1 - c2; - } - b1++; - b2++; - // 16 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpLeft -= 15; - return c1 - c2; - } - b1++; - b2++; - // if we have done enough comparisons the strings are considered equal - cmpLeft -= 16; - if (cmpLeft <= 0) - return 0; - // assert( b1 0) { // anchor <= a[i] < (sorted suffix) - if (curr_sb != getSmallBucket(text_pos + diff)) { - if (diff < min_forw_offset) { - min_forw_offset = diff; - best_forw_anchor = anchor; - forw_anchor_index = i; - } - } else { // the sorted suffix belongs to the same bucket of a[0]..a[n-1] - if (diff < min_forw_offset_buc) { - min_forw_offset_buc = diff; - best_forw_anchor_buc = anchor; - forw_anchor_index_buc = i; - } - } - } else { // diff<0 => anchor <= (sorted suffix) < a[i] - if (diff > max_back_offset) { - max_back_offset = diff; - best_back_anchor = anchor; - back_anchor_index = i; - } - // try to find a sorted suffix > a[i] by looking at next anchor - aoffset = anchorOffset[++anchor]; - if (aoffset < anchorDist) { - diff = anchorDist + aoffset - toffset; - assert (diff > 0); - if (curr_sb != getSmallBucket(text_pos + diff)) { - if 
(diff < min_forw_offset) { - min_forw_offset = diff; - best_forw_anchor = anchor; - forw_anchor_index = i; - } - } else { - if (diff < min_forw_offset_buc) { - min_forw_offset_buc = diff; - best_forw_anchor_buc = anchor; - forw_anchor_index_buc = i; - } - } - } - } - } - } - - // ------ if forward anchor_sort is possible, do it! -------- - if (best_forw_anchor >= 0 && min_forw_offset < depth - 1) { - anchor_pos = suffixArray[a + forw_anchor_index] + min_forw_offset; - anchor_rank = anchorRank[best_forw_anchor]; - generalAnchorSort(a, n, anchor_pos, anchor_rank, min_forw_offset); - if (anchorDist > 0) - updateAnchors(a, n); - return; - } - - boolean fail = false; - if (best_back_anchor >= 0) { - int T0, Ti;// text pointers - int j; - - // make sure that the offset is legal for all a[i] - for (i = 0; i < n; i++) { - if (suffixArray[a + i] + max_back_offset < 0) - fail = true; - // goto fail; // illegal offset, give up - } - // make sure that a[0] .. a[n-1] are preceded by the same substring - T0 = suffixArray[a]; - for (i = 1; i < n; i++) { - Ti = suffixArray[a + i]; - for (j = max_back_offset; j <= -1; j++) - if (text[this.start + T0 + j] != text[this.start + Ti + j]) - fail = true; - // goto fail; // mismatch, give up - } - if (!fail) { - // backward anchor sorting is possible - anchor_pos = suffixArray[a + back_anchor_index] + max_back_offset; - anchor_rank = anchorRank[best_back_anchor]; - generalAnchorSort(a, n, anchor_pos, anchor_rank, max_back_offset); - if (anchorDist > 0) - updateAnchors(a, n); - return; - } - } - if (fail) { - if (best_forw_anchor_buc >= 0 && min_forw_offset_buc < depth - 1) { - int equal = 0, lower = 0, upper = 0; - - anchor_pos = suffixArray[a + forw_anchor_index_buc] + min_forw_offset_buc; - anchor_rank = anchorRank[best_forw_anchor_buc]; - - // establish how many suffixes can be sorted using anchor_sort() - SplitGroupResult res = splitGroup(a, n, depth, min_forw_offset_buc, - forw_anchor_index_buc, lower); - equal = res.equal; - lower = 
res.lower; - if (equal == n) { - generalAnchorSort(a, n, anchor_pos, anchor_rank, min_forw_offset_buc); - } else { - // -- a[0] ... a[n-1] are split into 3 groups: lower, equal, upper - upper = n - equal - lower; - // printf("Warning! lo=%d eq=%d up=%d a=%x\n",lower,equal,upper,(int)a); - // sort the equal group - if (equal > 1) - generalAnchorSort(a + lower, equal, anchor_pos, anchor_rank, - min_forw_offset_buc); - - // sort upper and lower groups using deep_sort - if (lower > 1) - pseudoOrDeepSort(a, lower, depth); - if (upper > 1) - pseudoOrDeepSort(a + lower + equal, upper, depth); - } // end if(equal==n) ... else - if (anchorDist > 0) - updateAnchors(a, n); - return; - } // end hard case - - } - // --------------------------------------------------------------- - // If we get here it means that everything failed - // In this case we simply deep_sort a[0] ... a[n-1] - // --------------------------------------------------------------- - pseudoOrDeepSort(a, n, depth); - - } - - /** - * This function takes as input an array a[0] .. a[n-1] of suffixes which share the first "depth" chars. "pivot" in - * an index in 0..n-1 and offset and integer>0. The function splits a[0] .. a[n-1] into 3 groups: first the suffixes - * which are smaller than a[pivot], then those which are equal to a[pivot] and finally those which are greater than - * a[pivot]. Here, smaller, equal, larger refer to a lexicographic ordering limited to the first depth+offest chars - * (since the first depth chars are equal we only look at the chars in position depth, depth+1, ... depth+offset-1). - * The function returns the number "num" of suffixes equal to a[pivot], and stores in *first the first of these - * suffixes. So at the end the smaller suffixes are in a[0] ... a[first-1], the equal suffixes in a[first] ... - * a[first+num-1], the larger suffixes in a[first+num] ... 
a[n-1] The splitting is done using a modified mkq() - */ - private SplitGroupResult splitGroup(int a, int n, int depth, int offset, int pivot, int first) { - int r, partval; - int pa, pb, pc, pd, pa_old, pd_old;// pointers - int pivot_pos; - int text_depth, text_limit;// pointers - - // --------- initialization ------------------------------------ - pivot_pos = suffixArray[a + pivot]; // starting position in T[] of pivot - text_depth = depth; - text_limit = text_depth + offset; - - // ------------------------------------------------------------- - // In the following for() loop: - // [pa ... pd] is the current working region, - // pb moves from pa towards pd - // pc moves from pd towards pa - // ------------------------------------------------------------- - pa = a; - pd = a + n - 1; - - for (; pa != pd && (text_depth < text_limit); text_depth++) { - // ------ the pivot char is text[this.start + pivot_pos+depth] where - // depth = text_depth-text. This is text_depth[pivot_pos] - partval = text[this.start + text_depth + pivot_pos]; - // ----- partition ------------ - pb = pa_old = pa; - pc = pd_old = pd; - for (;;) { - while (pb <= pc && (r = ptr2char(pb, text_depth) - partval) <= 0) { - if (r == 0) { - swap2(pa, pb); - pa++; - } - pb++; - } - while (pb <= pc && (r = ptr2char(pc, text_depth) - partval) >= 0) { - if (r == 0) { - swap2(pc, pd); - pd--; - } - pc--; - } - if (pb > pc) - break; - swap2(pb, pc); - pb++; - pc--; - } - r = min(pa - pa_old, pb - pa); - vecswap2(pa_old, pb - r, r); - r = min(pd - pc, pd_old - pd); - vecswap2(pb, pd_old + 1 - r, r); - // ------ compute new boundaries ----- - pa = pa_old + (pb - pa); // there are pb-pa chars < partval - pd = pd_old - (pd - pc); // there are pd-pc chars > partval - - } - - first = pa - a; // index in a[] of the first suf. equal to pivot - // return pd-pa+1; // return number of suffixes equal to pivot - return new SplitGroupResult(pd - pa + 1, first); - - } - - /** - * given a SORTED array of suffixes a[0] .. 
a[n-1] updates anchorRank[] and anchorOffset[] - */ - private void updateAnchors(int a, int n) { - int i, anchor, toffset, aoffset, text_pos; - - for (i = 0; i < n; i++) { - text_pos = suffixArray[a + i]; - // get anchor preceeding text_pos=a[i] - anchor = text_pos / anchorDist; - toffset = text_pos % anchorDist; // distance of a[i] from anchor - aoffset = anchorOffset[anchor]; // dist of sorted suf from anchor - if (toffset < aoffset) { - anchorOffset[anchor] = toffset; - anchorRank[anchor] = a + i; - } - } - - } - - /** - * This routines sorts a[0] ... a[n-1] using the fact that in their common prefix, after offset characters, there is - * a suffix whose rank is known. In this routine we call this suffix anchor (and we denote its position and rank - * with anchor_pos and anchor_rank respectively) but it is not necessarily an anchor (=does not necessarily starts - * at position multiple of anchorDist) since this function is called by pseudo_anchor_sort(). The routine works by - * scanning the suffixes before and after the anchor in order to find (and mark) those which are suffixes of a[0] - * ... a[n-1]. After that, the ordering of a[0] ... 
a[n-1] is derived with a sigle scan of the marked - * suffixes.******************************************************************* - */ - private void generalAnchorSort(int a, int n, int anchor_pos, int anchor_rank, int offset) { - int sb, lo, hi; - int curr_lo, curr_hi, to_be_found, i, j; - int item; - int ris; - // void *ris; + for (j = 0; j <= 256; j++) { + if (j != ss) { + sb = (ss << 8) + j; + if ((ftab[sb] & SETMASK) == 0) { + int lo = ftab[sb] & CLEARMASK; + int hi = (ftab[sb + 1] & CLEARMASK) - 1; + if (hi > lo) { + shallowSort(lo, hi - lo + 1); + numQSorted += (hi - lo + 1); + } + } + ftab[sb] |= SETMASK; + } + } + { + for (j = 0; j <= 256; j++) { + copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK; + copyEnd[j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1; + } + // take care of the virtual -1 char in position textSize+1 + if (ss == 0) { + k = textSize - 1; + c1 = text[this.start + k]; + if (!bigDone[c1]) + suffixArray[copyStart[c1]++] = k; + } + for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) { + k = suffixArray[j] - 1; + if (k < 0) + continue; + c1 = text[this.start + k]; + if (!bigDone[c1]) + suffixArray[copyStart[c1]++] = k; + } + for (j = (ftab[(ss + 1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) { + k = suffixArray[j] - 1; + if (k < 0) + continue; + c1 = text[this.start + k]; + if (!bigDone[c1]) + suffixArray[copyEnd[c1]--] = k; + } + } + for (j = 0; j <= 256; j++) + ftab[(j << 8) + ss] |= SETMASK; + bigDone[ss] = true; + }// endfor + + return suffixArray; + } + + /** + * This is the multikey quicksort from bentley-sedgewick modified so that it stops recursion when depth reaches + * {@link #SHALLOW_LIMIT} (that is when two or more suffixes have {@link #SHALLOW_LIMIT} chars in common). + */ + private void shallowSort(int a, int n) { + // call multikey quicksort + // skip 2 chars since suffixes come from the same bucket + shallowMkq32(a, n, 2); + + } + + /** + * recursive multikey quicksort from Bentley-Sedgewick. + *

    + * Stops when text_depth reaches {@link #SHALLOW_LIMIT} that is when we have found that the current set of strings + * have {@link #SHALLOW_LIMIT} chars in common + */ + private void shallowMkq32(int a, int n, int text_depth) { + + int partval, val; + int pa = 0, pb = 0, pc = 0, pd = 0, pl = 0, pm = 0, pn = 0;// pointers + int d, r; + int next_depth;// text pointer + boolean repeatFlag = true; + + // ---- On small arrays use insertions sort + if (n < MK_QS_TRESH) { + shallowInssortLcp(a, n, text_depth); + return; + } + + // ----------- choose pivot -------------- + while (repeatFlag) { + + repeatFlag = false; + pl = a; + pm = a + (n / 2); + pn = a + (n - 1); + if (n > 30) { // On big arrays, pseudomedian of 9 + d = (n / 8); + pl = med3(pl, pl + d, pl + 2 * d, text_depth); + pm = med3(pm - d, pm, pm + d, text_depth); + pn = med3(pn - 2 * d, pn - d, pn, text_depth); + } + pm = med3(pl, pm, pn, text_depth); + swap2(a, pm); + partval = ptr2char32(a, text_depth); + pa = pb = a + 1; + pc = pd = a + n - 1; + // -------- partition ----------------- + for (; ; ) { + while (pb <= pc && (val = ptr2char32(pb, text_depth)) <= partval) { + if (val == partval) { + swap2(pa, pb); + pa++; + } + pb++; + } + while (pb <= pc && (val = ptr2char32(pc, text_depth)) >= partval) { + if (val == partval) { + swap2(pc, pd); + pd--; + } + pc--; + } + if (pb > pc) + break; + swap2(pb, pc); + pb++; + pc--; + } + if (pa > pd) { + // all values were equal to partval: make it simpler + if ((next_depth = text_depth + 4) >= SHALLOW_LIMIT) { + helpedSort(a, n, next_depth); + return; + } else { + text_depth = next_depth; + repeatFlag = true; + } + } + + } + // partition a[] into the values smaller, equal, and larger that partval + pn = a + n; + r = min(pa - a, pb - pa); + vecswap2(a, pb - r, r); + r = min(pd - pc, pn - pd - 1); + vecswap2(pb, pn - r, r); + // --- sort smaller strings ------- + if ((r = pb - pa) > 1) + shallowMkq32(a, r, text_depth); + // --- sort strings starting with partval 
----- + if ((next_depth = text_depth + 4) < SHALLOW_LIMIT) + shallowMkq32(a + r, pa - pd + n - 1, next_depth); + else + helpedSort(a + r, pa - pd + n - 1, next_depth); + if ((r = pd - pc) > 1) + shallowMkq32(a + n - r, r, text_depth); + + } + + private void vecswap2(int a, int b, int n) { + while (n-- > 0) { + int t = suffixArray[a]; + suffixArray[a++] = suffixArray[b]; + suffixArray[b++] = t; + } + } + + private static int min(int i, int j) { + return i < j ? i : j; + } + + /** + * this is the insertion sort routine called by multikey-quicksort for sorting small groups. During insertion sort + * the comparisons are done calling cmp_unrolled_shallow_lcp() and two strings are equal if the coincides for + * SHALLOW_LIMIT characters. After this first phase we sort groups of "equal_string" using helped_sort(). + *

    + */ + private void shallowInssortLcp(int a, int n, int text_depth) { + int i, j, j1, lcp_new, r, ai, lcpi; + int cmp_from_limit; + int text_depth_ai;// pointer + // --------- initialize ---------------- + + lcpAux[0] = -1; // set lcp[-1] = -1 + for (i = 0; i < n; i++) { + lcpAux[lcp + i] = 0; + } + cmp_from_limit = SHALLOW_LIMIT - text_depth; + + // ----- start insertion sort ----------- + for (i = 1; i < n; i++) { + ai = suffixArray[a + i]; + lcpi = 0; + text_depth_ai = ai + text_depth; + j = i; + j1 = j - 1; // j1 is a shorhand for j-1 + while (true) { + + // ------ compare ai with a[j-1] -------- + cmpLeft = cmp_from_limit - lcpi; + r = cmpUnrolledShallowLcp(lcpi + suffixArray[a + j1] + text_depth, lcpi + + text_depth_ai); + lcp_new = cmp_from_limit - cmpLeft; // lcp between ai and a[j1] + assert (r != 0 || lcp_new >= cmp_from_limit); + + if (r <= 0) { // we have a[j-1] <= ai + lcpAux[lcp + j1] = lcp_new; // ai will be written in a[j]; update + // lcp[j-1] + break; + } + + // --- we have a[j-1]>ai. a[j-1] and maybe other will be moved down + // --- use lcp to move down as many elements of a[] as possible + lcpi = lcp_new; + do { + suffixArray[a + j] = suffixArray[a + j1]; // move down a[j-1] + lcpAux[lcp + j] = lcpAux[lcp + j1]; // move down lcp[j-1] + j = j1; + j1--; // update j and j1=j-1 + } while (lcpi < lcpAux[lcp + j1]); // recall that lcp[-1]=-1 + + if (lcpi > lcpAux[lcp + j1]) + break; // ai will be written in position j + + // if we get here lcpi==lcp[j1]: we will compare them at next iteration + + } // end for(j=i ... + suffixArray[a + j] = ai; + lcpAux[lcp + j] = lcpi; + } // end for(i=1 ... + // ----- done with insertion sort. 
now sort groups of equal strings + for (i = 0; i < n - 1; i = j + 1) { + for (j = i; j < n; j++) + if (lcpAux[lcp + j] < cmp_from_limit) + break; + if (j - i > 0) + helpedSort(a + i, j - i + 1, SHALLOW_LIMIT); + } + } + + /** + * Function to compare two strings originating from the *b1 and *b2 The size of the unrolled loop must be at most + * equal to the costant CMP_OVERSHOOT defined in common.h When the function is called cmpLeft must contain the + * maximum number of comparisons the algorithm can do before returning 0 (equal strings) At exit cmpLeft has been + * decreased by the # of comparisons done + */ + private int cmpUnrolledShallowLcp(int b1, int b2) { + + int c1, c2; + + // execute blocks of 16 comparisons until a difference + // is found or we run out of the string + do { + // 1 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + return c1 - c2; + } + b1++; + b2++; + // 2 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpLeft -= 1; + return c1 - c2; + } + b1++; + b2++; + // 3 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpLeft -= 2; + return c1 - c2; + } + b1++; + b2++; + // 4 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpLeft -= 3; + return c1 - c2; + } + b1++; + b2++; + // 5 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpLeft -= 4; + return c1 - c2; + } + b1++; + b2++; + // 6 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpLeft -= 5; + return c1 - c2; + } + b1++; + b2++; + // 7 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpLeft -= 6; + return c1 - c2; + } + b1++; + b2++; + // 8 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpLeft -= 7; + return c1 - c2; + } + b1++; + b2++; + // 9 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpLeft -= 8; + return 
c1 - c2; + } + b1++; + b2++; + // 10 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpLeft -= 9; + return c1 - c2; + } + b1++; + b2++; + // 11 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpLeft -= 10; + return c1 - c2; + } + b1++; + b2++; + // 12 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpLeft -= 11; + return c1 - c2; + } + b1++; + b2++; + // 13 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpLeft -= 12; + return c1 - c2; + } + b1++; + b2++; + // 14 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpLeft -= 13; + return c1 - c2; + } + b1++; + b2++; + // 15 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpLeft -= 14; + return c1 - c2; + } + b1++; + b2++; + // 16 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpLeft -= 15; + return c1 - c2; + } + b1++; + b2++; + // if we have done enough comparisons the strings are considered equal + cmpLeft -= 16; + if (cmpLeft <= 0) + return 0; + // assert( b1 0) { // anchor <= a[i] < (sorted suffix) + if (curr_sb != getSmallBucket(text_pos + diff)) { + if (diff < min_forw_offset) { + min_forw_offset = diff; + best_forw_anchor = anchor; + forw_anchor_index = i; + } + } else { // the sorted suffix belongs to the same bucket of a[0]..a[n-1] + if (diff < min_forw_offset_buc) { + min_forw_offset_buc = diff; + best_forw_anchor_buc = anchor; + forw_anchor_index_buc = i; + } + } + } else { // diff<0 => anchor <= (sorted suffix) < a[i] + if (diff > max_back_offset) { + max_back_offset = diff; + best_back_anchor = anchor; + back_anchor_index = i; + } + // try to find a sorted suffix > a[i] by looking at next anchor + aoffset = anchorOffset[++anchor]; + if (aoffset < anchorDist) { + diff = anchorDist + aoffset - toffset; + assert (diff > 0); + if (curr_sb != getSmallBucket(text_pos + diff)) { + if 
(diff < min_forw_offset) { + min_forw_offset = diff; + best_forw_anchor = anchor; + forw_anchor_index = i; + } + } else { + if (diff < min_forw_offset_buc) { + min_forw_offset_buc = diff; + best_forw_anchor_buc = anchor; + forw_anchor_index_buc = i; + } + } + } + } + } + } + + // ------ if forward anchor_sort is possible, do it! -------- + if (best_forw_anchor >= 0 && min_forw_offset < depth - 1) { + anchor_pos = suffixArray[a + forw_anchor_index] + min_forw_offset; + anchor_rank = anchorRank[best_forw_anchor]; + generalAnchorSort(a, n, anchor_pos, anchor_rank, min_forw_offset); + if (anchorDist > 0) + updateAnchors(a, n); + return; + } + + boolean fail = false; + if (best_back_anchor >= 0) { + int T0, Ti;// text pointers + int j; + + // make sure that the offset is legal for all a[i] + for (i = 0; i < n; i++) { + if (suffixArray[a + i] + max_back_offset < 0) + fail = true; + // goto fail; // illegal offset, give up + } + // make sure that a[0] .. a[n-1] are preceded by the same substring + T0 = suffixArray[a]; + for (i = 1; i < n; i++) { + Ti = suffixArray[a + i]; + for (j = max_back_offset; j <= -1; j++) + if (text[this.start + T0 + j] != text[this.start + Ti + j]) + fail = true; + // goto fail; // mismatch, give up + } + if (!fail) { + // backward anchor sorting is possible + anchor_pos = suffixArray[a + back_anchor_index] + max_back_offset; + anchor_rank = anchorRank[best_back_anchor]; + generalAnchorSort(a, n, anchor_pos, anchor_rank, max_back_offset); + if (anchorDist > 0) + updateAnchors(a, n); + return; + } + } + if (fail) { + if (best_forw_anchor_buc >= 0 && min_forw_offset_buc < depth - 1) { + int equal = 0, lower = 0, upper = 0; + + anchor_pos = suffixArray[a + forw_anchor_index_buc] + min_forw_offset_buc; + anchor_rank = anchorRank[best_forw_anchor_buc]; + + // establish how many suffixes can be sorted using anchor_sort() + SplitGroupResult res = splitGroup(a, n, depth, min_forw_offset_buc, + forw_anchor_index_buc, lower); + equal = res.equal; + lower = 
res.lower; + if (equal == n) { + generalAnchorSort(a, n, anchor_pos, anchor_rank, min_forw_offset_buc); + } else { + // -- a[0] ... a[n-1] are split into 3 groups: lower, equal, upper + upper = n - equal - lower; + // printf("Warning! lo=%d eq=%d up=%d a=%x\n",lower,equal,upper,(int)a); + // sort the equal group + if (equal > 1) + generalAnchorSort(a + lower, equal, anchor_pos, anchor_rank, + min_forw_offset_buc); + + // sort upper and lower groups using deep_sort + if (lower > 1) + pseudoOrDeepSort(a, lower, depth); + if (upper > 1) + pseudoOrDeepSort(a + lower + equal, upper, depth); + } // end if(equal==n) ... else + if (anchorDist > 0) + updateAnchors(a, n); + return; + } // end hard case + + } + // --------------------------------------------------------------- + // If we get here it means that everything failed + // In this case we simply deep_sort a[0] ... a[n-1] + // --------------------------------------------------------------- + pseudoOrDeepSort(a, n, depth); + + } + + /** + * This function takes as input an array a[0] .. a[n-1] of suffixes which share the first "depth" chars. "pivot" in + * an index in 0..n-1 and offset and integer>0. The function splits a[0] .. a[n-1] into 3 groups: first the suffixes + * which are smaller than a[pivot], then those which are equal to a[pivot] and finally those which are greater than + * a[pivot]. Here, smaller, equal, larger refer to a lexicographic ordering limited to the first depth+offest chars + * (since the first depth chars are equal we only look at the chars in position depth, depth+1, ... depth+offset-1). + * The function returns the number "num" of suffixes equal to a[pivot], and stores in *first the first of these + * suffixes. So at the end the smaller suffixes are in a[0] ... a[first-1], the equal suffixes in a[first] ... + * a[first+num-1], the larger suffixes in a[first+num] ... 
a[n-1] The splitting is done using a modified mkq() + */ + private SplitGroupResult splitGroup(int a, int n, int depth, int offset, int pivot, int first) { + int r, partval; + int pa, pb, pc, pd, pa_old, pd_old;// pointers + int pivot_pos; + int text_depth, text_limit;// pointers + + // --------- initialization ------------------------------------ + pivot_pos = suffixArray[a + pivot]; // starting position in T[] of pivot + text_depth = depth; + text_limit = text_depth + offset; + + // ------------------------------------------------------------- + // In the following for() loop: + // [pa ... pd] is the current working region, + // pb moves from pa towards pd + // pc moves from pd towards pa + // ------------------------------------------------------------- + pa = a; + pd = a + n - 1; + + for (; pa != pd && (text_depth < text_limit); text_depth++) { + // ------ the pivot char is text[this.start + pivot_pos+depth] where + // depth = text_depth-text. This is text_depth[pivot_pos] + partval = text[this.start + text_depth + pivot_pos]; + // ----- partition ------------ + pb = pa_old = pa; + pc = pd_old = pd; + for (; ; ) { + while (pb <= pc && (r = ptr2char(pb, text_depth) - partval) <= 0) { + if (r == 0) { + swap2(pa, pb); + pa++; + } + pb++; + } + while (pb <= pc && (r = ptr2char(pc, text_depth) - partval) >= 0) { + if (r == 0) { + swap2(pc, pd); + pd--; + } + pc--; + } + if (pb > pc) + break; + swap2(pb, pc); + pb++; + pc--; + } + r = min(pa - pa_old, pb - pa); + vecswap2(pa_old, pb - r, r); + r = min(pd - pc, pd_old - pd); + vecswap2(pb, pd_old + 1 - r, r); + // ------ compute new boundaries ----- + pa = pa_old + (pb - pa); // there are pb-pa chars < partval + pd = pd_old - (pd - pc); // there are pd-pc chars > partval + + } + + first = pa - a; // index in a[] of the first suf. equal to pivot + // return pd-pa+1; // return number of suffixes equal to pivot + return new SplitGroupResult(pd - pa + 1, first); + + } + + /** + * given a SORTED array of suffixes a[0] .. 
a[n-1] updates anchorRank[] and anchorOffset[] + */ + private void updateAnchors(int a, int n) { + int i, anchor, toffset, aoffset, text_pos; + + for (i = 0; i < n; i++) { + text_pos = suffixArray[a + i]; + // get anchor preceeding text_pos=a[i] + anchor = text_pos / anchorDist; + toffset = text_pos % anchorDist; // distance of a[i] from anchor + aoffset = anchorOffset[anchor]; // dist of sorted suf from anchor + if (toffset < aoffset) { + anchorOffset[anchor] = toffset; + anchorRank[anchor] = a + i; + } + } + + } + + /** + * This routines sorts a[0] ... a[n-1] using the fact that in their common prefix, after offset characters, there is + * a suffix whose rank is known. In this routine we call this suffix anchor (and we denote its position and rank + * with anchor_pos and anchor_rank respectively) but it is not necessarily an anchor (=does not necessarily starts + * at position multiple of anchorDist) since this function is called by pseudo_anchor_sort(). The routine works by + * scanning the suffixes before and after the anchor in order to find (and mark) those which are suffixes of a[0] + * ... a[n-1]. After that, the ordering of a[0] ... a[n-1] is derived with a sigle scan of the marked + * suffixes.******************************************************************* + */ + private void generalAnchorSort(int a, int n, int anchor_pos, int anchor_rank, int offset) { + int sb, lo, hi; + int curr_lo, curr_hi, to_be_found, i, j; + int item; + int ris; + // void *ris; /* ---------- get bucket of anchor ---------- */ - sb = getSmallBucket(anchor_pos); - lo = bucketFirst(sb); - hi = bucketLast(sb); - // ------ sort pointers a[0] ... a[n-1] as plain integers - // qsort(a, n, sizeof(Int32), integer_cmp); - Arrays.sort(suffixArray, a, a + n); - - // ------------------------------------------------------------------ - // now we scan the bucket containing the anchor in search of suffixes - // corresponding to the ones we have to sort. 
When we find one of - // such suffixes we mark it. We go on untill n sfx's have been marked - // ------------------------------------------------------------------ - curr_hi = curr_lo = anchor_rank; - - mark(curr_lo); - // scan suffixes preceeding and following the anchor - for (to_be_found = n - 1; to_be_found > 0;) { - // invariant: the next positions to check are curr_lo-1 and curr_hi+1 - assert (curr_lo > lo || curr_hi < hi); - while (curr_lo > lo) { - item = suffixArray[--curr_lo] - offset; - ris = Arrays.binarySearch(suffixArray, a, a + n, item); - // ris = bsearch(&item,a,n,sizeof(Int32), integer_cmp); - if (ris != 0) { - mark(curr_lo); - to_be_found--; - } else - break; - } - while (curr_hi < hi) { - item = suffixArray[++curr_hi] - offset; - ris = Arrays.binarySearch(suffixArray, a, a + n, item); - if (ris != 0) { - mark(curr_hi); - to_be_found--; - } else - break; - } - } - // sort a[] using the marked suffixes - for (j = 0, i = curr_lo; i <= curr_hi; i++) - if (isMarked(i)) { - unmark(i); - suffixArray[a + j++] = suffixArray[i] - offset; - } - - } - - /** - * + sb = getSmallBucket(anchor_pos); + lo = bucketFirst(sb); + hi = bucketLast(sb); + // ------ sort pointers a[0] ... a[n-1] as plain integers + // qsort(a, n, sizeof(Int32), integer_cmp); + Arrays.sort(suffixArray, a, a + n); + + // ------------------------------------------------------------------ + // now we scan the bucket containing the anchor in search of suffixes + // corresponding to the ones we have to sort. When we find one of + // such suffixes we mark it. 
We go on untill n sfx's have been marked + // ------------------------------------------------------------------ + curr_hi = curr_lo = anchor_rank; + + mark(curr_lo); + // scan suffixes preceeding and following the anchor + for (to_be_found = n - 1; to_be_found > 0; ) { + // invariant: the next positions to check are curr_lo-1 and curr_hi+1 + assert (curr_lo > lo || curr_hi < hi); + while (curr_lo > lo) { + item = suffixArray[--curr_lo] - offset; + ris = Arrays.binarySearch(suffixArray, a, a + n, item); + // ris = bsearch(&item,a,n,sizeof(Int32), integer_cmp); + if (ris != 0) { + mark(curr_lo); + to_be_found--; + } else + break; + } + while (curr_hi < hi) { + item = suffixArray[++curr_hi] - offset; + ris = Arrays.binarySearch(suffixArray, a, a + n, item); + if (ris != 0) { + mark(curr_hi); + to_be_found--; + } else + break; + } + } + // sort a[] using the marked suffixes + for (j = 0, i = curr_lo; i <= curr_hi; i++) + if (isMarked(i)) { + unmark(i); + suffixArray[a + j++] = suffixArray[i] - offset; + } + + } + + /** + * */ - private void unmark(int i) { - suffixArray[i] &= ~MARKER; + private void unmark(int i) { + suffixArray[i] &= ~MARKER; - } + } - /** - * + /** + * */ - private boolean isMarked(int i) { - return (suffixArray[i] & MARKER) != 0; - } + private boolean isMarked(int i) { + return (suffixArray[i] & MARKER) != 0; + } - /** - * + /** + * */ - private void mark(int i) { - suffixArray[i] |= MARKER; + private void mark(int i) { + suffixArray[i] |= MARKER; - } + } - /** - * + /** + * */ - private int bucketLast(int sb) { - return (ftab[sb + 1] & CLEARMASK) - 1; - } + private int bucketLast(int sb) { + return (ftab[sb + 1] & CLEARMASK) - 1; + } - /** - * + /** + * */ - private int bucketFirst(int sb) { - return ftab[sb] & CLEARMASK; - } + private int bucketFirst(int sb) { + return ftab[sb] & CLEARMASK; + } - /** - * + /** + * */ - private int bucketSize(int sb) { - return (ftab[sb + 1] & CLEARMASK) - (ftab[sb] & CLEARMASK); - } + private int bucketSize(int 
sb) { + return (ftab[sb + 1] & CLEARMASK) - (ftab[sb] & CLEARMASK); + } - /** - * + /** + * */ - private int getSmallBucket(int pos) { - return (text[this.start + pos] << 8) + text[this.start + pos + 1]; - } + private int getSmallBucket(int pos) { + return (text[this.start + pos] << 8) + text[this.start + pos + 1]; + } - /** - * + /** + * */ - @SuppressWarnings("unused") + @SuppressWarnings("unused") private void pseudoOrDeepSort(int a, int n, int depth) { - int offset, text_pos, sb, pseudo_anchor_pos, max_offset, size; - - // ------- search for a useful pseudo-anchor ------------- - if (MAX_PSEUDO_ANCHOR_OFFSET > 0) { - max_offset = min(depth - 1, MAX_PSEUDO_ANCHOR_OFFSET); - text_pos = suffixArray[a]; - for (offset = 1; offset < max_offset; offset++) { - pseudo_anchor_pos = text_pos + offset; - sb = getSmallBucket(pseudo_anchor_pos); - // check if pseudo_anchor is in a sorted bucket - if (isSortedBucket(sb)) { - size = bucketSize(sb); // size of group - if (size > B2G_RATIO * n) - continue; // discard large groups - // sort a[0] ... a[n-1] using pseudo_anchor - pseudoAnchorSort(a, n, pseudo_anchor_pos, offset); - return; - } - } - } - deepSort(a, n, depth); - } - - /** - * + int offset, text_pos, sb, pseudo_anchor_pos, max_offset, size; + + // ------- search for a useful pseudo-anchor ------------- + if (MAX_PSEUDO_ANCHOR_OFFSET > 0) { + max_offset = min(depth - 1, MAX_PSEUDO_ANCHOR_OFFSET); + text_pos = suffixArray[a]; + for (offset = 1; offset < max_offset; offset++) { + pseudo_anchor_pos = text_pos + offset; + sb = getSmallBucket(pseudo_anchor_pos); + // check if pseudo_anchor is in a sorted bucket + if (isSortedBucket(sb)) { + size = bucketSize(sb); // size of group + if (size > B2G_RATIO * n) + continue; // discard large groups + // sort a[0] ... 
a[n-1] using pseudo_anchor + pseudoAnchorSort(a, n, pseudo_anchor_pos, offset); + return; + } + } + } + deepSort(a, n, depth); + } + + /** + * + */ + private boolean isSortedBucket(int sb) { + return (ftab[sb] & SETMASK) != 0; + } + + /** + * routine for deep-sorting the suffixes a[0] ... a[n-1] knowing that they have a common prefix of length "depth" + */ + private void deepSort(int a, int n, int depth) { + int blind_limit; + + blind_limit = textSize / BLIND_SORT_RATIO; + if (n <= blind_limit) + blindSsort(a, n, depth); // small_group + else + qsUnrolledLcp(a, n, depth, blind_limit); + + } + + /** + * ternary quicksort (seward-like) with lcp information */ - private boolean isSortedBucket(int sb) { - return (ftab[sb] & SETMASK) != 0; - } - - /** - * routine for deep-sorting the suffixes a[0] ... a[n-1] knowing that they have a common prefix of length "depth" - */ - private void deepSort(int a, int n, int depth) { - int blind_limit; - - blind_limit = textSize / BLIND_SORT_RATIO; - if (n <= blind_limit) - blindSsort(a, n, depth); // small_group - else - qsUnrolledLcp(a, n, depth, blind_limit); - - } - - /** - * ternary quicksort (seward-like) with lcp information - */ - private void qsUnrolledLcp(int a, int n, int depth, int blind_limit) { - int text_depth, text_pos_pivot;// pointers - int[] stack_lo = new int[STACK_SIZE]; - int[] stack_hi = new int[STACK_SIZE]; - int[] stack_d = new int[STACK_SIZE]; - int sp, r, r3, med; - int i, j, lo, hi, ris, lcp_lo, lcp_hi; - // ----- init quicksort -------------- - r = sp = 0; - // Pushd(0,n-1,depth); - stack_lo[sp] = 0; - stack_hi[sp] = n - 1; - stack_d[sp] = depth; - sp++; - // end Pushd - - // ----- repeat untill stack is empty ------ - while (sp > 0) { - assert (sp < STACK_SIZE); - // Popd(lo,hi,depth); - sp--; - lo = stack_lo[sp]; - hi = stack_hi[sp]; - depth = stack_d[sp]; - // end popd - text_depth = depth; - - // --- use shellsort for small groups - if (hi - lo < blind_limit) { - blindSsort(a + lo, hi - lo + 1, depth); 
- continue; - } + private void qsUnrolledLcp(int a, int n, int depth, int blind_limit) { + int text_depth, text_pos_pivot;// pointers + int[] stack_lo = new int[STACK_SIZE]; + int[] stack_hi = new int[STACK_SIZE]; + int[] stack_d = new int[STACK_SIZE]; + int sp, r, r3, med; + int i, j, lo, hi, ris, lcp_lo, lcp_hi; + // ----- init quicksort -------------- + r = sp = 0; + // Pushd(0,n-1,depth); + stack_lo[sp] = 0; + stack_hi[sp] = n - 1; + stack_d[sp] = depth; + sp++; + // end Pushd + + // ----- repeat untill stack is empty ------ + while (sp > 0) { + assert (sp < STACK_SIZE); + // Popd(lo,hi,depth); + sp--; + lo = stack_lo[sp]; + hi = stack_hi[sp]; + depth = stack_d[sp]; + // end popd + text_depth = depth; + + // --- use shellsort for small groups + if (hi - lo < blind_limit) { + blindSsort(a + lo, hi - lo + 1, depth); + continue; + } /* * Random partitioning. Guidance for the magic constants 7621 and 32768 is * taken from Sedgewick's algorithms book, chapter 35. */ - r = ((r * 7621) + 1) % 32768; - r3 = r % 3; - if (r3 == 0) - med = lo; - else if (r3 == 1) - med = (lo + hi) >> 1; - else - med = hi; - - // --- partition ---- - swap(med, hi, a); // put the pivot at the right-end - text_pos_pivot = text_depth + suffixArray[a + hi]; - i = lo - 1; - j = hi; - lcp_lo = lcp_hi = Integer.MAX_VALUE; - while (true) { - while (++i < hi) { - ris = cmpUnrolledLcp(text_depth + suffixArray[a + i], text_pos_pivot); - if (ris > 0) { - if (cmpDone < lcp_hi) - lcp_hi = cmpDone; - break; - } else if (cmpDone < lcp_lo) - lcp_lo = cmpDone; - } - while (--j > lo) { - ris = cmpUnrolledLcp(text_depth + suffixArray[a + j], text_pos_pivot); - if (ris < 0) { - if (cmpDone < lcp_lo) - lcp_lo = cmpDone; - break; - } else if (cmpDone < lcp_hi) - lcp_hi = cmpDone; - } - if (i >= j) - break; - swap(i, j, a); - } - swap(i, hi, a); // put pivot at the middle - - // --------- insert subproblems in stack; smallest last - if (i - lo < hi - i) { - // Pushd(i + 1, hi, depth + lcp_hi); - stack_lo[sp] = i 
+ 1; - stack_hi[sp] = hi; - stack_d[sp] = depth + lcp_hi; - sp++; - // end pushd - if (i - lo > 1) { - // Pushd(lo, i - 1, depth + lcp_lo); - stack_lo[sp] = lo; - stack_hi[sp] = i - 1; - stack_d[sp] = depth + lcp_lo; - sp++; - // end push - } - - } else { - // Pushd(lo, i - 1, depth + lcp_lo); - stack_lo[sp] = lo; - stack_hi[sp] = i - 1; - stack_d[sp] = depth + lcp_lo; - sp++; - // end pushd - if (hi - i > 1) { - // Pushd(i + 1, hi, depth + lcp_hi); - stack_lo[sp] = i + 1; - stack_hi[sp] = hi; - stack_d[sp] = depth + lcp_hi; - sp++; - // end pushd - } - } - } - - } - - /** - * Function to compare two strings originating from the *b1 and *b2 The size of the unrolled loop must be at most - * equal to the costant CMP_OVERSHOOT defined in common.h the function return the result of the comparison (+ or -) - * and writes in cmpDone the number of successfull comparisons done - */ - private int cmpUnrolledLcp(int b1, int b2) { - - int c1, c2; - cmpDone = 0; - - // execute blocks of 16 comparisons untill a difference - // is found or we run out of the string - do { - // 1 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - return (c1 - c2); - } - b1++; - b2++; - // 2 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpDone += 1; - return (c1 - c2); - } - b1++; - b2++; - // 3 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpDone += 2; - return (c1 - c2); - } - b1++; - b2++; - // 4 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpDone += 3; - return (c1 - c2); - } - b1++; - b2++; - // 5 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpDone += 4; - return (c1 - c2); - } - b1++; - b2++; - // 6 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpDone += 5; - return (c1 - c2); - } - b1++; - b2++; - // 7 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpDone 
+= 6; - return (c1 - c2); - } - b1++; - b2++; - // 8 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpDone += 7; - return (c1 - c2); - } - b1++; - b2++; - // 9 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpDone += 8; - return (c1 - c2); - } - b1++; - b2++; - // 10 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpDone += 9; - return (c1 - c2); - } - b1++; - b2++; - // 11 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpDone += 10; - return (c1 - c2); - } - b1++; - b2++; - // 12 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpDone += 11; - return (c1 - c2); - } - b1++; - b2++; - // 13 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpDone += 12; - return (c1 - c2); - } - b1++; - b2++; - // 14 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpDone += 13; - return (c1 - c2); - } - b1++; - b2++; - // 15 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpDone += 14; - return (c1 - c2); - } - b1++; - b2++; - // 16 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmpDone += 15; - return (c1 - c2); - } - b1++; - b2++; - - cmpDone += 16; - - } while (b1 < textSize && b2 < textSize); - - return b2 - b1; - - } - - /** - * + r = ((r * 7621) + 1) % 32768; + r3 = r % 3; + if (r3 == 0) + med = lo; + else if (r3 == 1) + med = (lo + hi) >> 1; + else + med = hi; + + // --- partition ---- + swap(med, hi, a); // put the pivot at the right-end + text_pos_pivot = text_depth + suffixArray[a + hi]; + i = lo - 1; + j = hi; + lcp_lo = lcp_hi = Integer.MAX_VALUE; + while (true) { + while (++i < hi) { + ris = cmpUnrolledLcp(text_depth + suffixArray[a + i], text_pos_pivot); + if (ris > 0) { + if (cmpDone < lcp_hi) + lcp_hi = cmpDone; + break; + } else if (cmpDone < lcp_lo) + lcp_lo = cmpDone; + } 
+ while (--j > lo) { + ris = cmpUnrolledLcp(text_depth + suffixArray[a + j], text_pos_pivot); + if (ris < 0) { + if (cmpDone < lcp_lo) + lcp_lo = cmpDone; + break; + } else if (cmpDone < lcp_hi) + lcp_hi = cmpDone; + } + if (i >= j) + break; + swap(i, j, a); + } + swap(i, hi, a); // put pivot at the middle + + // --------- insert subproblems in stack; smallest last + if (i - lo < hi - i) { + // Pushd(i + 1, hi, depth + lcp_hi); + stack_lo[sp] = i + 1; + stack_hi[sp] = hi; + stack_d[sp] = depth + lcp_hi; + sp++; + // end pushd + if (i - lo > 1) { + // Pushd(lo, i - 1, depth + lcp_lo); + stack_lo[sp] = lo; + stack_hi[sp] = i - 1; + stack_d[sp] = depth + lcp_lo; + sp++; + // end push + } + + } else { + // Pushd(lo, i - 1, depth + lcp_lo); + stack_lo[sp] = lo; + stack_hi[sp] = i - 1; + stack_d[sp] = depth + lcp_lo; + sp++; + // end pushd + if (hi - i > 1) { + // Pushd(i + 1, hi, depth + lcp_hi); + stack_lo[sp] = i + 1; + stack_hi[sp] = hi; + stack_d[sp] = depth + lcp_hi; + sp++; + // end pushd + } + } + } + + } + + /** + * Function to compare two strings originating from the *b1 and *b2 The size of the unrolled loop must be at most + * equal to the costant CMP_OVERSHOOT defined in common.h the function return the result of the comparison (+ or -) + * and writes in cmpDone the number of successfull comparisons done */ - private void swap(int i, int j, int a) { - int tmp = suffixArray[a + i]; - suffixArray[a + i] = suffixArray[a + j]; - suffixArray[a + j] = tmp; - } - - /** - * routine for deep-sorting the suffixes a[0] ... 
a[n-1] knowing that they have a common prefix of length "depth" - */ - private void blindSsort(int a, int n, int depth) { - int i, j, aj, lcp; - Node nh, root, h; - - // ---- sort suffixes in order of increasing length - // qsort(a, n, sizeof(Int32), neg_integer_cmp); - Arrays.sort(suffixArray, a, a + n); - for (int left = 0, right = n - 1; left < right; left++, right--) { - // exchange the first and last - int temp = suffixArray[a + left]; - suffixArray[a + left] = suffixArray[a + right]; - suffixArray[a + right] = temp; - } - - // --- skip suffixes which have already reached the end-of-text - for (j = 0; j < n; j++) - if (suffixArray[a + j] + depth < textSize) - break; - if (j >= n - 1) - return; // everything is already sorted! - - // ------ init stack ------- - // stack = (node **) malloc(n*sizeof(node *)); - - // ------- init root with the first unsorted suffix - nh = new Node(); - nh.skip = -1; - nh.right = null; - // nh.down = (void *) a[j]; - nh.downInt = suffixArray[a + j]; - root = nh; - - // ------- insert suffixes a[j+1] ... 
a[n-1] - for (i = j + 1; i < n; i++) { - h = findCompanion(root, suffixArray[a + i]); - aj = h.downInt; - lcp = compareSuffixes(aj, suffixArray[a + i], depth); - insertSuffix(root, suffixArray[a + i], lcp, text[this.start + aj + lcp]); - } - - // ---- traverse the trie and get suffixes in lexicographic order - aux = a; - auxWritten = j; - traverseTrie(root); - - } - - /** - * this procedures traverse the trie in depth first order so that the suffixes (stored in the leaf) are recovered in - * lexicographic order - */ - private void traverseTrie(Node h) { - Node p, nextp; - - if (h.skip < 0) - suffixArray[aux + auxWritten++] = h.downInt; - else { - p = h.down; - do { - nextp = p.right; - if (nextp != null) { - // if there are 2 nodes with equal keys - // they must be considered in inverted order - if (nextp.key == p.key) { - traverseTrie(nextp); - traverseTrie(p); - p = nextp.right; - continue; - } - } - traverseTrie(p); - p = nextp; - } while (p != null); - } - - } - - /** - * insert a suffix in the trie rooted at *p. 
we know that the trie already contains a string which share the first n - * chars with suf - */ - private void insertSuffix(Node h, int suf, int n, int mmchar) { - int c, s; - Node p, pp; - - s = suf; - - // --------- insert a new node before node *h if necessary - if (h.skip != n) { - p = new Node(); - p.key = mmchar; - p.skip = h.skip; // p inherits skip and children of *h - p.down = h.down; - p.downInt = h.downInt; - p.right = null; - h.skip = n; - h.down = p; // now *h has p as the only child - } - - // -------- search the position of s[n] among *h offsprings - c = text[this.start + s + n]; - pp = h.down; - while (pp != null) { - if (pp.key >= c) - break; - pp = pp.right; - } - // ------- insert new node containing suf - p = new Node(); - p.skip = -1; - p.key = c; - p.right = pp; - pp = p; - p.downInt = suf; - return; - - } - - /** - * this function returns the lcp between suf1 and suf2 (that is returns n such that suf1[n]!=suf2[n] but - * suf1[i]==suf2[i] for i=0..n-1 However, it is possible that suf1 is a prefix of suf2 (not vice-versa because of - * the initial sorting of suffixes in order of descreasing length) in this case the function returns - * n=length(suf1)-1. So in this case suf1[n]==suf2[n] (and suf1[n+1] does not exists). 
- */ - private int compareSuffixes(int suf1, int suf2, int depth) { - int limit; - int s1, s2; - - s1 = depth + suf1; - s2 = depth + suf2; - limit = textSize - suf1 - depth; - return depth + getLcpUnrolled(s1, s2, limit); - } - - /** - * + private int cmpUnrolledLcp(int b1, int b2) { + + int c1, c2; + cmpDone = 0; + + // execute blocks of 16 comparisons untill a difference + // is found or we run out of the string + do { + // 1 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + return (c1 - c2); + } + b1++; + b2++; + // 2 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpDone += 1; + return (c1 - c2); + } + b1++; + b2++; + // 3 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpDone += 2; + return (c1 - c2); + } + b1++; + b2++; + // 4 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpDone += 3; + return (c1 - c2); + } + b1++; + b2++; + // 5 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpDone += 4; + return (c1 - c2); + } + b1++; + b2++; + // 6 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpDone += 5; + return (c1 - c2); + } + b1++; + b2++; + // 7 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpDone += 6; + return (c1 - c2); + } + b1++; + b2++; + // 8 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpDone += 7; + return (c1 - c2); + } + b1++; + b2++; + // 9 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpDone += 8; + return (c1 - c2); + } + b1++; + b2++; + // 10 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpDone += 9; + return (c1 - c2); + } + b1++; + b2++; + // 11 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpDone += 10; + return (c1 - c2); + } + b1++; + b2++; + // 12 + c1 = text[this.start + 
b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpDone += 11; + return (c1 - c2); + } + b1++; + b2++; + // 13 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpDone += 12; + return (c1 - c2); + } + b1++; + b2++; + // 14 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpDone += 13; + return (c1 - c2); + } + b1++; + b2++; + // 15 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpDone += 14; + return (c1 - c2); + } + b1++; + b2++; + // 16 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmpDone += 15; + return (c1 - c2); + } + b1++; + b2++; + + cmpDone += 16; + + } while (b1 < textSize && b2 < textSize); + + return b2 - b1; + + } + + /** + * */ - private int getLcpUnrolled(int b1, int b2, int cmp_limit) { - int cmp2do; - int c1, c2; - - // execute blocks of 16 comparisons untill a difference - // is found or we reach cmp_limit comparisons - cmp2do = cmp_limit; - do { - // 1 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - break; - } - b1++; - b2++; - // 2 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmp2do -= 1; - break; - } - b1++; - b2++; - // 3 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmp2do -= 2; - break; - } - b1++; - b2++; - // 4 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmp2do -= 3; - break; - } - b1++; - b2++; - // 5 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmp2do -= 4; - break; - } - b1++; - b2++; - // 6 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmp2do -= 5; - break; - } - b1++; - b2++; - // 7 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmp2do -= 6; - break; - } - b1++; - b2++; - // 8 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmp2do -= 
7; - break; - } - b1++; - b2++; - // 9 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmp2do -= 8; - break; - } - b1++; - b2++; - // 10 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmp2do -= 9; - break; - } - b1++; - b2++; - // 11 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmp2do -= 10; - break; - } - b1++; - b2++; - // 12 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmp2do -= 11; - break; - } - b1++; - b2++; - // 13 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmp2do -= 12; - break; - } - b1++; - b2++; - // 14 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmp2do -= 13; - break; - } - b1++; - b2++; - // 15 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmp2do -= 14; - break; - } - b1++; - b2++; - // 16 - c1 = text[this.start + b1]; - c2 = text[this.start + b2]; - if (c1 != c2) { - cmp2do -= 15; - break; - } - b1++; - b2++; - - cmp2do -= 16; - } while (cmp2do > 0); - - if (cmp_limit - cmp2do < cmp_limit) - return cmp_limit - cmp2do; - - return cmp_limit - 1; - } - - /** - * this function traverses the trie rooted at head following the string s. 
Returns the leaf "corresponding" to the - * string s - */ - private Node findCompanion(Node head, int s) { - int c; - Node p; - int t; - - stackSize = 0; // init stack - while (head.skip >= 0) { - stack[stackSize++] = head; - t = head.skip; - if (s + t >= textSize) // s[t] does not exist: mismatch - return getLeaf(head); - c = text[this.start + s + t]; - p = head.down; - boolean repeat = true; - while (repeat) { - if (c == p.key) { // found branch corresponding to c - head = p; - repeat = false; - } else if (c < p.key) // no branch corresponding to c: mismatch - { - return getLeaf(head); - } - if (repeat && (p = (p.right)) == null) // no other branches: mismatch - { - return getLeaf(head); - } - } - } - stack[stackSize++] = head; - return head; - } - - /** - * this function returns a leaf below "head". any leaf will do for the algorithm: we take the easiest to reach - */ - private Node getLeaf(Node head) { - Tools.assertAlways(head.skip >= 0, ""); - do { - head = head.down; - } while (head.skip >= 0); - return head; - } - - /** - * + private void swap(int i, int j, int a) { + int tmp = suffixArray[a + i]; + suffixArray[a + i] = suffixArray[a + j]; + suffixArray[a + j] = tmp; + } + + /** + * routine for deep-sorting the suffixes a[0] ... 
a[n-1] knowing that they have a common prefix of length "depth" */ - @SuppressWarnings("unused") + private void blindSsort(int a, int n, int depth) { + int i, j, aj, lcp; + Node nh, root, h; + + // ---- sort suffixes in order of increasing length + // qsort(a, n, sizeof(Int32), neg_integer_cmp); + Arrays.sort(suffixArray, a, a + n); + for (int left = 0, right = n - 1; left < right; left++, right--) { + // exchange the first and last + int temp = suffixArray[a + left]; + suffixArray[a + left] = suffixArray[a + right]; + suffixArray[a + right] = temp; + } + + // --- skip suffixes which have already reached the end-of-text + for (j = 0; j < n; j++) + if (suffixArray[a + j] + depth < textSize) + break; + if (j >= n - 1) + return; // everything is already sorted! + + // ------ init stack ------- + // stack = (node **) malloc(n*sizeof(node *)); + + // ------- init root with the first unsorted suffix + nh = new Node(); + nh.skip = -1; + nh.right = null; + // nh.down = (void *) a[j]; + nh.downInt = suffixArray[a + j]; + root = nh; + + // ------- insert suffixes a[j+1] ... 
a[n-1] + for (i = j + 1; i < n; i++) { + h = findCompanion(root, suffixArray[a + i]); + aj = h.downInt; + lcp = compareSuffixes(aj, suffixArray[a + i], depth); + insertSuffix(root, suffixArray[a + i], lcp, text[this.start + aj + lcp]); + } + + // ---- traverse the trie and get suffixes in lexicographic order + aux = a; + auxWritten = j; + traverseTrie(root); + + } + + /** + * this procedures traverse the trie in depth first order so that the suffixes (stored in the leaf) are recovered in + * lexicographic order + */ + private void traverseTrie(Node h) { + Node p, nextp; + + if (h.skip < 0) + suffixArray[aux + auxWritten++] = h.downInt; + else { + p = h.down; + do { + nextp = p.right; + if (nextp != null) { + // if there are 2 nodes with equal keys + // they must be considered in inverted order + if (nextp.key == p.key) { + traverseTrie(nextp); + traverseTrie(p); + p = nextp.right; + continue; + } + } + traverseTrie(p); + p = nextp; + } while (p != null); + } + + } + + /** + * insert a suffix in the trie rooted at *p. 
we know that the trie already contains a string which share the first n + * chars with suf + */ + private void insertSuffix(Node h, int suf, int n, int mmchar) { + int c, s; + Node p, pp; + + s = suf; + + // --------- insert a new node before node *h if necessary + if (h.skip != n) { + p = new Node(); + p.key = mmchar; + p.skip = h.skip; // p inherits skip and children of *h + p.down = h.down; + p.downInt = h.downInt; + p.right = null; + h.skip = n; + h.down = p; // now *h has p as the only child + } + + // -------- search the position of s[n] among *h offsprings + c = text[this.start + s + n]; + pp = h.down; + while (pp != null) { + if (pp.key >= c) + break; + pp = pp.right; + } + // ------- insert new node containing suf + p = new Node(); + p.skip = -1; + p.key = c; + p.right = pp; + pp = p; + p.downInt = suf; + return; + + } + + /** + * this function returns the lcp between suf1 and suf2 (that is returns n such that suf1[n]!=suf2[n] but + * suf1[i]==suf2[i] for i=0..n-1 However, it is possible that suf1 is a prefix of suf2 (not vice-versa because of + * the initial sorting of suffixes in order of descreasing length) in this case the function returns + * n=length(suf1)-1. So in this case suf1[n]==suf2[n] (and suf1[n+1] does not exists). 
+ */ + private int compareSuffixes(int suf1, int suf2, int depth) { + int limit; + int s1, s2; + + s1 = depth + suf1; + s2 = depth + suf2; + limit = textSize - suf1 - depth; + return depth + getLcpUnrolled(s1, s2, limit); + } + + /** + * + */ + private int getLcpUnrolled(int b1, int b2, int cmp_limit) { + int cmp2do; + int c1, c2; + + // execute blocks of 16 comparisons untill a difference + // is found or we reach cmp_limit comparisons + cmp2do = cmp_limit; + do { + // 1 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + break; + } + b1++; + b2++; + // 2 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmp2do -= 1; + break; + } + b1++; + b2++; + // 3 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmp2do -= 2; + break; + } + b1++; + b2++; + // 4 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmp2do -= 3; + break; + } + b1++; + b2++; + // 5 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmp2do -= 4; + break; + } + b1++; + b2++; + // 6 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmp2do -= 5; + break; + } + b1++; + b2++; + // 7 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmp2do -= 6; + break; + } + b1++; + b2++; + // 8 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmp2do -= 7; + break; + } + b1++; + b2++; + // 9 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmp2do -= 8; + break; + } + b1++; + b2++; + // 10 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmp2do -= 9; + break; + } + b1++; + b2++; + // 11 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmp2do -= 10; + break; + } + b1++; + b2++; + // 12 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmp2do -= 11; + break; + } + b1++; 
+ b2++; + // 13 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmp2do -= 12; + break; + } + b1++; + b2++; + // 14 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmp2do -= 13; + break; + } + b1++; + b2++; + // 15 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmp2do -= 14; + break; + } + b1++; + b2++; + // 16 + c1 = text[this.start + b1]; + c2 = text[this.start + b2]; + if (c1 != c2) { + cmp2do -= 15; + break; + } + b1++; + b2++; + + cmp2do -= 16; + } while (cmp2do > 0); + + if (cmp_limit - cmp2do < cmp_limit) + return cmp_limit - cmp2do; + + return cmp_limit - 1; + } + + /** + * this function traverses the trie rooted at head following the string s. Returns the leaf "corresponding" to the + * string s + */ + private Node findCompanion(Node head, int s) { + int c; + Node p; + int t; + + stackSize = 0; // init stack + while (head.skip >= 0) { + stack[stackSize++] = head; + t = head.skip; + if (s + t >= textSize) // s[t] does not exist: mismatch + return getLeaf(head); + c = text[this.start + s + t]; + p = head.down; + boolean repeat = true; + while (repeat) { + if (c == p.key) { // found branch corresponding to c + head = p; + repeat = false; + } else if (c < p.key) // no branch corresponding to c: mismatch + { + return getLeaf(head); + } + if (repeat && (p = (p.right)) == null) // no other branches: mismatch + { + return getLeaf(head); + } + } + } + stack[stackSize++] = head; + return head; + } + + /** + * this function returns a leaf below "head". 
any leaf will do for the algorithm: we take the easiest to reach + */ + private Node getLeaf(Node head) { + Tools.assertAlways(head.skip >= 0, ""); + do { + head = head.down; + } while (head.skip >= 0); + return head; + } + + /** + * + */ + @SuppressWarnings("unused") private void pseudoAnchorSort(int a, int n, int pseudo_anchor_pos, int offset) { - int pseudo_anchor_rank; - - // ---------- compute rank ------------ - if (UPDATE_ANCHOR_RANKS && anchorDist > 0) - pseudo_anchor_rank = getRankUpdateAnchors(pseudo_anchor_pos); - else - pseudo_anchor_rank = getRank(pseudo_anchor_pos); - // ---------- check rank -------------- - assert (suffixArray[pseudo_anchor_rank] == pseudo_anchor_pos); - // ---------- do the sorting ---------- - generalAnchorSort(a, n, pseudo_anchor_pos, pseudo_anchor_rank, offset); - - } - - /** - * compute the rank of the suffix starting at pos. It is required that the suffix is in an already sorted bucket - */ - private int getRank(int pos) { - int sb, lo, hi, j; - - sb = getSmallBucket(pos); - if (!isSortedBucket(sb)) { - throw new RuntimeException("Illegal call to get_rank! (get_rank1)"); - } - lo = bucketFirst(sb); - hi = bucketLast(sb); - for (j = lo; j <= hi; j++) - if (suffixArray[j] == pos) - return j; - throw new RuntimeException("Illegal call to get_rank! (get_rank2)"); - } - - /** - * compute the rank of the suffix starting at pos. At the same time check if the rank of the suffixes in the bucket - * containing pos can be used to update some entries in anchorOffset[] and anchorRank[] It is required that the - * suffix is in an already sorted bucket - */ - private int getRankUpdateAnchors(int pos) { - int sb, lo, hi, j, toffset, aoffset, anchor, rank; - - // --- get bucket and verify it is a sorted one - sb = getSmallBucket(pos); - if (!(isSortedBucket(sb))) { - throw new RuntimeException("Illegal call to get_rank! 
(get_rank_update_anchors)"); - } - // --- if the bucket has been already ranked just compute rank; - if (bucketRanked[sb] != 0) - return getRank(pos); - // --- rank all the bucket - bucketRanked[sb] = 1; - rank = -1; - lo = bucketFirst(sb); - hi = bucketLast(sb); - for (j = lo; j <= hi; j++) { - // see if we can update an anchor - toffset = suffixArray[j] % anchorDist; - anchor = suffixArray[j] / anchorDist; - aoffset = anchorOffset[anchor]; // dist of sorted suf from anchor - if (toffset < aoffset) { - anchorOffset[anchor] = toffset; - anchorRank[anchor] = j; - } - // see if we have found the rank of pos, if so store it in rank - if (suffixArray[j] == pos) { - assert (rank == -1); - rank = j; - } - } - assert (rank >= 0); - return rank; - } - - private void swap2(int a, int b) { - int tmp = suffixArray[a]; - suffixArray[a] = suffixArray[b]; - suffixArray[b] = tmp; - - } - - /* - * #define ptr2char32(i) (getword32(*(i) + text_depth)) - */ - private int ptr2char32(int a, int depth) { - return getword32(suffixArray[a] + depth); - } - - /* - * #define getword32(s) ((unsigned)( (*(s) << 24) | ((*((s)+1)) << 16) \ | ((*((s)+2)) - * << 8) | (*((s)+3)) )) - */ - private int getword32(int s) { - return text[this.start + s] << 24 | text[this.start + s + 1] << 16 - | text[this.start + s + 2] << 8 | text[this.start + s + 3]; - } - - private int ptr2char(int i, int text_depth) { - return text[this.start + suffixArray[i] + text_depth]; - } - - private int med3(int a, int b, int c, int depth) { - int va = ptr2char(a, depth); - int vb = ptr2char(b, depth); - if (va == vb) { - return a; - } - int vc = ptr2char(c, depth); - if (vc == va || vc == vb) { - return c; - } - return va < vb ? (vb < vc ? b : (va < vc ? c : a)) : (vb > vc ? b : (va < vc ? 
a : c)); - } - - private void calculateRunningOrder() { - int i, j; - for (i = 0; i <= 256; i++) - runningOrder[i] = i; - { - int vv; - int h = 1; - do - h = 3 * h + 1; - while (h <= 257); - do { - h = h / 3; - for (i = h; i <= 256; i++) { - vv = runningOrder[i]; - j = i; - while (bigFreq(runningOrder[j - h]) > bigFreq(vv)) { - runningOrder[j] = runningOrder[j - h]; - j = j - h; - if (j <= (h - 1)) - break; - } - runningOrder[j] = vv; - } - } while (h != 1); - } - } - - /** - * + int pseudo_anchor_rank; + + // ---------- compute rank ------------ + if (UPDATE_ANCHOR_RANKS && anchorDist > 0) + pseudo_anchor_rank = getRankUpdateAnchors(pseudo_anchor_pos); + else + pseudo_anchor_rank = getRank(pseudo_anchor_pos); + // ---------- check rank -------------- + assert (suffixArray[pseudo_anchor_rank] == pseudo_anchor_pos); + // ---------- do the sorting ---------- + generalAnchorSort(a, n, pseudo_anchor_pos, pseudo_anchor_rank, offset); + + } + + /** + * compute the rank of the suffix starting at pos. It is required that the suffix is in an already sorted bucket + */ + private int getRank(int pos) { + int sb, lo, hi, j; + + sb = getSmallBucket(pos); + if (!isSortedBucket(sb)) { + throw new RuntimeException("Illegal call to get_rank! (get_rank1)"); + } + lo = bucketFirst(sb); + hi = bucketLast(sb); + for (j = lo; j <= hi; j++) + if (suffixArray[j] == pos) + return j; + throw new RuntimeException("Illegal call to get_rank! (get_rank2)"); + } + + /** + * compute the rank of the suffix starting at pos. 
At the same time check if the rank of the suffixes in the bucket + * containing pos can be used to update some entries in anchorOffset[] and anchorRank[] It is required that the + * suffix is in an already sorted bucket + */ + private int getRankUpdateAnchors(int pos) { + int sb, lo, hi, j, toffset, aoffset, anchor, rank; + + // --- get bucket and verify it is a sorted one + sb = getSmallBucket(pos); + if (!(isSortedBucket(sb))) { + throw new RuntimeException("Illegal call to get_rank! (get_rank_update_anchors)"); + } + // --- if the bucket has been already ranked just compute rank; + if (bucketRanked[sb] != 0) + return getRank(pos); + // --- rank all the bucket + bucketRanked[sb] = 1; + rank = -1; + lo = bucketFirst(sb); + hi = bucketLast(sb); + for (j = lo; j <= hi; j++) { + // see if we can update an anchor + toffset = suffixArray[j] % anchorDist; + anchor = suffixArray[j] / anchorDist; + aoffset = anchorOffset[anchor]; // dist of sorted suf from anchor + if (toffset < aoffset) { + anchorOffset[anchor] = toffset; + anchorRank[anchor] = j; + } + // see if we have found the rank of pos, if so store it in rank + if (suffixArray[j] == pos) { + assert (rank == -1); + rank = j; + } + } + assert (rank >= 0); + return rank; + } + + private void swap2(int a, int b) { + int tmp = suffixArray[a]; + suffixArray[a] = suffixArray[b]; + suffixArray[b] = tmp; + + } + + /* + * #define ptr2char32(i) (getword32(*(i) + text_depth)) + */ + private int ptr2char32(int a, int depth) { + return getword32(suffixArray[a] + depth); + } + + /* + * #define getword32(s) ((unsigned)( (*(s) << 24) | ((*((s)+1)) << 16) \ | ((*((s)+2)) + * << 8) | (*((s)+3)) )) + */ + private int getword32(int s) { + return text[this.start + s] << 24 | text[this.start + s + 1] << 16 + | text[this.start + s + 2] << 8 | text[this.start + s + 3]; + } + + private int ptr2char(int i, int text_depth) { + return text[this.start + suffixArray[i] + text_depth]; + } + + private int med3(int a, int b, int c, int depth) { + 
int va = ptr2char(a, depth); + int vb = ptr2char(b, depth); + if (va == vb) { + return a; + } + int vc = ptr2char(c, depth); + if (vc == va || vc == vb) { + return c; + } + return va < vb ? (vb < vc ? b : (va < vc ? c : a)) : (vb > vc ? b : (va < vc ? a : c)); + } + + private void calculateRunningOrder() { + int i, j; + for (i = 0; i <= 256; i++) + runningOrder[i] = i; + { + int vv; + int h = 1; + do + h = 3 * h + 1; + while (h <= 257); + do { + h = h / 3; + for (i = h; i <= 256; i++) { + vv = runningOrder[i]; + j = i; + while (bigFreq(runningOrder[j - h]) > bigFreq(vv)) { + runningOrder[j] = runningOrder[j - h]; + j = j - h; + if (j <= (h - 1)) + break; + } + runningOrder[j] = vv; + } + } while (h != 1); + } + } + + /** + * */ - private int bigFreq(int b) { - return ftab[((b) + 1) << 8] - ftab[(b) << 8]; - } - - public static void main(String[] args) { - for (int i = 0; i < 5; i++) { - System.gc(); - } - int size = 1000000; - final Runtime rt = Runtime.getRuntime(); - long before, after; - Node[] nodes = new Node[size]; - before = rt.totalMemory() - rt.freeMemory(); - for (int i = 0; i < size; i++) { - nodes[i] = new Node(); - } - after = rt.totalMemory() - rt.freeMemory(); - - double a = 1.0 * (after - before) / size; - - System.out.println(before + " " + after + " " + size + " " + a); - - } + private int bigFreq(int b) { + return ftab[((b) + 1) << 8] - ftab[(b) << 8]; + } + + public static void main(String[] args) { + for (int i = 0; i < 5; i++) { + System.gc(); + } + int size = 1000000; + final Runtime rt = Runtime.getRuntime(); + long before, after; + Node[] nodes = new Node[size]; + before = rt.totalMemory() - rt.freeMemory(); + for (int i = 0; i < size; i++) { + nodes[i] = new Node(); + } + after = rt.totalMemory() - rt.freeMemory(); + + double a = 1.0 * (after - before) / size; + + System.out.println(before + " " + after + " " + size + " " + a); + + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DensePositiveDecorator.java 
b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DensePositiveDecorator.java index ba14dbd3c..43a515240 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DensePositiveDecorator.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DensePositiveDecorator.java @@ -9,43 +9,34 @@ * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) */ -public final class DensePositiveDecorator implements ISuffixArrayBuilder -{ +public final class DensePositiveDecorator implements ISuffixArrayBuilder { private final ISuffixArrayBuilder delegate; /* - * + * */ - public DensePositiveDecorator(ISuffixArrayBuilder delegate) - { + public DensePositiveDecorator(ISuffixArrayBuilder delegate) { this.delegate = delegate; } /* - * + * */ @Override - public int [] buildSuffixArray(int [] input, final int start, final int length) - { + public int[] buildSuffixArray(int[] input, final int start, final int length) { final MinMax minmax = Tools.minmax(input, start, length); final ISymbolMapper mapper; - if (minmax.range() > 0x10000) - { + if (minmax.range() > 0x10000) { throw new RuntimeException("Large symbol space not implemented yet."); - } - else - { + } else { mapper = new DensePositiveMapper(input, start, length); } mapper.map(input, start, length); - try - { + try { return delegate.buildSuffixArray(input, start, length); - } - finally - { + } finally { mapper.undo(input, start, length); } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DensePositiveMapper.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DensePositiveMapper.java index ff7f6abdc..307e72ae6 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DensePositiveMapper.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DensePositiveMapper.java @@ -8,44 +8,38 @@ * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) */ 
-final class DensePositiveMapper implements ISymbolMapper -{ +final class DensePositiveMapper implements ISymbolMapper { private final int offset; - private final int [] forward; - private final int [] backward; + private final int[] forward; + private final int[] backward; /* - * + * */ - public DensePositiveMapper(int [] input, int start, int length) - { + public DensePositiveMapper(int[] input, int start, int length) { final MinMax minmax = Tools.minmax(input, start, length); final int min = minmax.min; final int max = minmax.max; - final int [] forward = new int [max - min + 1]; + final int[] forward = new int[max - min + 1]; final int offset = -min; // Mark all symbols present in the alphabet. final int end = start + length; - for (int i = start; i < end; i++) - { + for (int i = start; i < end; i++) { forward[input[i] + offset] = 1; } - + // Collect present symbols, assign unique codes. int k = 1; - for (int i = 0; i < forward.length; i++) - { - if (forward[i] != 0) - { + for (int i = 0; i < forward.length; i++) { + if (forward[i] != 0) { forward[i] = k++; } } - final int [] backward = new int [k]; - for (int i = start; i < end; i++) - { + final int[] backward = new int[k]; + for (int i = start; i < end; i++) { final int v = forward[input[i] + offset]; backward[v] = input[i]; } @@ -56,25 +50,21 @@ public DensePositiveMapper(int [] input, int start, int length) } /* - * + * */ @Override - public void map(int [] input, final int start, final int length) - { - for (int i = start, l = length; l > 0; l--, i++) - { + public void map(int[] input, final int start, final int length) { + for (int i = start, l = length; l > 0; l--, i++) { input[i] = forward[input[i] + offset]; } } /* - * + * */ @Override - public void undo(int [] input, final int start, final int length) - { - for (int i = start, l = length; l > 0; l--, i++) - { + public void undo(int[] input, final int start, final int length) { + for (int i = start, l = length; l > 0; l--, i++) { input[i] = 
backward[input[i]]; } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DivSufSort.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DivSufSort.java index 447dcf0ea..66f3bf196 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DivSufSort.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/DivSufSort.java @@ -17,18 +17,15 @@ * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) */ -public final class DivSufSort implements ISuffixArrayBuilder -{ +public final class DivSufSort implements ISuffixArrayBuilder { /* - * + * */ - private final static class StackElement - { + private final static class StackElement { final int a, b, c, e; int d; - StackElement(int a, int b, int c, int d, int e) - { + StackElement(int a, int b, int c, int d, int e) { this.a = a; this.b = b; this.c = c; @@ -36,38 +33,32 @@ private final static class StackElement this.e = e; } - StackElement(int a, int b, int c, int d) - { + StackElement(int a, int b, int c, int d) { this(a, b, c, d, 0); } } /* - * + * */ - private final static class TRBudget - { + private final static class TRBudget { int chance; int remain; int incval; int count; - private TRBudget(int chance, int incval) - { + private TRBudget(int chance, int incval) { this.chance = chance; this.remain = incval; this.incval = incval; } - private int check(int size) - { - if (size <= this.remain) - { + private int check(int size) { + if (size <= this.remain) { this.remain -= size; return 1; } - if (this.chance == 0) - { + if (this.chance == 0) { this.count += size; return 0; } @@ -78,22 +69,19 @@ private int check(int size) } /* - * + * */ - private static final class TRPartitionResult - { + private static final class TRPartitionResult { final int a; final int b; - public TRPartitionResult(int a, int b) - { + public TRPartitionResult(int a, int b) { this.a = a; this.b = b; } } - public DivSufSort() - { + public 
DivSufSort() { ALPHABET_SIZE = DEFAULT_ALPHABET_SIZE; BUCKET_A_SIZE = ALPHABET_SIZE; BUCKET_B_SIZE = ALPHABET_SIZE * ALPHABET_SIZE; @@ -102,8 +90,7 @@ public DivSufSort() /** * @param alphabetSize */ - public DivSufSort(int alphabetSize) - { + public DivSufSort(int alphabetSize) { ALPHABET_SIZE = alphabetSize; BUCKET_A_SIZE = ALPHABET_SIZE; BUCKET_B_SIZE = ALPHABET_SIZE * ALPHABET_SIZE; @@ -119,46 +106,46 @@ public DivSufSort(int alphabetSize) private final static int TR_STACKSIZE = 64; private final static int TR_INSERTIONSORT_THRESHOLD = 8; - private final static int [] sqq_table = - { - 0, 16, 22, 27, 32, 35, 39, 42, 45, 48, 50, 53, 55, 57, 59, 61, 64, 65, 67, 69, - 71, 73, 75, 76, 78, 80, 81, 83, 84, 86, 87, 89, 90, 91, 93, 94, 96, 97, 98, 99, - 101, 102, 103, 104, 106, 107, 108, 109, 110, 112, 113, 114, 115, 116, 117, 118, - 119, 120, 121, 122, 123, 124, 125, 126, 128, 128, 129, 130, 131, 132, 133, 134, - 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 144, 145, 146, 147, 148, 149, - 150, 150, 151, 152, 153, 154, 155, 155, 156, 157, 158, 159, 160, 160, 161, 162, - 163, 163, 164, 165, 166, 167, 167, 168, 169, 170, 170, 171, 172, 173, 173, 174, - 175, 176, 176, 177, 178, 178, 179, 180, 181, 181, 182, 183, 183, 184, 185, 185, - 186, 187, 187, 188, 189, 189, 190, 191, 192, 192, 193, 193, 194, 195, 195, 196, - 197, 197, 198, 199, 199, 200, 201, 201, 202, 203, 203, 204, 204, 205, 206, 206, - 207, 208, 208, 209, 209, 210, 211, 211, 212, 212, 213, 214, 214, 215, 215, 216, - 217, 217, 218, 218, 219, 219, 220, 221, 221, 222, 222, 223, 224, 224, 225, 225, - 226, 226, 227, 227, 228, 229, 229, 230, 230, 231, 231, 232, 232, 233, 234, 234, - 235, 235, 236, 236, 237, 237, 238, 238, 239, 240, 240, 241, 241, 242, 242, 243, - 243, 244, 244, 245, 245, 246, 246, 247, 247, 248, 248, 249, 249, 250, 250, 251, - 251, 252, 252, 253, 253, 254, 254, 255 - }; - - private final static int [] lg_table = - { - -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
4, - 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 - }; + private final static int[] sqq_table = + { + 0, 16, 22, 27, 32, 35, 39, 42, 45, 48, 50, 53, 55, 57, 59, 61, 64, 65, 67, 69, + 71, 73, 75, 76, 78, 80, 81, 83, 84, 86, 87, 89, 90, 91, 93, 94, 96, 97, 98, 99, + 101, 102, 103, 104, 106, 107, 108, 109, 110, 112, 113, 114, 115, 116, 117, 118, + 119, 120, 121, 122, 123, 124, 125, 126, 128, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 144, 145, 146, 147, 148, 149, + 150, 150, 151, 152, 153, 154, 155, 155, 156, 157, 158, 159, 160, 160, 161, 162, + 163, 163, 164, 165, 166, 167, 167, 168, 169, 170, 170, 171, 172, 173, 173, 174, + 175, 176, 176, 177, 178, 178, 179, 180, 181, 181, 182, 183, 183, 184, 185, 185, + 186, 187, 187, 188, 189, 189, 190, 191, 192, 192, 193, 193, 194, 195, 195, 196, + 197, 197, 198, 199, 199, 200, 201, 201, 202, 203, 203, 204, 204, 205, 206, 206, + 207, 208, 208, 209, 209, 210, 211, 211, 212, 212, 213, 214, 214, 215, 215, 216, + 217, 217, 218, 218, 219, 219, 220, 221, 221, 222, 222, 223, 224, 224, 225, 225, + 226, 226, 227, 227, 228, 229, 229, 230, 230, 231, 231, 232, 232, 233, 234, 234, + 235, 235, 236, 236, 237, 237, 238, 238, 239, 240, 240, 241, 241, 242, 242, 243, + 243, 244, 244, 245, 245, 246, 246, 247, 247, 248, 248, 249, 249, 250, 250, 251, + 251, 252, 
252, 253, 253, 254, 254, 255 + }; + + private final static int[] lg_table = + { + -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 + }; /* fields */ private final int ALPHABET_SIZE; private final int BUCKET_A_SIZE; private final int BUCKET_B_SIZE; - private int [] SA; - private int [] T; + private int[] SA; + private int[] T; private int start; /** @@ -168,24 +155,23 @@ public DivSufSort(int alphabetSize) *

      *
    • non-negative (≥0) symbols in the input
    • *
    • symbols limited by alphabet size passed in the constructor.
    • - *
    • length >= 2
    • + *
    • length >= 2
    • *
    *

    */ @Override - public final int [] buildSuffixArray(int [] input, int start, int length) - { + public final int[] buildSuffixArray(int[] input, int start, int length) { Tools.assertAlways(input != null, "input must not be null"); Tools.assertAlways(length >= 2, "input length must be >= 2"); MinMax mm = Tools.minmax(input, start, length); Tools.assertAlways(mm.min >= 0, "input must not be negative"); Tools.assertAlways(mm.max < ALPHABET_SIZE, "max alphabet size is " + ALPHABET_SIZE); - final int [] ret = new int [length]; + final int[] ret = new int[length]; this.SA = ret; this.T = input; - int [] bucket_A = new int [BUCKET_A_SIZE]; - int [] bucket_B = new int [BUCKET_B_SIZE]; + int[] bucket_A = new int[BUCKET_A_SIZE]; + int[] bucket_B = new int[BUCKET_B_SIZE]; this.start = start; /* Suffixsort. */ int m = sortTypeBstar(bucket_A, bucket_B, length); @@ -196,24 +182,19 @@ public DivSufSort(int alphabetSize) /** * Constructs the suffix array by using the sorted order of type B* suffixes. */ - private final void constructSuffixArray(int [] bucket_A, int [] bucket_B, int n, int m) - { + private final void constructSuffixArray(int[] bucket_A, int[] bucket_B, int n, int m) { int i, j, k; // ptr int s, c0, c1, c2; // (_c1)]) - if (0 < m) - { + if (0 < m) { /* * Construct the sorted order of type B suffixes by using the sorted order of * type B suffixes. */ - for (c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) - { + for (c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { /* Scan the suffix array from right to left. 
*/ - for (i = bucket_B[(c1) * ALPHABET_SIZE + (c1 + 1)], j = bucket_A[c1 + 1] - 1, k = 0, c2 = -1; i <= j; --j) - { - if (0 < (s = SA[j])) - { + for (i = bucket_B[(c1) * ALPHABET_SIZE + (c1 + 1)], j = bucket_A[c1 + 1] - 1, k = 0, c2 = -1; i <= j; --j) { + if (0 < (s = SA[j])) { // Tools.assertAlways(T[s] == c1, ""); // Tools.assertAlways(((s + 1) < n) && (T[s] <= T[s + // 1]), @@ -221,23 +202,18 @@ private final void constructSuffixArray(int [] bucket_A, int [] bucket_B, int n, // Tools.assertAlways(T[s - 1] <= T[s], ""); SA[j] = ~s; c0 = T[start + --s]; - if ((0 < s) && (T[start + s - 1] > c0)) - { + if ((0 < s) && (T[start + s - 1] > c0)) { s = ~s; } - if (c0 != c2) - { - if (0 <= c2) - { + if (c0 != c2) { + if (0 <= c2) { bucket_B[(c1) * ALPHABET_SIZE + (c2)] = k; } k = bucket_B[(c1) * ALPHABET_SIZE + (c2 = c0)]; } // Tools.assertAlways(k < j, ""); SA[k--] = s; - } - else - { + } else { // Tools.assertAlways(((s == 0) && (T[s] == c1)) // || (s < 0), ""); SA[j] = ~s; @@ -252,26 +228,20 @@ private final void constructSuffixArray(int [] bucket_A, int [] bucket_B, int n, k = bucket_A[c2 = T[start + n - 1]]; SA[k++] = (T[start + n - 2] < c2) ? ~(n - 1) : (n - 1); /* Scan the suffix array from left to right. 
*/ - for (i = 0, j = n; i < j; ++i) - { - if (0 < (s = SA[i])) - { + for (i = 0, j = n; i < j; ++i) { + if (0 < (s = SA[i])) { // Tools.assertAlways(T[s - 1] >= T[s], ""); c0 = T[start + --s]; - if ((s == 0) || (T[start + s - 1] < c0)) - { + if ((s == 0) || (T[start + s - 1] < c0)) { s = ~s; } - if (c0 != c2) - { + if (c0 != c2) { bucket_A[c2] = k; k = bucket_A[c2 = c0]; } // Tools.assertAlways(i < k, ""); SA[k++] = s; - } - else - { + } else { // Tools.assertAlways(s < 0, ""); SA[i] = ~s; } @@ -279,10 +249,9 @@ private final void constructSuffixArray(int [] bucket_A, int [] bucket_B, int n, } /** - * - */ - private final int sortTypeBstar(int [] bucket_A, int [] bucket_B, int n) - { + * + */ + private final int sortTypeBstar(int[] bucket_A, int[] bucket_B, int n) { int PAb, ISAb, buf; int i, j, k, t, m, bufsize; @@ -293,22 +262,18 @@ private final int sortTypeBstar(int [] bucket_A, int [] bucket_B, int n) * A, B and B suffix. Moreover, store the beginning position of all type B * suffixes into the array SA. */ - for (i = n - 1, m = n, c0 = T[start + n - 1]; 0 <= i;) - { + for (i = n - 1, m = n, c0 = T[start + n - 1]; 0 <= i; ) { /* type A suffix. */ - do - { + do { ++bucket_A[c1 = c0]; } while ((0 <= --i) && ((c0 = T[start + i]) >= c1)); - if (0 <= i) - { + if (0 <= i) { /* type B suffix. */ ++bucket_B[(c0) * ALPHABET_SIZE + (c1)]; SA[--m] = i; /* type B suffix. */ - for (--i, c1 = c0; (0 <= i) && ((c0 = T[start + i]) <= c1); --i, c1 = c0) - { + for (--i, c1 = c0; (0 <= i) && ((c0 = T[start + i]) <= c1); --i, c1 = c0) { ++bucket_B[(c1) * ALPHABET_SIZE + (c0)]; } } @@ -321,26 +286,22 @@ private final int sortTypeBstar(int [] bucket_A, int [] bucket_B, int n) // begins with the same first two characters. // Calculate the index of start/end point of each bucket. 
- for (c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) - { + for (c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) { t = i + bucket_A[c0]; bucket_A[c0] = i + j; /* start point */ i = t + bucket_B[(c0) * ALPHABET_SIZE + (c0)]; - for (c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) - { + for (c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) { j += bucket_B[(c0) * ALPHABET_SIZE + (c1)]; bucket_B[(c0) * ALPHABET_SIZE + (c1)] = j; // end point i += bucket_B[(c1) * ALPHABET_SIZE + (c0)]; } } - if (0 < m) - { + if (0 < m) { // Sort the type B* suffixes by their first two characters. PAb = n - m;// SA ISAb = m;// SA - for (i = m - 2; 0 <= i; --i) - { + for (i = m - 2; 0 <= i; --i) { t = SA[PAb + i]; c0 = T[start + t]; c1 = T[start + t + 1]; @@ -356,38 +317,30 @@ private final int sortTypeBstar(int [] bucket_A, int [] bucket_B, int n) buf = m;// SA bufsize = n - (2 * m); - for (c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) - { - for (c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) - { + for (c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) { + for (c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) { i = bucket_B[(c0) * ALPHABET_SIZE + (c1)]; - if (1 < (j - i)) - { + if (1 < (j - i)) { ssSort(PAb, i, j, buf, bufsize, 2, n, SA[i] == (m - 1)); } } } // Compute ranks of type B* substrings. - for (i = m - 1; 0 <= i; --i) - { - if (0 <= SA[i]) - { + for (i = m - 1; 0 <= i; --i) { + if (0 <= SA[i]) { j = i; - do - { + do { SA[ISAb + SA[i]] = i; } while ((0 <= --i) && (0 <= SA[i])); SA[i + 1] = i - j; - if (i <= 0) - { + if (i <= 0) { break; } } j = i; - do - { + do { SA[ISAb + (SA[i] = ~SA[i])] = j; } while (SA[--i] < 0); @@ -397,16 +350,12 @@ private final int sortTypeBstar(int [] bucket_A, int [] bucket_B, int n) // trsort. trSort(ISAb, m, 1); // Set the sorted order of type B* suffixes. 
- for (i = n - 1, j = m, c0 = T[start + n - 1]; 0 <= i;) - { - for (--i, c1 = c0; (0 <= i) && ((c0 = T[start + i]) >= c1); --i, c1 = c0) - { + for (i = n - 1, j = m, c0 = T[start + n - 1]; 0 <= i; ) { + for (--i, c1 = c0; (0 <= i) && ((c0 = T[start + i]) >= c1); --i, c1 = c0) { } - if (0 <= i) - { + if (0 <= i) { t = i; - for (--i, c1 = c0; (0 <= i) && ((c0 = T[start + i]) <= c1); --i, c1 = c0) - { + for (--i, c1 = c0; (0 <= i) && ((c0 = T[start + i]) <= c1); --i, c1 = c0) { } SA[SA[ISAb + --j]] = ((t == 0) || (1 < (t - i))) ? t : ~t; } @@ -415,17 +364,14 @@ private final int sortTypeBstar(int [] bucket_A, int [] bucket_B, int n) // Calculate the index of start/end point of each bucket. bucket_B[(ALPHABET_SIZE - 1) * ALPHABET_SIZE + (ALPHABET_SIZE - 1)] = n; // end // point - for (c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) - { + for (c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) { i = bucket_A[c0 + 1] - 1; - for (c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) - { + for (c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) { t = i - bucket_B[(c1) * ALPHABET_SIZE + (c0)]; bucket_B[(c1) * ALPHABET_SIZE + (c0)] = i; // end point // Move all type B* suffixes to the correct position. 
- for (i = t, j = bucket_B[(c0) * ALPHABET_SIZE + (c1)]; j <= k; --i, --k) - { + for (i = t, j = bucket_B[(c0) * ALPHABET_SIZE + (c1)]; j <= k; --i, --k) { SA[i] = SA[k]; } } @@ -442,71 +388,57 @@ private final int sortTypeBstar(int [] bucket_A, int [] bucket_B, int n) * */ private final void ssSort(final int PA, int first, int last, int buf, int bufsize, - int depth, int n, boolean lastsuffix) - { + int depth, int n, boolean lastsuffix) { int a, b, middle, curbuf;// SA pointer int j, k, curbufsize, limit; int i; - if (lastsuffix) - { + if (lastsuffix) { ++first; } if ((bufsize < SS_BLOCKSIZE) && (bufsize < (last - first)) - && (bufsize < (limit = ssIsqrt(last - first)))) - { - if (SS_BLOCKSIZE < limit) - { + && (bufsize < (limit = ssIsqrt(last - first)))) { + if (SS_BLOCKSIZE < limit) { limit = SS_BLOCKSIZE; } buf = middle = last - limit; bufsize = limit; - } - else - { + } else { middle = last; limit = 0; } - for (a = first, i = 0; SS_BLOCKSIZE < (middle - a); a += SS_BLOCKSIZE, ++i) - { + for (a = first, i = 0; SS_BLOCKSIZE < (middle - a); a += SS_BLOCKSIZE, ++i) { ssMintroSort(PA, a, a + SS_BLOCKSIZE, depth); curbufsize = last - (a + SS_BLOCKSIZE); curbuf = a + SS_BLOCKSIZE; - if (curbufsize <= bufsize) - { + if (curbufsize <= bufsize) { curbufsize = bufsize; curbuf = buf; } - for (b = a, k = SS_BLOCKSIZE, j = i; (j & 1) != 0; b -= k, k <<= 1, j >>= 1) - { + for (b = a, k = SS_BLOCKSIZE, j = i; (j & 1) != 0; b -= k, k <<= 1, j >>= 1) { ssSwapMerge(PA, b - k, b, b + k, curbuf, curbufsize, depth); } } ssMintroSort(PA, a, middle, depth); - for (k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) - { - if ((i & 1) != 0) - { + for (k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) { + if ((i & 1) != 0) { ssSwapMerge(PA, a - k, a, middle, buf, bufsize, depth); a -= k; } } - if (limit != 0) - { + if (limit != 0) { ssMintroSort(PA, middle, last, depth); ssInplaceMerge(PA, first, middle, last, depth); } - if (lastsuffix) - { + if (lastsuffix) { int p1 = SA[PA + SA[first - 1]]; int 
p11 = n - 2; for (a = first, i = SA[first - 1]; (a < last) - && ((SA[a] < 0) || (0 < ssCompare(p1, p11, PA + SA[a], depth))); ++a) - { + && ((SA[a] < 0) || (0 < ssCompare(p1, p11, PA + SA[a], depth))); ++a) { SA[a - 1] = SA[a]; } SA[a - 1] = i; @@ -518,13 +450,11 @@ private final void ssSort(final int PA, int first, int last, int buf, int bufsiz * special version of ss_compare for handling * ss_compare(T, &(PAi[0]), PA + *a, depth) situation. */ - private final int ssCompare(int pa, int pb, int p2, int depth) - { + private final int ssCompare(int pa, int pb, int p2, int depth) { int U1, U2, U1n, U2n;// pointers to T for (U1 = depth + pa, U2 = depth + SA[p2], U1n = pb + 2, U2n = SA[p2 + 1] + 2; (U1 < U1n) - && (U2 < U2n) && (T[start + U1] == T[start + U2]); ++U1, ++U2) - { + && (U2 < U2n) && (T[start + U1] == T[start + U2]); ++U1, ++U2) { } return U1 < U1n ? (U2 < U2n ? T[start + U1] - T[start + U2] : 1) : (U2 < U2n ? -1 @@ -532,15 +462,13 @@ private final int ssCompare(int pa, int pb, int p2, int depth) } /** - * + * */ - private final int ssCompare(int p1, int p2, int depth) - { + private final int ssCompare(int p1, int p2, int depth) { int U1, U2, U1n, U2n;// pointers to T for (U1 = depth + SA[p1], U2 = depth + SA[p2], U1n = SA[p1 + 1] + 2, U2n = SA[p2 + 1] + 2; (U1 < U1n) - && (U2 < U2n) && (T[start + U1] == T[start + U2]); ++U1, ++U2) - { + && (U2 < U2n) && (T[start + U1] == T[start + U2]); ++U1, ++U2) { } return U1 < U1n ? (U2 < U2n ? T[start + U1] - T[start + U2] : 1) : (U2 < U2n ? 
-1 @@ -549,66 +477,51 @@ private final int ssCompare(int p1, int p2, int depth) } /** - * + * */ - private final void ssInplaceMerge(int PA, int first, int middle, int last, int depth) - { + private final void ssInplaceMerge(int PA, int first, int middle, int last, int depth) { // PA, middle, first, last are pointers to SA int p, a, b;// pointer to SA int len, half; int q, r; int x; - for (;;) - { - if (SA[last - 1] < 0) - { + for (; ; ) { + if (SA[last - 1] < 0) { x = 1; p = PA + ~SA[last - 1]; - } - else - { + } else { x = 0; p = PA + SA[last - 1]; } - for (a = first, len = middle - first, half = len >> 1, r = -1; 0 < len; len = half, half >>= 1) - { + for (a = first, len = middle - first, half = len >> 1, r = -1; 0 < len; len = half, half >>= 1) { b = a + half; q = ssCompare(PA + ((0 <= SA[b]) ? SA[b] : ~SA[b]), p, depth); - if (q < 0) - { + if (q < 0) { a = b + 1; half -= (len & 1) ^ 1; - } - else - { + } else { r = q; } } - if (a < middle) - { - if (r == 0) - { + if (a < middle) { + if (r == 0) { SA[a] = ~SA[a]; } ssRotate(a, middle, last); last -= middle - a; middle = a; - if (first == middle) - { + if (first == middle) { break; } } --last; - if (x != 0) - { - while (SA[--last] < 0) - { + if (x != 0) { + while (SA[--last] < 0) { // nop } } - if (middle == last) - { + if (middle == last) { break; } } @@ -616,37 +529,30 @@ private final void ssInplaceMerge(int PA, int first, int middle, int last, int d } /** - * - */ - private final void ssRotate(int first, int middle, int last) - { + * + */ + private final void ssRotate(int first, int middle, int last) { // first, middle, last are pointers in SA int a, b, t;// pointers in SA int l, r; l = middle - first; r = last - middle; - for (; (0 < l) && (0 < r);) - { - if (l == r) - { + for (; (0 < l) && (0 < r); ) { + if (l == r) { ssBlockSwap(first, middle, l); break; } - if (l < r) - { + if (l < r) { a = last - 1; b = middle - 1; t = SA[a]; - do - { + do { SA[a--] = SA[b]; SA[b--] = SA[a]; - if (b < first) - { + if (b 
< first) { SA[a] = t; last = a; - if ((r -= l + 1) <= l) - { + if ((r -= l + 1) <= l) { break; } a -= 1; @@ -655,22 +561,17 @@ private final void ssRotate(int first, int middle, int last) } } while (true); - } - else - { + } else { a = first; b = middle; t = SA[a]; - do - { + do { SA[a++] = SA[b]; SA[b++] = SA[a]; - if (last <= b) - { + if (last <= b) { SA[a] = t; first = a + 1; - if ((l -= r + 1) <= r) - { + if ((l -= r + 1) <= r) { break; } a += 1; @@ -684,27 +585,23 @@ private final void ssRotate(int first, int middle, int last) } /** - * + * */ - private final void ssBlockSwap(int a, int b, int n) - { + private final void ssBlockSwap(int a, int b, int n) { // a, b -- pointer to SA int t; - for (; 0 < n; --n, ++a, ++b) - { + for (; 0 < n; --n, ++a, ++b) { t = SA[a]; SA[a] = SA[b]; SA[b] = t; } } - private final static int getIDX(int a) - { + private final static int getIDX(int a) { return (0 <= (a)) ? (a) : (~(a)); } - private final static int min(int a, int b) - { + private final static int min(int a, int b) { return a < b ? a : b; } @@ -712,140 +609,110 @@ private final static int min(int a, int b) * D&C based merge. 
*/ private final void ssSwapMerge(int PA, int first, int middle, int last, int buf, - int bufsize, int depth) - { + int bufsize, int depth) { // Pa, first, middle, last and buf - pointers in SA array final int STACK_SIZE = SS_SMERGE_STACKSIZE; - StackElement [] stack = new StackElement [STACK_SIZE]; + StackElement[] stack = new StackElement[STACK_SIZE]; int l, r, lm, rm;// pointers in SA int m, len, half; int ssize; int check, next; - for (check = 0, ssize = 0;;) - { + for (check = 0, ssize = 0; ; ) { - if ((last - middle) <= bufsize) - { - if ((first < middle) && (middle < last)) - { + if ((last - middle) <= bufsize) { + if ((first < middle) && (middle < last)) { ssMergeBackward(PA, first, middle, last, buf, depth); } if (((check & 1) != 0) || (((check & 2) != 0) && (ssCompare(PA + getIDX(SA[first - 1]), PA - + SA[first], depth) == 0))) - { + + SA[first], depth) == 0))) { SA[first] = ~SA[first]; } if (((check & 4) != 0) - && ((ssCompare(PA + getIDX(SA[last - 1]), PA + SA[last], depth) == 0))) - { + && ((ssCompare(PA + getIDX(SA[last - 1]), PA + SA[last], depth) == 0))) { SA[last] = ~SA[last]; } - if (ssize > 0) - { + if (ssize > 0) { StackElement se = stack[--ssize]; first = se.a; middle = se.b; last = se.c; check = se.d; - } - else - { + } else { return; } continue; } - if ((middle - first) <= bufsize) - { - if (first < middle) - { + if ((middle - first) <= bufsize) { + if (first < middle) { ssMergeForward(PA, first, middle, last, buf, depth); } if (((check & 1) != 0) || (((check & 2) != 0) && (ssCompare(PA + getIDX(SA[first - 1]), PA - + SA[first], depth) == 0))) - { + + SA[first], depth) == 0))) { SA[first] = ~SA[first]; } if (((check & 4) != 0) - && ((ssCompare(PA + getIDX(SA[last - 1]), PA + SA[last], depth) == 0))) - { + && ((ssCompare(PA + getIDX(SA[last - 1]), PA + SA[last], depth) == 0))) { SA[last] = ~SA[last]; } - if (ssize > 0) - { + if (ssize > 0) { StackElement se = stack[--ssize]; first = se.a; middle = se.b; last = se.c; check = se.d; - } - else - 
{ + } else { return; } continue; } - for (m = 0, len = min(middle - first, last - middle), half = len >> 1; 0 < len; len = half, half >>= 1) - { + for (m = 0, len = min(middle - first, last - middle), half = len >> 1; 0 < len; len = half, half >>= 1) { if (ssCompare(PA + getIDX(SA[middle + m + half]), PA - + getIDX(SA[middle - m - half - 1]), depth) < 0) - { + + getIDX(SA[middle - m - half - 1]), depth) < 0) { m += half + 1; half -= (len & 1) ^ 1; } } - if (0 < m) - { + if (0 < m) { lm = middle - m; rm = middle + m; ssBlockSwap(lm, middle, m); l = r = middle; next = 0; - if (rm < last) - { - if (SA[rm] < 0) - { + if (rm < last) { + if (SA[rm] < 0) { SA[rm] = ~SA[rm]; - if (first < lm) - { - for (; SA[--l] < 0;) - { + if (first < lm) { + for (; SA[--l] < 0; ) { } next |= 4; } next |= 1; - } - else if (first < lm) - { - for (; SA[r] < 0; ++r) - { + } else if (first < lm) { + for (; SA[r] < 0; ++r) { } next |= 2; } } - if ((l - first) <= (last - r)) - { + if ((l - first) <= (last - r)) { stack[ssize++] = new StackElement(r, rm, last, (next & 3) | (check & 4)); middle = lm; last = l; check = (check & 3) | (next & 4); - } - else - { - if (((next & 2) != 0) && (r == middle)) - { + } else { + if (((next & 2) != 0) && (r == middle)) { next ^= 6; } stack[ssize++] = new StackElement(first, lm, l, (check & 3) @@ -855,36 +722,28 @@ else if (first < lm) middle = rm; check = (next & 3) | (check & 4); } - } - else - { - if (ssCompare(PA + getIDX(SA[middle - 1]), PA + SA[middle], depth) == 0) - { + } else { + if (ssCompare(PA + getIDX(SA[middle - 1]), PA + SA[middle], depth) == 0) { SA[middle] = ~SA[middle]; } if (((check & 1) != 0) || (((check & 2) != 0) && (ssCompare(PA + getIDX(SA[first - 1]), PA - + SA[first], depth) == 0))) - { + + SA[first], depth) == 0))) { SA[first] = ~SA[first]; } if (((check & 4) != 0) - && ((ssCompare(PA + getIDX(SA[last - 1]), PA + SA[last], depth) == 0))) - { + && ((ssCompare(PA + getIDX(SA[last - 1]), PA + SA[last], depth) == 0))) { SA[last] = 
~SA[last]; } - if (ssize > 0) - { + if (ssize > 0) { StackElement se = stack[--ssize]; first = se.a; middle = se.b; last = se.c; check = se.d; - } - else - { + } else { return; } @@ -898,8 +757,7 @@ else if (first < lm) * Merge-forward with internal buffer. */ private final void ssMergeForward(int PA, int first, int middle, int last, int buf, - int depth) - { + int depth) { // PA, first, middle, last, buf are pointers to SA int a, b, c, bufend;// pointers to SA int t, r; @@ -907,33 +765,24 @@ private final void ssMergeForward(int PA, int first, int middle, int last, int b bufend = buf + (middle - first) - 1; ssBlockSwap(buf, first, middle - first); - for (t = SA[a = first], b = buf, c = middle;;) - { + for (t = SA[a = first], b = buf, c = middle; ; ) { r = ssCompare(PA + SA[b], PA + SA[c], depth); - if (r < 0) - { - do - { + if (r < 0) { + do { SA[a++] = SA[b]; - if (bufend <= b) - { + if (bufend <= b) { SA[bufend] = t; return; } SA[b++] = SA[a]; } while (SA[b] < 0); - } - else if (r > 0) - { - do - { + } else if (r > 0) { + do { SA[a++] = SA[c]; SA[c++] = SA[a]; - if (last <= c) - { - while (b < bufend) - { + if (last <= c) { + while (b < bufend) { SA[a++] = SA[b]; SA[b++] = SA[a]; } @@ -943,15 +792,11 @@ else if (r > 0) } } while (SA[c] < 0); - } - else - { + } else { SA[c] = ~SA[c]; - do - { + do { SA[a++] = SA[b]; - if (bufend <= b) - { + if (bufend <= b) { SA[bufend] = t; return; } @@ -959,14 +804,11 @@ else if (r > 0) } while (SA[b] < 0); - do - { + do { SA[a++] = SA[c]; SA[c++] = SA[a]; - if (last <= c) - { - while (b < bufend) - { + if (last <= c) { + while (b < bufend) { SA[a++] = SA[b]; SA[b++] = SA[a]; } @@ -985,8 +827,7 @@ else if (r > 0) * Merge-backward with internal buffer. 
*/ private final void ssMergeBackward(int PA, int first, int middle, int last, int buf, - int depth) - { + int depth) { // PA, first, middle, last, buf are pointers in SA int p1, p2;// pointers in SA int a, b, c, bufend;// pointers in SA @@ -996,33 +837,23 @@ private final void ssMergeBackward(int PA, int first, int middle, int last, int ssBlockSwap(buf, middle, last - middle); x = 0; - if (SA[bufend] < 0) - { + if (SA[bufend] < 0) { p1 = PA + ~SA[bufend]; x |= 1; - } - else - { + } else { p1 = PA + SA[bufend]; } - if (SA[middle - 1] < 0) - { + if (SA[middle - 1] < 0) { p2 = PA + ~SA[middle - 1]; x |= 2; - } - else - { + } else { p2 = PA + SA[middle - 1]; } - for (t = SA[a = last - 1], b = bufend, c = middle - 1;;) - { + for (t = SA[a = last - 1], b = bufend, c = middle - 1; ; ) { r = ssCompare(p1, p2, depth); - if (0 < r) - { - if ((x & 1) != 0) - { - do - { + if (0 < r) { + if ((x & 1) != 0) { + do { SA[a--] = SA[b]; SA[b--] = SA[a]; } @@ -1030,28 +861,20 @@ private final void ssMergeBackward(int PA, int first, int middle, int last, int x ^= 1; } SA[a--] = SA[b]; - if (b <= buf) - { + if (b <= buf) { SA[buf] = t; break; } SA[b--] = SA[a]; - if (SA[b] < 0) - { + if (SA[b] < 0) { p1 = PA + ~SA[b]; x |= 1; - } - else - { + } else { p1 = PA + SA[b]; } - } - else if (r < 0) - { - if ((x & 2) != 0) - { - do - { + } else if (r < 0) { + if ((x & 2) != 0) { + do { SA[a--] = SA[c]; SA[c--] = SA[a]; } @@ -1060,10 +883,8 @@ else if (r < 0) } SA[a--] = SA[c]; SA[c--] = SA[a]; - if (c < first) - { - while (buf < b) - { + if (c < first) { + while (buf < b) { SA[a--] = SA[b]; SA[b--] = SA[a]; } @@ -1071,22 +892,15 @@ else if (r < 0) SA[b] = t; break; } - if (SA[c] < 0) - { + if (SA[c] < 0) { p2 = PA + ~SA[c]; x |= 2; - } - else - { + } else { p2 = PA + SA[c]; } - } - else - { - if ((x & 1) != 0) - { - do - { + } else { + if ((x & 1) != 0) { + do { SA[a--] = SA[b]; SA[b--] = SA[a]; } @@ -1094,16 +908,13 @@ else if (r < 0) x ^= 1; } SA[a--] = ~SA[b]; - if (b <= buf) - { + if (b <= 
buf) { SA[buf] = t; break; } SA[b--] = SA[a]; - if ((x & 2) != 0) - { - do - { + if ((x & 2) != 0) { + do { SA[a--] = SA[c]; SA[c--] = SA[a]; } @@ -1112,10 +923,8 @@ else if (r < 0) } SA[a--] = SA[c]; SA[c--] = SA[a]; - if (c < first) - { - while (buf < b) - { + if (c < first) { + while (buf < b) { SA[a--] = SA[b]; SA[b--] = SA[a]; } @@ -1123,22 +932,16 @@ else if (r < 0) SA[b] = t; break; } - if (SA[b] < 0) - { + if (SA[b] < 0) { p1 = PA + ~SA[b]; x |= 1; - } - else - { + } else { p1 = PA + SA[b]; } - if (SA[c] < 0) - { + if (SA[c] < 0) { p2 = PA + ~SA[c]; x |= 2; - } - else - { + } else { p2 = PA + SA[c]; } } @@ -1148,28 +951,22 @@ else if (r < 0) /** * Insertionsort for small size groups */ - private final void ssInsertionSort(int PA, int first, int last, int depth) - { + private final void ssInsertionSort(int PA, int first, int last, int depth) { // PA, first, last are pointers in SA int i, j;// pointers in SA int t, r; - for (i = last - 2; first <= i; --i) - { - for (t = SA[i], j = i + 1; 0 < (r = ssCompare(PA + t, PA + SA[j], depth));) - { - do - { + for (i = last - 2; first <= i; --i) { + for (t = SA[i], j = i + 1; 0 < (r = ssCompare(PA + t, PA + SA[j], depth)); ) { + do { SA[j - 1] = SA[j]; } while ((++j < last) && (SA[j] < 0)); - if (last <= j) - { + if (last <= j) { break; } } - if (r == 0) - { + if (r == 0) { SA[j] = ~SA[j]; } SA[j - 1] = t; @@ -1178,36 +975,28 @@ private final void ssInsertionSort(int PA, int first, int last, int depth) } /** - * + * */ - private final static int ssIsqrt(int x) - { + private final static int ssIsqrt(int x) { int y, e; - if (x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) - { + if (x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) { return SS_BLOCKSIZE; } e = ((x & 0xffff0000) != 0) ? (((x & 0xff000000) != 0) ? 24 + lg_table[(x >> 24) & 0xff] : 16 + lg_table[(x >> 16) & 0xff]) : (((x & 0x0000ff00) != 0) ? 
8 + lg_table[(x >> 8) & 0xff] - : 0 + lg_table[(x >> 0) & 0xff]); + : 0 + lg_table[(x >> 0) & 0xff]); - if (e >= 16) - { + if (e >= 16) { y = sqq_table[x >> ((e - 6) - (e & 1))] << ((e >> 1) - 7); - if (e >= 24) - { + if (e >= 24) { y = (y + 1 + x / y) >> 1; } y = (y + 1 + x / y) >> 1; - } - else if (e >= 8) - { + } else if (e >= 8) { y = (sqq_table[x >> ((e - 6) - (e & 1))] >> (7 - (e >> 1))) + 1; - } - else - { + } else { return sqq_table[x] >> 4; } @@ -1215,35 +1004,28 @@ else if (e >= 8) } /* Multikey introsort for medium size groups. */ - private final void ssMintroSort(int PA, int first, int last, int depth) - { + private final void ssMintroSort(int PA, int first, int last, int depth) { final int STACK_SIZE = SS_MISORT_STACKSIZE; - StackElement [] stack = new StackElement [STACK_SIZE]; + StackElement[] stack = new StackElement[STACK_SIZE]; int Td;// T ptr int a, b, c, d, e, f;// SA ptr int s, t; int ssize; int limit; int v, x = 0; - for (ssize = 0, limit = ssIlg(last - first);;) - { + for (ssize = 0, limit = ssIlg(last - first); ; ) { - if ((last - first) <= SS_INSERTIONSORT_THRESHOLD) - { - if (1 < (last - first)) - { + if ((last - first) <= SS_INSERTIONSORT_THRESHOLD) { + if (1 < (last - first)) { ssInsertionSort(PA, first, last, depth); } - if (ssize > 0) - { + if (ssize > 0) { StackElement se = stack[--ssize]; first = se.a; last = se.b; depth = se.c; limit = se.d; - } - else - { + } else { return; } @@ -1251,19 +1033,14 @@ private final void ssMintroSort(int PA, int first, int last, int depth) } Td = depth; - if (limit-- == 0) - { + if (limit-- == 0) { ssHeapSort(Td, PA, first, last - first); } - if (limit < 0) - { - for (a = first + 1, v = T[start + Td + SA[PA + SA[first]]]; a < last; ++a) - { - if ((x = T[start + Td + SA[PA + SA[a]]]) != v) - { - if (1 < (a - first)) - { + if (limit < 0) { + for (a = first + 1, v = T[start + Td + SA[PA + SA[first]]]; a < last; ++a) { + if ((x = T[start + Td + SA[PA + SA[a]]]) != v) { + if (1 < (a - first)) { break; } v 
= x; @@ -1271,36 +1048,26 @@ private final void ssMintroSort(int PA, int first, int last, int depth) } } - if (T[start + Td + SA[PA + SA[first]] - 1] < v) - { + if (T[start + Td + SA[PA + SA[first]] - 1] < v) { first = ssPartition(PA, first, a, depth); } - if ((a - first) <= (last - a)) - { - if (1 < (a - first)) - { + if ((a - first) <= (last - a)) { + if (1 < (a - first)) { stack[ssize++] = new StackElement(a, last, depth, -1); last = a; depth += 1; limit = ssIlg(a - first); - } - else - { + } else { first = a; limit = -1; } - } - else - { - if (1 < (last - a)) - { + } else { + if (1 < (last - a)) { stack[ssize++] = new StackElement(first, a, depth + 1, ssIlg(a - first)); first = a; limit = -1; - } - else - { + } else { last = a; depth += 1; limit = ssIlg(a - first); @@ -1315,75 +1082,57 @@ private final void ssMintroSort(int PA, int first, int last, int depth) swapInSA(first, a); // partition - for (b = first; (++b < last) && ((x = T[start + Td + SA[PA + SA[b]]]) == v);) - { - } - if (((a = b) < last) && (x < v)) - { - for (; (++b < last) && ((x = T[start + Td + SA[PA + SA[b]]]) <= v);) - { - if (x == v) - { + for (b = first; (++b < last) && ((x = T[start + Td + SA[PA + SA[b]]]) == v); ) { + } + if (((a = b) < last) && (x < v)) { + for (; (++b < last) && ((x = T[start + Td + SA[PA + SA[b]]]) <= v); ) { + if (x == v) { swapInSA(b, a); ++a; } } } - for (c = last; (b < --c) && ((x = T[start + Td + SA[PA + SA[c]]]) == v);) - { + for (c = last; (b < --c) && ((x = T[start + Td + SA[PA + SA[c]]]) == v); ) { } - if ((b < (d = c)) && (x > v)) - { - for (; (b < --c) && ((x = T[start + Td + SA[PA + SA[c]]]) >= v);) - { - if (x == v) - { + if ((b < (d = c)) && (x > v)) { + for (; (b < --c) && ((x = T[start + Td + SA[PA + SA[c]]]) >= v); ) { + if (x == v) { swapInSA(c, d); --d; } } } - for (; b < c;) - { + for (; b < c; ) { swapInSA(b, c); - for (; (++b < c) && ((x = T[start + Td + SA[PA + SA[b]]]) <= v);) - { - if (x == v) - { + for (; (++b < c) && ((x = T[start + Td + 
SA[PA + SA[b]]]) <= v); ) { + if (x == v) { swapInSA(b, a); ++a; } } - for (; (b < --c) && ((x = T[start + Td + SA[PA + SA[c]]]) >= v);) - { - if (x == v) - { + for (; (b < --c) && ((x = T[start + Td + SA[PA + SA[c]]]) >= v); ) { + if (x == v) { swapInSA(c, d); --d; } } } - if (a <= d) - { + if (a <= d) { c = b - 1; - if ((s = a - first) > (t = b - a)) - { + if ((s = a - first) > (t = b - a)) { s = t; } - for (e = first, f = b - s; 0 < s; --s, ++e, ++f) - { + for (e = first, f = b - s; 0 < s; --s, ++e, ++f) { swapInSA(e, f); } - if ((s = d - c) > (t = last - d - 1)) - { + if ((s = d - c) > (t = last - d - 1)) { s = t; } - for (e = b, f = last - s; 0 < s; --s, ++e, ++f) - { + for (e = b, f = last - s; 0 < s; --s, ++e, ++f) { swapInSA(e, f); } @@ -1392,22 +1141,16 @@ private final void ssMintroSort(int PA, int first, int last, int depth) b = (v <= T[start + Td + SA[PA + SA[a]] - 1]) ? a : ssPartition(PA, a, c, depth); - if ((a - first) <= (last - c)) - { - if ((last - c) <= (c - b)) - { + if ((a - first) <= (last - c)) { + if ((last - c) <= (c - b)) { stack[ssize++] = new StackElement(b, c, depth + 1, ssIlg(c - b)); stack[ssize++] = new StackElement(c, last, depth, limit); last = a; - } - else if ((a - first) <= (c - b)) - { + } else if ((a - first) <= (c - b)) { stack[ssize++] = new StackElement(c, last, depth, limit); stack[ssize++] = new StackElement(b, c, depth + 1, ssIlg(c - b)); last = a; - } - else - { + } else { stack[ssize++] = new StackElement(c, last, depth, limit); stack[ssize++] = new StackElement(first, a, depth, limit); first = b; @@ -1415,23 +1158,16 @@ else if ((a - first) <= (c - b)) depth += 1; limit = ssIlg(c - b); } - } - else - { - if ((a - first) <= (c - b)) - { + } else { + if ((a - first) <= (c - b)) { stack[ssize++] = new StackElement(b, c, depth + 1, ssIlg(c - b)); stack[ssize++] = new StackElement(first, a, depth, limit); first = c; - } - else if ((last - c) <= (c - b)) - { + } else if ((last - c) <= (c - b)) { stack[ssize++] = new 
StackElement(first, a, depth, limit); stack[ssize++] = new StackElement(b, c, depth + 1, ssIlg(c - b)); first = c; - } - else - { + } else { stack[ssize++] = new StackElement(first, a, depth, limit); stack[ssize++] = new StackElement(c, last, depth, limit); first = b; @@ -1441,12 +1177,9 @@ else if ((last - c) <= (c - b)) } } - } - else - { + } else { limit += 1; - if (T[start + Td + SA[PA + SA[first]] - 1] < v) - { + if (T[start + Td + SA[PA + SA[first]] - 1] < v) { first = ssPartition(PA, first, last, depth); limit = ssIlg(last - first); } @@ -1460,20 +1193,15 @@ else if ((last - c) <= (c - b)) /** * Returns the pivot element. */ - private final int ssPivot(int Td, int PA, int first, int last) - { + private final int ssPivot(int Td, int PA, int first, int last) { int middle;// SA pointer int t = last - first; middle = first + t / 2; - if (t <= 512) - { - if (t <= 32) - { + if (t <= 512) { + if (t <= 32) { return ssMedian3(Td, PA, first, middle, last - 1); - } - else - { + } else { t >>= 2; return ssMedian5(Td, PA, first, first + t, middle, last - 1 - t, last - 1); } @@ -1488,24 +1216,20 @@ private final int ssPivot(int Td, int PA, int first, int last) /** * Returns the median of five elements */ - private final int ssMedian5(int Td, int PA, int v1, int v2, int v3, int v4, int v5) - { + private final int ssMedian5(int Td, int PA, int v1, int v2, int v3, int v4, int v5) { int t; - if (T[start + Td + SA[PA + SA[v2]]] > T[start + Td + SA[PA + SA[v3]]]) - { + if (T[start + Td + SA[PA + SA[v2]]] > T[start + Td + SA[PA + SA[v3]]]) { t = v2; v2 = v3; v3 = t; } - if (T[start + Td + SA[PA + SA[v4]]] > T[start + Td + SA[PA + SA[v5]]]) - { + if (T[start + Td + SA[PA + SA[v4]]] > T[start + Td + SA[PA + SA[v5]]]) { t = v4; v4 = v5; v5 = t; } - if (T[start + Td + SA[PA + SA[v2]]] > T[start + Td + SA[PA + SA[v4]]]) - { + if (T[start + Td + SA[PA + SA[v2]]] > T[start + Td + SA[PA + SA[v4]]]) { t = v2; v2 = v4; v4 = t; @@ -1513,14 +1237,12 @@ private final int ssMedian5(int Td, 
int PA, int v1, int v2, int v3, int v4, int v3 = v5; v5 = t; } - if (T[start + Td + SA[PA + SA[v1]]] > T[start + Td + SA[PA + SA[v3]]]) - { + if (T[start + Td + SA[PA + SA[v1]]] > T[start + Td + SA[PA + SA[v3]]]) { t = v1; v1 = v3; v3 = t; } - if (T[start + Td + SA[PA + SA[v1]]] > T[start + Td + SA[PA + SA[v4]]]) - { + if (T[start + Td + SA[PA + SA[v1]]] > T[start + Td + SA[PA + SA[v4]]]) { t = v1; v1 = v4; v4 = t; @@ -1528,8 +1250,7 @@ private final int ssMedian5(int Td, int PA, int v1, int v2, int v3, int v4, int v3 = v5; v5 = t; } - if (T[start + Td + SA[PA + SA[v3]]] > T[start + Td + SA[PA + SA[v4]]]) - { + if (T[start + Td + SA[PA + SA[v3]]] > T[start + Td + SA[PA + SA[v4]]]) { return v4; } return v3; @@ -1538,22 +1259,16 @@ private final int ssMedian5(int Td, int PA, int v1, int v2, int v3, int v4, int /** * Returns the median of three elements. */ - private final int ssMedian3(int Td, int PA, int v1, int v2, int v3) - { - if (T[start + Td + SA[PA + SA[v1]]] > T[start + Td + SA[PA + SA[v2]]]) - { + private final int ssMedian3(int Td, int PA, int v1, int v2, int v3) { + if (T[start + Td + SA[PA + SA[v1]]] > T[start + Td + SA[PA + SA[v2]]]) { int t = v1; v1 = v2; v2 = t; } - if (T[start + Td + SA[PA + SA[v2]]] > T[start + Td + SA[PA + SA[v3]]]) - { - if (T[start + Td + SA[PA + SA[v1]]] > T[start + Td + SA[PA + SA[v3]]]) - { + if (T[start + Td + SA[PA + SA[v2]]] > T[start + Td + SA[PA + SA[v3]]]) { + if (T[start + Td + SA[PA + SA[v1]]] > T[start + Td + SA[PA + SA[v3]]]) { return v1; - } - else - { + } else { return v3; } } @@ -1563,29 +1278,23 @@ private final int ssMedian3(int Td, int PA, int v1, int v2, int v3) /** * Binary partition for substrings. 
*/ - private final int ssPartition(int PA, int first, int last, int depth) - { + private final int ssPartition(int PA, int first, int last, int depth) { int a, b;// SA pointer int t; - for (a = first - 1, b = last;;) - { - for (; (++a < b) && ((SA[PA + SA[a]] + depth) >= (SA[PA + SA[a] + 1] + 1));) - { + for (a = first - 1, b = last; ; ) { + for (; (++a < b) && ((SA[PA + SA[a]] + depth) >= (SA[PA + SA[a] + 1] + 1)); ) { SA[a] = ~SA[a]; } - for (; (a < --b) && ((SA[PA + SA[b]] + depth) < (SA[PA + SA[b] + 1] + 1));) - { + for (; (a < --b) && ((SA[PA + SA[b]] + depth) < (SA[PA + SA[b] + 1] + 1)); ) { } - if (b <= a) - { + if (b <= a) { break; } t = ~SA[b]; SA[b] = SA[a]; SA[a] = t; } - if (first < a) - { + if (first < a) { SA[first] = ~SA[first]; } return a; @@ -1594,32 +1303,26 @@ private final int ssPartition(int PA, int first, int last, int depth) /** * Simple top-down heapsort. */ - private final void ssHeapSort(int Td, int PA, int sa, int size) - { + private final void ssHeapSort(int Td, int PA, int sa, int size) { int i, m, t; m = size; - if ((size % 2) == 0) - { + if ((size % 2) == 0) { m--; if (T[start + Td + SA[PA + SA[sa + (m / 2)]]] < T[start + Td - + SA[PA + SA[sa + m]]]) - { + + SA[PA + SA[sa + m]]]) { swapInSA(sa + m, sa + (m / 2)); } } - for (i = m / 2 - 1; 0 <= i; --i) - { + for (i = m / 2 - 1; 0 <= i; --i) { ssFixDown(Td, PA, sa, i, m); } - if ((size % 2) == 0) - { + if ((size % 2) == 0) { swapInSA(sa, sa + m); ssFixDown(Td, PA, sa, 0, m); } - for (i = m - 1; 0 < i; --i) - { + for (i = m - 1; 0 < i; --i) { t = SA[sa]; SA[sa] = SA[sa + i]; ssFixDown(Td, PA, sa, 0, i); @@ -1629,25 +1332,21 @@ private final void ssHeapSort(int Td, int PA, int sa, int size) } /** - * + * */ - private final void ssFixDown(int Td, int PA, int sa, int i, int size) - { + private final void ssFixDown(int Td, int PA, int sa, int i, int size) { int j, k; int v; int c, d, e; for (v = SA[sa + i], c = T[start + Td + SA[PA + v]]; (j = 2 * i + 1) < size; SA[sa - + i] = SA[sa + k], i 
= k) - { + + i] = SA[sa + k], i = k) { d = T[start + Td + SA[PA + SA[sa + (k = j++)]]]; - if (d < (e = T[start + Td + SA[PA + SA[sa + j]]])) - { + if (d < (e = T[start + Td + SA[PA + SA[sa + j]]])) { k = j; d = e; } - if (d <= c) - { + if (d <= c) { break; } } @@ -1658,18 +1357,16 @@ private final void ssFixDown(int Td, int PA, int sa, int i, int size) /** * */ - private final static int ssIlg(int n) - { + private final static int ssIlg(int n) { return ((n & 0xff00) != 0) ? 8 + lg_table[(n >> 8) & 0xff] : 0 + lg_table[(n >> 0) & 0xff]; } /** - * + * */ - private final void swapInSA(int a, int b) - { + private final void swapInSA(int a, int b) { int tmp = SA[a]; SA[a] = SA[b]; SA[b] = tmp; @@ -1678,59 +1375,44 @@ private final void swapInSA(int a, int b) /** * Tandem repeat sort */ - private final void trSort(int ISA, int n, int depth) - { + private final void trSort(int ISA, int n, int depth) { TRBudget budget = new TRBudget(trIlg(n) * 2 / 3, n); int ISAd; int first, last;// SA pointers int t, skip, unsorted; - for (ISAd = ISA + depth; -n < SA[0]; ISAd += ISAd - ISA) - { + for (ISAd = ISA + depth; -n < SA[0]; ISAd += ISAd - ISA) { first = 0; skip = 0; unsorted = 0; - do - { - if ((t = SA[first]) < 0) - { + do { + if ((t = SA[first]) < 0) { first -= t; skip += t; - } - else - { - if (skip != 0) - { + } else { + if (skip != 0) { SA[first + skip] = skip; skip = 0; } last = SA[ISA + t] + 1; - if (1 < (last - first)) - { + if (1 < (last - first)) { budget.count = 0; trIntroSort(ISA, ISAd, first, last, budget); - if (budget.count != 0) - { + if (budget.count != 0) { unsorted += budget.count; - } - else - { + } else { skip = first - last; } - } - else if ((last - first) == 1) - { + } else if ((last - first) == 1) { skip = -1; } first = last; } } while (first < n); - if (skip != 0) - { + if (skip != 0) { SA[first + skip] = skip; } - if (unsorted == 0) - { + if (unsorted == 0) { break; } } @@ -1740,77 +1422,58 @@ else if ((last - first) == 1) * */ private final 
TRPartitionResult trPartition(int ISAd, int first, int middle, - int last, int pa, int pb, int v) - { + int last, int pa, int pb, int v) { int a, b, c, d, e, f;// ptr int t, s, x = 0; - for (b = middle - 1; (++b < last) && ((x = SA[ISAd + SA[b]]) == v);) - { + for (b = middle - 1; (++b < last) && ((x = SA[ISAd + SA[b]]) == v); ) { } - if (((a = b) < last) && (x < v)) - { - for (; (++b < last) && ((x = SA[ISAd + SA[b]]) <= v);) - { - if (x == v) - { + if (((a = b) < last) && (x < v)) { + for (; (++b < last) && ((x = SA[ISAd + SA[b]]) <= v); ) { + if (x == v) { swapInSA(a, b); ++a; } } } - for (c = last; (b < --c) && ((x = SA[ISAd + SA[c]]) == v);) - { + for (c = last; (b < --c) && ((x = SA[ISAd + SA[c]]) == v); ) { } - if ((b < (d = c)) && (x > v)) - { - for (; (b < --c) && ((x = SA[ISAd + SA[c]]) >= v);) - { - if (x == v) - { + if ((b < (d = c)) && (x > v)) { + for (; (b < --c) && ((x = SA[ISAd + SA[c]]) >= v); ) { + if (x == v) { swapInSA(c, d); --d; } } } - for (; b < c;) - { + for (; b < c; ) { swapInSA(c, b); - for (; (++b < c) && ((x = SA[ISAd + SA[b]]) <= v);) - { - if (x == v) - { + for (; (++b < c) && ((x = SA[ISAd + SA[b]]) <= v); ) { + if (x == v) { swapInSA(a, b); ++a; } } - for (; (b < --c) && ((x = SA[ISAd + SA[c]]) >= v);) - { - if (x == v) - { + for (; (b < --c) && ((x = SA[ISAd + SA[c]]) >= v); ) { + if (x == v) { swapInSA(c, d); --d; } } } - if (a <= d) - { + if (a <= d) { c = b - 1; - if ((s = a - first) > (t = b - a)) - { + if ((s = a - first) > (t = b - a)) { s = t; } - for (e = first, f = b - s; 0 < s; --s, ++e, ++f) - { + for (e = first, f = b - s; 0 < s; --s, ++e, ++f) { swapInSA(e, f); } - if ((s = d - c) > (t = last - d - 1)) - { + if ((s = d - c) > (t = last - d - 1)) { s = t; } - for (e = b, f = last - s; 0 < s; --s, ++e, ++f) - { + for (e = b, f = last - s; 0 < s; --s, ++e, ++f) { swapInSA(e, f); } first += (b - a); @@ -1819,247 +1482,181 @@ private final TRPartitionResult trPartition(int ISAd, int first, int middle, return new 
TRPartitionResult(first, last); } - private final void trIntroSort(int ISA, int ISAd, int first, int last, TRBudget budget) - { + private final void trIntroSort(int ISA, int ISAd, int first, int last, TRBudget budget) { final int STACK_SIZE = TR_STACKSIZE; - StackElement [] stack = new StackElement [STACK_SIZE]; + StackElement[] stack = new StackElement[STACK_SIZE]; int a = 0, b = 0, c;// pointers int v, x = 0; int incr = ISAd - ISA; int limit, next; int ssize, trlink = -1; - for (ssize = 0, limit = trIlg(last - first);;) - { - if (limit < 0) - { - if (limit == -1) - { + for (ssize = 0, limit = trIlg(last - first); ; ) { + if (limit < 0) { + if (limit == -1) { /* tandem repeat partition */ TRPartitionResult res = trPartition(ISAd - incr, first, first, last, a, b, last - 1); a = res.a; b = res.b; /* update ranks */ - if (a < last) - { - for (c = first, v = a - 1; c < a; ++c) - { + if (a < last) { + for (c = first, v = a - 1; c < a; ++c) { SA[ISA + SA[c]] = v; } } - if (b < last) - { - for (c = a, v = b - 1; c < b; ++c) - { + if (b < last) { + for (c = a, v = b - 1; c < b; ++c) { SA[ISA + SA[c]] = v; } } /* push */ - if (1 < (b - a)) - { + if (1 < (b - a)) { stack[ssize++] = new StackElement(0, a, b, 0, 0); stack[ssize++] = new StackElement(ISAd - incr, first, last, -2, trlink); trlink = ssize - 2; } - if ((a - first) <= (last - b)) - { - if (1 < (a - first)) - { + if ((a - first) <= (last - b)) { + if (1 < (a - first)) { stack[ssize++] = new StackElement(ISAd, b, last, trIlg(last - b), trlink); last = a; limit = trIlg(a - first); - } - else if (1 < (last - b)) - { + } else if (1 < (last - b)) { first = b; limit = trIlg(last - b); - } - else - { - if (ssize > 0) - { + } else { + if (ssize > 0) { StackElement se = stack[--ssize]; ISAd = se.a; first = se.b; last = se.c; limit = se.d; trlink = se.e; - } - else - { + } else { return; } } - } - else - { - if (1 < (last - b)) - { + } else { + if (1 < (last - b)) { stack[ssize++] = new StackElement(ISAd, first, a, trIlg(a - 
first), trlink); first = b; limit = trIlg(last - b); - } - else if (1 < (a - first)) - { + } else if (1 < (a - first)) { last = a; limit = trIlg(a - first); - } - else - { - if (ssize > 0) - { + } else { + if (ssize > 0) { StackElement se = stack[--ssize]; ISAd = se.a; first = se.b; last = se.c; limit = se.d; trlink = se.e; - } - else - { + } else { return; } } } - } - else if (limit == -2) - { + } else if (limit == -2) { /* tandem repeat copy */ StackElement se = stack[--ssize]; a = se.b; b = se.c; - if (stack[ssize].d == 0) - { + if (stack[ssize].d == 0) { trCopy(ISA, first, a, b, last, ISAd - ISA); - } - else - { - if (0 <= trlink) - { + } else { + if (0 <= trlink) { stack[trlink].d = -1; } trPartialCopy(ISA, first, a, b, last, ISAd - ISA); } - if (ssize > 0) - { + if (ssize > 0) { se = stack[--ssize]; ISAd = se.a; first = se.b; last = se.c; limit = se.d; trlink = se.e; - } - else - { + } else { return; } - } - else - { + } else { /* sorted partition */ - if (0 <= SA[first]) - { + if (0 <= SA[first]) { a = first; - do - { + do { SA[ISA + SA[a]] = a; } while ((++a < last) && (0 <= SA[a])); first = a; } - if (first < last) - { + if (first < last) { a = first; - do - { + do { SA[a] = ~SA[a]; } while (SA[++a] < 0); next = (SA[ISA + SA[a]] != SA[ISAd + SA[a]]) ? 
trIlg(a - first + 1) : -1; - if (++a < last) - { - for (b = first, v = a - 1; b < a; ++b) - { + if (++a < last) { + for (b = first, v = a - 1; b < a; ++b) { SA[ISA + SA[b]] = v; } } /* push */ - if (budget.check(a - first) != 0) - { - if ((a - first) <= (last - a)) - { + if (budget.check(a - first) != 0) { + if ((a - first) <= (last - a)) { stack[ssize++] = new StackElement(ISAd, a, last, -3, trlink); ISAd += incr; last = a; limit = next; - } - else - { - if (1 < (last - a)) - { + } else { + if (1 < (last - a)) { stack[ssize++] = new StackElement(ISAd + incr, first, a, next, trlink); first = a; limit = -3; - } - else - { + } else { ISAd += incr; last = a; limit = next; } } - } - else - { - if (0 <= trlink) - { + } else { + if (0 <= trlink) { stack[trlink].d = -1; } - if (1 < (last - a)) - { + if (1 < (last - a)) { first = a; limit = -3; - } - else - { - if (ssize > 0) - { + } else { + if (ssize > 0) { StackElement se = stack[--ssize]; ISAd = se.a; first = se.b; last = se.c; limit = se.d; trlink = se.e; - } - else - { + } else { return; } } } - } - else - { - if (ssize > 0) - { + } else { + if (ssize > 0) { StackElement se = stack[--ssize]; ISAd = se.a; first = se.b; last = se.c; limit = se.d; trlink = se.e; - } - else - { + } else { return; } } @@ -2067,21 +1664,17 @@ else if (limit == -2) continue; } - if ((last - first) <= TR_INSERTIONSORT_THRESHOLD) - { + if ((last - first) <= TR_INSERTIONSORT_THRESHOLD) { trInsertionSort(ISAd, first, last); limit = -3; continue; } - if (limit-- == 0) - { + if (limit-- == 0) { trHeapSort(ISAd, first, last - first); - for (a = last - 1; first < a; a = b) - { + for (a = last - 1; first < a; a = b) { for (x = SA[ISAd + SA[a]], b = a - 1; (first <= b) - && (SA[ISAd + SA[b]] == x); --b) - { + && (SA[ISAd + SA[b]] == x); --b) { SA[b] = ~SA[b]; } } @@ -2098,64 +1691,47 @@ else if (limit == -2) a = res.a; b = res.b; - if ((last - first) != (b - a)) - { + if ((last - first) != (b - a)) { next = (SA[ISA + SA[a]] != v) ? 
trIlg(b - a) : -1; /* update ranks */ - for (c = first, v = a - 1; c < a; ++c) - { + for (c = first, v = a - 1; c < a; ++c) { SA[ISA + SA[c]] = v; } - if (b < last) - { - for (c = a, v = b - 1; c < b; ++c) - { + if (b < last) { + for (c = a, v = b - 1; c < b; ++c) { SA[ISA + SA[c]] = v; } } /* push */ - if ((1 < (b - a)) && ((budget.check(b - a) != 0))) - { - if ((a - first) <= (last - b)) - { - if ((last - b) <= (b - a)) - { - if (1 < (a - first)) - { + if ((1 < (b - a)) && ((budget.check(b - a) != 0))) { + if ((a - first) <= (last - b)) { + if ((last - b) <= (b - a)) { + if (1 < (a - first)) { stack[ssize++] = new StackElement(ISAd + incr, a, b, next, trlink); stack[ssize++] = new StackElement(ISAd, b, last, limit, trlink); last = a; - } - else if (1 < (last - b)) - { + } else if (1 < (last - b)) { stack[ssize++] = new StackElement(ISAd + incr, a, b, next, trlink); first = b; - } - else - { + } else { ISAd += incr; first = a; last = b; limit = next; } - } - else if ((a - first) <= (b - a)) - { - if (1 < (a - first)) - { + } else if ((a - first) <= (b - a)) { + if (1 < (a - first)) { stack[ssize++] = new StackElement(ISAd, b, last, limit, trlink); stack[ssize++] = new StackElement(ISAd + incr, a, b, next, trlink); last = a; - } - else - { + } else { stack[ssize++] = new StackElement(ISAd, b, last, limit, trlink); ISAd += incr; @@ -2163,9 +1739,7 @@ else if ((a - first) <= (b - a)) last = b; limit = next; } - } - else - { + } else { stack[ssize++] = new StackElement(ISAd, b, last, limit, trlink); stack[ssize++] = new StackElement(ISAd, first, a, limit, @@ -2175,45 +1749,32 @@ else if ((a - first) <= (b - a)) last = b; limit = next; } - } - else - { - if ((a - first) <= (b - a)) - { - if (1 < (last - b)) - { + } else { + if ((a - first) <= (b - a)) { + if (1 < (last - b)) { stack[ssize++] = new StackElement(ISAd + incr, a, b, next, trlink); stack[ssize++] = new StackElement(ISAd, first, a, limit, trlink); first = b; - } - else if (1 < (a - first)) - { + } else if (1 
< (a - first)) { stack[ssize++] = new StackElement(ISAd + incr, a, b, next, trlink); last = a; - } - else - { + } else { ISAd += incr; first = a; last = b; limit = next; } - } - else if ((last - b) <= (b - a)) - { - if (1 < (last - b)) - { + } else if ((last - b) <= (b - a)) { + if (1 < (last - b)) { stack[ssize++] = new StackElement(ISAd, first, a, limit, trlink); stack[ssize++] = new StackElement(ISAd + incr, a, b, next, trlink); first = b; - } - else - { + } else { stack[ssize++] = new StackElement(ISAd, first, a, limit, trlink); ISAd += incr; @@ -2221,9 +1782,7 @@ else if ((last - b) <= (b - a)) last = b; limit = next; } - } - else - { + } else { stack[ssize++] = new StackElement(ISAd, first, a, limit, trlink); stack[ssize++] = new StackElement(ISAd, b, last, limit, @@ -2234,97 +1793,66 @@ else if ((last - b) <= (b - a)) limit = next; } } - } - else - { - if ((1 < (b - a)) && (0 <= trlink)) - { + } else { + if ((1 < (b - a)) && (0 <= trlink)) { stack[trlink].d = -1; } - if ((a - first) <= (last - b)) - { - if (1 < (a - first)) - { + if ((a - first) <= (last - b)) { + if (1 < (a - first)) { stack[ssize++] = new StackElement(ISAd, b, last, limit, trlink); last = a; - } - else if (1 < (last - b)) - { + } else if (1 < (last - b)) { first = b; - } - else - { - if (ssize > 0) - { + } else { + if (ssize > 0) { StackElement se = stack[--ssize]; ISAd = se.a; first = se.b; last = se.c; limit = se.d; trlink = se.e; - } - else - { + } else { return; } } - } - else - { - if (1 < (last - b)) - { + } else { + if (1 < (last - b)) { stack[ssize++] = new StackElement(ISAd, first, a, limit, trlink); first = b; - } - else if (1 < (a - first)) - { + } else if (1 < (a - first)) { last = a; - } - else - { - if (ssize > 0) - { + } else { + if (ssize > 0) { StackElement se = stack[--ssize]; ISAd = se.a; first = se.b; last = se.c; limit = se.d; trlink = se.e; - } - else - { + } else { return; } } } } - } - else - { - if (budget.check(last - first) != 0) - { + } else { + if 
(budget.check(last - first) != 0) { limit = trIlg(last - first); ISAd += incr; - } - else - { - if (0 <= trlink) - { + } else { + if (0 <= trlink) { stack[trlink].d = -1; } - if (ssize > 0) - { + if (ssize > 0) { StackElement se = stack[--ssize]; ISAd = se.a; first = se.b; last = se.c; limit = se.d; trlink = se.e; - } - else - { + } else { return; } } @@ -2337,22 +1865,17 @@ else if (1 < (a - first)) /** * Returns the pivot element. */ - private final int trPivot(int ISAd, int first, int last) - { + private final int trPivot(int ISAd, int first, int last) { int middle; int t; t = last - first; middle = first + t / 2; - if (t <= 512) - { - if (t <= 32) - { + if (t <= 512) { + if (t <= 32) { return trMedian3(ISAd, first, middle, last - 1); - } - else - { + } else { t >>= 2; return trMedian5(ISAd, first, first + t, middle, last - 1 - t, last - 1); } @@ -2367,23 +1890,19 @@ private final int trPivot(int ISAd, int first, int last) /** * Returns the median of five elements. */ - private final int trMedian5(int ISAd, int v1, int v2, int v3, int v4, int v5) - { + private final int trMedian5(int ISAd, int v1, int v2, int v3, int v4, int v5) { int t; - if (SA[ISAd + SA[v2]] > SA[ISAd + SA[v3]]) - { + if (SA[ISAd + SA[v2]] > SA[ISAd + SA[v3]]) { t = v2; v2 = v3; v3 = t; } - if (SA[ISAd + SA[v4]] > SA[ISAd + SA[v5]]) - { + if (SA[ISAd + SA[v4]] > SA[ISAd + SA[v5]]) { t = v4; v4 = v5; v5 = t; } - if (SA[ISAd + SA[v2]] > SA[ISAd + SA[v4]]) - { + if (SA[ISAd + SA[v2]] > SA[ISAd + SA[v4]]) { t = v2; v2 = v4; v4 = t; @@ -2391,14 +1910,12 @@ private final int trMedian5(int ISAd, int v1, int v2, int v3, int v4, int v5) v3 = v5; v5 = t; } - if (SA[ISAd + SA[v1]] > SA[ISAd + SA[v3]]) - { + if (SA[ISAd + SA[v1]] > SA[ISAd + SA[v3]]) { t = v1; v1 = v3; v3 = t; } - if (SA[ISAd + SA[v1]] > SA[ISAd + SA[v4]]) - { + if (SA[ISAd + SA[v1]] > SA[ISAd + SA[v4]]) { t = v1; v1 = v4; v4 = t; @@ -2406,8 +1923,7 @@ private final int trMedian5(int ISAd, int v1, int v2, int v3, int v4, int v5) v3 = v5; 
v5 = t; } - if (SA[ISAd + SA[v3]] > SA[ISAd + SA[v4]]) - { + if (SA[ISAd + SA[v3]] > SA[ISAd + SA[v4]]) { return v4; } return v3; @@ -2416,22 +1932,16 @@ private final int trMedian5(int ISAd, int v1, int v2, int v3, int v4, int v5) /** * Returns the median of three elements. */ - private final int trMedian3(int ISAd, int v1, int v2, int v3) - { - if (SA[ISAd + SA[v1]] > SA[ISAd + SA[v2]]) - { + private final int trMedian3(int ISAd, int v1, int v2, int v3) { + if (SA[ISAd + SA[v1]] > SA[ISAd + SA[v2]]) { int t = v1; v1 = v2; v2 = t; } - if (SA[ISAd + SA[v2]] > SA[ISAd + SA[v3]]) - { - if (SA[ISAd + SA[v1]] > SA[ISAd + SA[v3]]) - { + if (SA[ISAd + SA[v2]] > SA[ISAd + SA[v3]]) { + if (SA[ISAd + SA[v1]] > SA[ISAd + SA[v3]]) { return v1; - } - else - { + } else { return v3; } } @@ -2439,33 +1949,27 @@ private final int trMedian3(int ISAd, int v1, int v2, int v3) } /** - * + * */ - private final void trHeapSort(int ISAd, int sa, int size) - { + private final void trHeapSort(int ISAd, int sa, int size) { int i, m, t; m = size; - if ((size % 2) == 0) - { + if ((size % 2) == 0) { m--; - if (SA[ISAd + SA[sa + m / 2]] < SA[ISAd + SA[sa + m]]) - { + if (SA[ISAd + SA[sa + m / 2]] < SA[ISAd + SA[sa + m]]) { swapInSA(sa + m, sa + m / 2); } } - for (i = m / 2 - 1; 0 <= i; --i) - { + for (i = m / 2 - 1; 0 <= i; --i) { trFixDown(ISAd, sa, i, m); } - if ((size % 2) == 0) - { + if ((size % 2) == 0) { swapInSA(sa, sa + m); trFixDown(ISAd, sa, 0, m); } - for (i = m - 1; 0 < i; --i) - { + for (i = m - 1; 0 < i; --i) { t = SA[sa]; SA[sa] = SA[sa + i]; trFixDown(ISAd, sa, 0, i); @@ -2475,25 +1979,21 @@ private final void trHeapSort(int ISAd, int sa, int size) } /** - * + * */ - private final void trFixDown(int ISAd, int sa, int i, int size) - { + private final void trFixDown(int ISAd, int sa, int i, int size) { int j, k; int v; int c, d, e; for (v = SA[sa + i], c = SA[ISAd + v]; (j = 2 * i + 1) < size; SA[sa + i] = SA[sa - + k], i = k) - { + + k], i = k) { d = SA[ISAd + SA[sa + (k = 
j++)]]; - if (d < (e = SA[ISAd + SA[sa + j]])) - { + if (d < (e = SA[ISAd + SA[sa + j]])) { k = j; d = e; } - if (d <= c) - { + if (d <= c) { break; } } @@ -2503,27 +2003,21 @@ private final void trFixDown(int ISAd, int sa, int i, int size) /** */ - private final void trInsertionSort(int ISAd, int first, int last) - { + private final void trInsertionSort(int ISAd, int first, int last) { int a, b;// SA ptr int t, r; - for (a = first + 1; a < last; ++a) - { - for (t = SA[a], b = a - 1; 0 > (r = SA[ISAd + t] - SA[ISAd + SA[b]]);) - { - do - { + for (a = first + 1; a < last; ++a) { + for (t = SA[a], b = a - 1; 0 > (r = SA[ISAd + t] - SA[ISAd + SA[b]]); ) { + do { SA[b + 1] = SA[b]; } while ((first <= --b) && (SA[b] < 0)); - if (b < first) - { + if (b < first) { break; } } - if (r == 0) - { + if (r == 0) { SA[b] = ~SA[b]; } SA[b + 1] = t; @@ -2533,22 +2027,18 @@ private final void trInsertionSort(int ISAd, int first, int last) /** */ - private final void trPartialCopy(int ISA, int first, int a, int b, int last, int depth) - { + private final void trPartialCopy(int ISA, int first, int a, int b, int last, int depth) { int c, d, e;// ptr int s, v; int rank, lastrank, newrank = -1; v = b - 1; lastrank = -1; - for (c = first, d = a - 1; c <= d; ++c) - { - if ((0 <= (s = SA[c] - depth)) && (SA[ISA + s] == v)) - { + for (c = first, d = a - 1; c <= d; ++c) { + if ((0 <= (s = SA[c] - depth)) && (SA[ISA + s] == v)) { SA[++d] = s; rank = SA[ISA + s + depth]; - if (lastrank != rank) - { + if (lastrank != rank) { lastrank = rank; newrank = d; } @@ -2557,29 +2047,23 @@ private final void trPartialCopy(int ISA, int first, int a, int b, int last, int } lastrank = -1; - for (e = d; first <= e; --e) - { + for (e = d; first <= e; --e) { rank = SA[ISA + SA[e]]; - if (lastrank != rank) - { + if (lastrank != rank) { lastrank = rank; newrank = e; } - if (newrank != rank) - { + if (newrank != rank) { SA[ISA + SA[e]] = newrank; } } lastrank = -1; - for (c = last - 1, e = d + 1, d = b; e < d; 
--c) - { - if ((0 <= (s = SA[c] - depth)) && (SA[ISA + s] == v)) - { + for (c = last - 1, e = d + 1, d = b; e < d; --c) { + if ((0 <= (s = SA[c] - depth)) && (SA[ISA + s] == v)) { SA[--d] = s; rank = SA[ISA + s + depth]; - if (lastrank != rank) - { + if (lastrank != rank) { lastrank = rank; newrank = d; } @@ -2593,26 +2077,21 @@ private final void trPartialCopy(int ISA, int first, int a, int b, int last, int * sort suffixes of middle partition by using sorted order of suffixes of left and * right partition. */ - private final void trCopy(int ISA, int first, int a, int b, int last, int depth) - { + private final void trCopy(int ISA, int first, int a, int b, int last, int depth) { int c, d, e;// ptr int s, v; v = b - 1; - for (c = first, d = a - 1; c <= d; ++c) - { + for (c = first, d = a - 1; c <= d; ++c) { s = SA[c] - depth; - if ((0 <= s) && (SA[ISA + s] == v)) - { + if ((0 <= s) && (SA[ISA + s] == v)) { SA[++d] = s; SA[ISA + s] = d; } } - for (c = last - 1, e = d + 1, d = b; e < d; --c) - { + for (c = last - 1, e = d + 1, d = b; e < d; --c) { s = SA[c] - depth; - if ((0 <= s) && (SA[ISA + s] == v)) - { + if ((0 <= s) && (SA[ISA + s] == v)) { SA[--d] = s; SA[ISA + s] = d; } @@ -2622,12 +2101,11 @@ private final void trCopy(int ISA, int first, int a, int b, int last, int depth) /** * */ - private final static int trIlg(int n) - { + private final static int trIlg(int n) { return ((n & 0xffff0000) != 0) ? (((n & 0xff000000) != 0) ? 24 + lg_table[(n >> 24) & 0xff] : 16 + lg_table[(n >> 16) & 0xff]) : (((n & 0x0000ff00) != 0) ? 
8 + lg_table[(n >> 8) & 0xff] - : 0 + lg_table[(n >> 0) & 0xff]); + : 0 + lg_table[(n >> 0) & 0xff]); } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/ExtraTrailingCellsDecorator.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/ExtraTrailingCellsDecorator.java index 51804d48a..cd190322e 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/ExtraTrailingCellsDecorator.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/ExtraTrailingCellsDecorator.java @@ -10,35 +10,31 @@ * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) */ -public final class ExtraTrailingCellsDecorator implements ISuffixArrayBuilder -{ +public final class ExtraTrailingCellsDecorator implements ISuffixArrayBuilder { private final ISuffixArrayBuilder delegate; private final int extraCells; /** * @see SuffixArrays#MAX_EXTRA_TRAILING_SPACE */ - public ExtraTrailingCellsDecorator(ISuffixArrayBuilder delegate, int extraCells) - { + public ExtraTrailingCellsDecorator(ISuffixArrayBuilder delegate, int extraCells) { this.delegate = delegate; this.extraCells = extraCells; } /* - * + * */ @Override - public int [] buildSuffixArray(int [] input, final int start, final int length) - { - if (start == 0 && start + length + extraCells < input.length) - { + public int[] buildSuffixArray(int[] input, final int start, final int length) { + if (start == 0 && start + length + extraCells < input.length) { return delegate.buildSuffixArray(input, start, length); } - final int [] shifted = new int [input.length + extraCells]; + final int[] shifted = new int[input.length + extraCells]; System.arraycopy(input, start, shifted, 0, length); - final int [] SA = delegate.buildSuffixArray(shifted, 0, length); + final int[] SA = delegate.buildSuffixArray(shifted, 0, length); return SA; } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/GenericArrayAdapter.java 
b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/GenericArrayAdapter.java index e709ed8a3..f19da62fd 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/GenericArrayAdapter.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/GenericArrayAdapter.java @@ -7,64 +7,64 @@ /** * An adapter for constructing suffix arrays on generic arrays. * - * @author Anton Olsson for friprogramvarusyndikatet.se + * @author Anton Olsson for friprogramvarusyndikatet.se */ class GenericArrayAdapter { - private final ISuffixArrayBuilder delegate; - int[] input; - TreeMap tokIDs; - private final Comparator comparator; + private final ISuffixArrayBuilder delegate; + int[] input; + TreeMap tokIDs; + private final Comparator comparator; - public GenericArrayAdapter(ISuffixArrayBuilder builder) { - // TODO make sure T is comparable - this.delegate = builder; - this.comparator = null; - } + public GenericArrayAdapter(ISuffixArrayBuilder builder) { + // TODO make sure T is comparable + this.delegate = builder; + this.comparator = null; + } - public GenericArrayAdapter(ISuffixArrayBuilder builder, Comparator comparator) { - // TODO make sure that comparator != null or T is comparable - this.delegate = builder; - this.comparator = comparator; - } + public GenericArrayAdapter(ISuffixArrayBuilder builder, Comparator comparator) { + // TODO make sure that comparator != null or T is comparable + this.delegate = builder; + this.comparator = comparator; + } - /** - * Construct a suffix array for a given generic token array. - */ - public int[] buildSuffixArray(T[] tokens) { - final int length = tokens.length; + /** + * Construct a suffix array for a given generic token array. + */ + public int[] buildSuffixArray(T[] tokens) { + final int length = tokens.length; /* * Allocate slightly more space, some suffix construction strategies need it and * we don't want to waste space for multiple symbol mappings. 
*/ - this.input = new int[length + SuffixArrays.MAX_EXTRA_TRAILING_SPACE]; + this.input = new int[length + SuffixArrays.MAX_EXTRA_TRAILING_SPACE]; - //System.out.println("Renaming tokens ..."); + //System.out.println("Renaming tokens ..."); /* * Here we create a mapping for the token to an integer id which we * can use in the suffax array construction algorithm. */ - this.tokIDs = new TreeMap(comparator); + this.tokIDs = new TreeMap(comparator); - // put and order all tokens in tokIDs - for (int i = 0; i < length; i++) { - tokIDs.put(tokens[i], null); // null is temporary placeholder value - } + // put and order all tokens in tokIDs + for (int i = 0; i < length; i++) { + tokIDs.put(tokens[i], null); // null is temporary placeholder value + } - // assign each token an ascending id - int _id = 1; - for (Entry entry : tokIDs.entrySet()) { - entry.setValue(_id++); - } + // assign each token an ascending id + int _id = 1; + for (Entry entry : tokIDs.entrySet()) { + entry.setValue(_id++); + } - // fill input array with ids - for (int i = 0; i < length; i++) { - input[i] = tokIDs.get(tokens[i]); - } + // fill input array with ids + for (int i = 0; i < length; i++) { + input[i] = tokIDs.get(tokens[i]); + } - //System.out.println("Renaming tokens done."); + //System.out.println("Renaming tokens done."); - return delegate.buildSuffixArray(input, 0, length); - } + return delegate.buildSuffixArray(input, 0, length); + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/ISuffixArrayBuilder.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/ISuffixArrayBuilder.java index 2d2898240..53117080e 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/ISuffixArrayBuilder.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/ISuffixArrayBuilder.java @@ -2,14 +2,12 @@ /** * An algorithm that can produce a suffix array for a sequence of integer symbols. 
- * - * @see #buildSuffixArray(int[], int, int) * * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) + * @see #buildSuffixArray(int[], int, int) */ -public interface ISuffixArrayBuilder -{ +public interface ISuffixArrayBuilder { /** * Computes suffix array for sequence of symbols (integers). The processed sequence is * a subsequence of input determined by start and @@ -20,19 +18,19 @@ public interface ISuffixArrayBuilder * after start + length to store special marker symbols. Also, some * algorithms may require non-negative symbols in the input. For such constrained * algorithms, use various decorators and adapters available in this package. - * - * @param input A sequence of input symbols, int-coded. - * @param start The starting index (inclusive) in input. + * + * @param input A sequence of input symbols, int-coded. + * @param start The starting index (inclusive) in input. * @param length Number of symbols to process. * @return An array of indices such that the suffix of input at index - * result[i] is lexicographically larger or equal to any other - * suffix that precede it. Note that the output array may be larger than - * input.length, in which case only the first - * input.length elements are of relevance. - *

    - * The returned array contains suffix indexes starting from 0 (so - * start needs to be added manually to access a given suffix in - * input). + * result[i] is lexicographically larger or equal to any other + * suffix that precede it. Note that the output array may be larger than + * input.length, in which case only the first + * input.length elements are of relevance. + *

    + * The returned array contains suffix indexes starting from 0 (so + * start needs to be added manually to access a given suffix in + * input). */ - int [] buildSuffixArray(int [] input, int start, int length); + int[] buildSuffixArray(int[] input, int start, int length); } \ No newline at end of file diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/ISymbolMapper.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/ISymbolMapper.java index a3205468e..3a167548b 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/ISymbolMapper.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/ISymbolMapper.java @@ -6,8 +6,8 @@ * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) */ -interface ISymbolMapper -{ - void map(int [] input, int start, int length); - void undo(int [] input, int start, int length); +interface ISymbolMapper { + void map(int[] input, int start, int length); + + void undo(int[] input, int start, int length); } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/MinMax.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/MinMax.java index bc40b434d..8168aaf0e 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/MinMax.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/MinMax.java @@ -2,25 +2,21 @@ /** * Holder for minimum and maximum. 
- * - * @see Tools#minmax(int[],int,int) * * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) + * @see Tools#minmax(int[], int, int) */ -final class MinMax -{ +final class MinMax { public final int min; public final int max; - - MinMax(int min, int max) - { + + MinMax(int min, int max) { this.min = min; this.max = max; } - public int range() - { + public int range() { return max - min; } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/QSufSort.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/QSufSort.java index 7f53ad8e3..fec1d252c 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/QSufSort.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/QSufSort.java @@ -3,10 +3,10 @@ /** *

    * Straightforward reimplementation of the qsufsort algorithm given in: - * + *

    *

      * <code>
    - * Larsson, N. Jesper and Sadakane, Kunihiko. Faster Suffix Sorting. 
    + * Larsson, N. Jesper and Sadakane, Kunihiko. Faster Suffix Sorting.
      * Report number LU-CS-TR:99-214, LUNDFD6/(NFCS-3140)/1--20/(1999). Department of Computer Science, Lund University"
      * </code>
      * 
    @@ -22,18 +22,25 @@ * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) */ -public class QSufSort implements ISuffixArrayBuilder -{ - /** group array, ultimately suffix array. */ +public class QSufSort implements ISuffixArrayBuilder { + /** + * group array, ultimately suffix array. + */ private int I[]; - /** inverse array, ultimately inverse of I. */ + /** + * inverse array, ultimately inverse of I. + */ private int V[]; - /** number of symbols aggregated by transform. */ + /** + * number of symbols aggregated by transform. + */ private int r; - /** length of already-sorted prefixes. */ + /** + * length of already-sorted prefixes. + */ private int h; /** @@ -48,8 +55,7 @@ public class QSufSort implements ISuffixArrayBuilder * Default constructor, uses the input array of symbols to preserve memory (and * destroys it). */ - public QSufSort() - { + public QSufSort() { this.preserveInput = true; } @@ -57,8 +63,7 @@ public QSufSort() * If true, the algorithm will use a copy of the input so it is left * intact. */ - public QSufSort(boolean preserveInput) - { + public QSufSort(boolean preserveInput) { this.preserveInput = preserveInput; } @@ -68,31 +73,27 @@ public QSufSort(boolean preserveInput) * Additional constraints enforced by qsufsort algorithm: *
      *
    • non-negative (≥0) symbols in the input
    • - *
    • length >= 2
    • + *
    • length >= 2
    • *
    *

    */ @Override - public final int [] buildSuffixArray(int [] input, int start, int length) - { + public final int[] buildSuffixArray(int[] input, int start, int length) { Tools.assertAlways(input.length >= start + length + 1, "no extra space after input end"); MinMax minmax = Tools.minmax(input, start, length); Tools.assertAlways(minmax.min >= 0, "input must not be negative"); - I = new int [length + 1]; + I = new int[length + 1]; this.start = start; - if (preserveInput) - { - V = new int [length + 1]; + if (preserveInput) { + V = new int[length + 1]; this.start = 0; System.arraycopy(input, start, V, 0, length); - } - else - { + } else { V = input; } suffixsort(length, minmax.max + 1, minmax.min); - final int [] tmp = I; + final int[] tmp = I; V = I = null; return tmp; } @@ -105,18 +106,14 @@ public QSufSort(boolean preserveInput) * Original contents of x[n] is disregarded, the n -th * symbol being regarded as end-of-string smaller than all other symbols. */ - private void suffixsort(int n, int k, int l) - { + private void suffixsort(int n, int k, int l) { int pi, pk; // I pointers int i, j, s, sl; - if (n >= k - l) - { /* if bucketing possible, */ + if (n >= k - l) { /* if bucketing possible, */ j = transform(n, k, l, n); bucketsort(n, j); /* bucketsort on first r positions. */ - } - else - { + } else { transform(n, k, l, Integer.MAX_VALUE); for (i = 0; i <= n; ++i) I[i] = i; /* initialize I with suffix numbers. */ @@ -124,21 +121,15 @@ private void suffixsort(int n, int k, int l) sort_split(0, n + 1); /* quicksort on first r positions. */ } h = r; /* number of symbols aggregated by transform. */ - while (I[0] >= -n) - { + while (I[0] >= -n) { pi = 0; /* pi is first position of group. */ sl = 0; /* sl is negated length of sorted groups. */ - do - { - if ((s = I[pi]) < 0) - { + do { + if ((s = I[pi]) < 0) { pi -= s; /* skip over sorted group. */ sl += s; /* add negated length to sl. 
*/ - } - else - { - if (sl != 0) - { + } else { + if (sl != 0) { I[pi + sl] = sl; /* combine sorted groups before pi. */ sl = 0; } @@ -149,15 +140,13 @@ private void suffixsort(int n, int k, int l) } while (pi <= n); if (sl != 0) /* if the array ends with a sorted group. */ - I[pi + sl] = sl; /* combine sorted groups at end of I. */ + I[pi + sl] = sl; /* combine sorted groups at end of I. */ h = 2 * h; /* double sorted-depth. */ } - for (i = 0; i <= n; ++i) - { + for (i = 0; i <= n; ++i) { /* reconstruct suffix array from inverse. */ - if (V[start + i] > 0) - { + if (V[start + i] > 0) { I[V[start + i] - 1] = i; } } @@ -171,13 +160,11 @@ private void suffixsort(int n, int k, int l) * "Engineering a Sort Function", Software -- Practice and Experience 23(11), * 1249-1265 (November 1993). This function is based on Program 7. */ - private void sort_split(int p, int n) - { + private void sort_split(int p, int n) { int pa, pb, pc, pd, pl, pm, pn;// pointers int f, v, s, t; - if (n < 7) - { /* multi-selection sort smallest arrays. */ + if (n < 7) { /* multi-selection sort smallest arrays. */ select_sort_split(p, n); return; } @@ -185,21 +172,16 @@ private void sort_split(int p, int n) v = choose_pivot(p, n); pa = pb = p; pc = pd = p + n - 1; - while (true) - { /* split-end partition. */ - while (pb <= pc && (f = KEY(pb)) <= v) - { - if (f == v) - { + while (true) { /* split-end partition. */ + while (pb <= pc && (f = KEY(pb)) <= v) { + if (f == v) { SWAP(pa, pb); ++pa; } ++pb; } - while (pc >= pb && (f = KEY(pc)) >= v) - { - if (f == v) - { + while (pc >= pb && (f = KEY(pc)) >= v) { + if (f == v) { SWAP(pc, pd); --pd; } @@ -230,8 +212,7 @@ private void sort_split(int p, int n) * {@link #sort_split(int, int)}. Sets group numbers for a group whose lowest position * in {@link #I} is pl and highest position is pm. */ - private void update_group(int pl, int pm) - { + private void update_group(int pl, int pm) { int g; g = pm; /* group number. 
*/ @@ -247,18 +228,15 @@ private void update_group(int pl, int pm) /** * Subroutine for {@link #sort_split(int, int)} , algorithm by Bentley & McIlroy. */ - private int choose_pivot(int p, int n) - { + private int choose_pivot(int p, int n) { int pl, pm, pn;// pointers int s; pm = p + (n >> 1); /* small arrays, middle element. */ - if (n > 7) - { + if (n > 7) { pl = p; pn = p + n - 1; - if (n > 40) - { /* big arrays, pseudomedian of 9. */ + if (n > 40) { /* big arrays, pseudomedian of 9. */ s = n >> 3; pl = MED3(pl, pl + s, pl + s + s); pm = MED3(pm - s, pm, pm + s); @@ -273,32 +251,26 @@ private int choose_pivot(int p, int n) * Quadratic sorting method to use for small subarrays. To be able to update group * numbers consistently, a variant of selection sorting is used. */ - private void select_sort_split(int p, int n) - { + private void select_sort_split(int p, int n) { int pa, pb, pi, pn; int f, v; pa = p; /* pa is start of group being picked out. */ pn = p + n - 1; /* pn is last position of subarray. */ - while (pa < pn) - { + while (pa < pn) { for (pi = pb = pa + 1, f = KEY(pa); pi <= pn; ++pi) - if ((v = KEY(pi)) < f) - { + if ((v = KEY(pi)) < f) { f = v; /* f is smallest key found. */ SWAP(pi, pa); /* place smallest element at beginning. */ pb = pa + 1; /* pb is position for elements equal to f. */ - } - else if (v == f) - { /* if equal to smallest key. */ + } else if (v == f) { /* if equal to smallest key. */ SWAP(pi, pb); /* place next to other smallest elements. */ ++pb; } update_group(pa, pb - 1); /* update group values for new group. */ pa = pb; /* continue sorting rest of the subarray. */ } - if (pa == pn) - { /* check if last part is single element. */ + if (pa == pn) { /* check if last part is single element. */ V[start + I[pa]] = pa; I[pa] = -1; /* sorted group. */ } @@ -313,34 +285,28 @@ else if (v == f) * n+1. I is array of size n+1 whose contents * are disregarded. 
*/ - private void bucketsort(int n, int k) - { + private void bucketsort(int n, int k) { int pi;// pointer int i, c, d, g; for (pi = 0; pi < k; ++pi) I[pi] = -1; /* mark linked lists empty. */ - for (i = 0; i <= n; ++i) - { + for (i = 0; i <= n; ++i) { V[start + i] = I[c = V[start + i]]; /* insert in linked list. */ I[c] = i; } - for (pi = k - 1, i = n; pi >= 0; --pi) - { + for (pi = k - 1, i = n; pi >= 0; --pi) { d = V[start + (c = I[pi])]; /* c is position, d is next in list. */ V[start + c] = g = i; /* last position equals group number. */ - if (d >= 0) - { /* if more than one element in group. */ + if (d >= 0) { /* if more than one element in group. */ I[i--] = c; /* p is permutation for the sorted x. */ - do - { + do { d = V[start + (c = d)]; /* next in linked list. */ V[start + c] = g; /* group number in x. */ I[i--] = c; /* permutation in p. */ } while (d >= 0); - } - else I[i--] = -1; /* one element, sorted group. */ + } else I[i--] = -1; /* one element, sorted group. */ } } @@ -359,72 +325,61 @@ private void bucketsort(int n, int k) * k-l>n, compaction is never done; if q is * {@link Integer#MAX_VALUE} , the maximum number of symbols are aggregated into one. *

    - * + * * @return an integer j in the range 1...q representing the - * size of the new alphabet. If j<=n+1 , the alphabet is - * compacted. The global variable r is set to the number of old - * symbols grouped into one. Only V[n] is 0. + * size of the new alphabet. If j<=n+1 , the alphabet is + * compacted. The global variable r is set to the number of old + * symbols grouped into one. Only V[n] is 0. */ - private int transform(int n, int k, int l, int q) - { + private int transform(int n, int k, int l, int q) { int b, c, d, e, i, j, m, s; int pi, pj;// pointers for (s = 0, i = k - l; i != 0; i >>= 1) ++s; /* s is number of bits in old symbol. */ e = Integer.MAX_VALUE >> s; /* e is for overflow checking. */ - for (b = d = r = 0; r < n && d <= e && (c = d << s | (k - l)) <= q; ++r) - { + for (b = d = r = 0; r < n && d <= e && (c = d << s | (k - l)) <= q; ++r) { b = b << s | (V[start + r] - l + 1); /* b is start of x in chunk alphabet. */ d = c; /* d is max symbol in chunk alphabet. */ } m = (1 << (r - 1) * s) - 1; /* m masks off top old symbol from chunk. */ V[start + n] = l - 1; /* emulate zero terminator. */ - if (d <= n) - { /* if bucketing possible, compact alphabet. */ + if (d <= n) { /* if bucketing possible, compact alphabet. */ for (pi = 0; pi <= d; ++pi) I[pi] = 0; /* zero transformation table. */ - for (pi = r, c = b; pi <= n; ++pi) - { + for (pi = r, c = b; pi <= n; ++pi) { I[c] = 1; /* mark used chunk symbol. */ c = (c & m) << s | (V[start + pi] - l + 1); /* * shift in next old symbol in * chunk. */ } - for (i = 1; i < r; ++i) - { /* handle last r-1 positions. */ + for (i = 1; i < r; ++i) { /* handle last r-1 positions. */ I[c] = 1; /* mark used chunk symbol. */ c = (c & m) << s; /* shift in next old symbol in chunk. */ } for (pi = 0, j = 1; pi <= d; ++pi) if (I[pi] != 0) I[pi] = j++; /* j is new alphabet size. 
*/ - for (pi = 0, pj = r, c = b; pj <= n; ++pi, ++pj) - { + for (pi = 0, pj = r, c = b; pj <= n; ++pi, ++pj) { V[start + pi] = I[c]; /* transform to new alphabet. */ c = (c & m) << s | (V[start + pj] - l + 1); /* * shift in next old symbol in * chunk. */ } - while (pi < n) - { /* handle last r-1 positions. */ + while (pi < n) { /* handle last r-1 positions. */ V[start + pi++] = I[c]; /* transform to new alphabet. */ c = (c & m) << s; /* shift right-end zero in chunk. */ } - } - else - { /* bucketing not possible, don't compact. */ - for (pi = 0, pj = r, c = b; pj <= n; ++pi, ++pj) - { + } else { /* bucketing not possible, don't compact. */ + for (pi = 0, pj = r, c = b; pj <= n; ++pi, ++pj) { V[start + pi] = c; /* transform to new alphabet. */ c = (c & m) << s | (V[start + pj] - l + 1); /* * shift in next old symbol in * chunk. */ } - while (pi < n) - { /* handle last r-1 positions. */ + while (pi < n) { /* handle last r-1 positions. */ V[start + pi++] = c; /* transform to new alphabet. */ c = (c & m) << s; /* shift right-end zero in chunk. */ } @@ -434,20 +389,17 @@ private int transform(int n, int k, int l, int q) return j; /* return new alphabet size. */ } - private int KEY(int p) - { + private int KEY(int p) { return V[start + I[p] + h]; } - private void SWAP(int a, int b) - { + private void SWAP(int a, int b) { int tmp = I[a]; I[a] = I[b]; I[b] = tmp; } - private int MED3(int a, int b, int c) - { + private int MED3(int a, int b, int c) { return (KEY(a) < KEY(b) ? (KEY(b) < KEY(c) ? (b) : KEY(a) < KEY(c) ? (c) : (a)) : (KEY(b) > KEY(c) ? (b) : KEY(a) > KEY(c) ? (c) : (a))); } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/SAIS.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/SAIS.java index 51aa126eb..ee05fd9f1 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/SAIS.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/SAIS.java @@ -39,354 +39,567 @@ *

    * Ge Nong, Sen Zhang and Wai Hong Chan, Two Efficient Algorithms for Linear Suffix Array * Construction, 2008. - * - * @see "http://yuta.256.googlepages.com/sais" * * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) + * @see "http://yuta.256.googlepages.com/sais" */ -public final class SAIS implements ISuffixArrayBuilder -{ - private static interface BaseArray - { - public int get(int i); - public void set(int i, int val); - public int update(int i, int val); - } - - private static final class ByteArray implements BaseArray - { - private byte[] m_A; - private int m_pos; - - ByteArray(byte[] A, int pos) { m_A = A; m_pos = pos; } - public int get(int i) { return m_A[m_pos + i] & 0xff; } - public void set(int i, int val) { m_A[m_pos + i] = (byte)(val & 0xff); } - public int update(int i, int val) { return m_A[m_pos + i] += val & 0xff; } - } - - private static final class CharArray implements BaseArray - { - private char[] m_A; - private int m_pos; - CharArray(char[] A, int pos) { m_A = A; m_pos = pos; } - public int get(int i) { return m_A[m_pos + i] & 0xffff; } - public void set(int i, int val) { m_A[m_pos + i] = (char)(val & 0xffff); } - public int update(int i, int val) { return m_A[m_pos + i] += val & 0xffff; } - } - - private static final class ShortArray implements BaseArray - { - private short[] m_A; - private int m_pos; - ShortArray(short[] A, int pos) { m_A = A; m_pos = pos; } - public int get(int i) { return m_A[m_pos + i] & 0xffff; } - public void set(int i, int val) { m_A[m_pos + i] = (short)(val & 0xffff); } - public int update(int i, int val) { return m_A[m_pos + i] += val & 0xffff; } - } - - private static final class IntArray implements BaseArray - { - private int[] m_A; - private int m_pos; - IntArray(int[] A, int pos) { m_A = A; m_pos = pos; } - public int get(int i) { return m_A[m_pos + i]; } - public void set(int i, int val) { m_A[m_pos + i] = val; } - public int update(int i, int val) { return m_A[m_pos + i] += val; } 
- } - - private static final class StringArray implements BaseArray - { - private String m_A; - private int m_pos; - StringArray(String A, int pos) { m_A = A; m_pos = pos; } - public int get(int i) { return (int)(m_A.charAt(m_pos + i) & 0xffff); } - public void set(int i, int val) { } - public int update(int i, int val) { return 0; } - } - - /* find the start or end of each bucket */ - private static void getCounts(BaseArray T, BaseArray C, int n, int k) { - for(int i = 0; i < k; ++i) { C.set(i, 0); } - for(int i = 0; i < n; ++i) { C.update(T.get(i), 1); } - } - - private static void getBuckets(BaseArray C, BaseArray B, int k, boolean end) { - int i, sum = 0; - if (end != false) { for(i = 0; i < k; ++i) { sum += C.get(i); B.set(i, sum); } } - else { for(i = 0; i < k; ++i) { sum += C.get(i); B.set(i, sum - C.get(i)); } } - } - - /* compute SA and BWT */ - private static void induceSA(BaseArray T, int[] SA, BaseArray C, BaseArray B, int n, int k) - { - int b, i, j; - int c0, c1; +public final class SAIS implements ISuffixArrayBuilder { + private static interface BaseArray { + public int get(int i); + + public void set(int i, int val); + + public int update(int i, int val); + } + + private static final class ByteArray implements BaseArray { + private byte[] m_A; + private int m_pos; + + ByteArray(byte[] A, int pos) { + m_A = A; + m_pos = pos; + } + + public int get(int i) { + return m_A[m_pos + i] & 0xff; + } + + public void set(int i, int val) { + m_A[m_pos + i] = (byte) (val & 0xff); + } + + public int update(int i, int val) { + return m_A[m_pos + i] += val & 0xff; + } + } + + private static final class CharArray implements BaseArray { + private char[] m_A; + private int m_pos; + + CharArray(char[] A, int pos) { + m_A = A; + m_pos = pos; + } + + public int get(int i) { + return m_A[m_pos + i] & 0xffff; + } + + public void set(int i, int val) { + m_A[m_pos + i] = (char) (val & 0xffff); + } + + public int update(int i, int val) { + return m_A[m_pos + i] += val & 
0xffff; + } + } + + private static final class ShortArray implements BaseArray { + private short[] m_A; + private int m_pos; + + ShortArray(short[] A, int pos) { + m_A = A; + m_pos = pos; + } + + public int get(int i) { + return m_A[m_pos + i] & 0xffff; + } + + public void set(int i, int val) { + m_A[m_pos + i] = (short) (val & 0xffff); + } + + public int update(int i, int val) { + return m_A[m_pos + i] += val & 0xffff; + } + } + + private static final class IntArray implements BaseArray { + private int[] m_A; + private int m_pos; + + IntArray(int[] A, int pos) { + m_A = A; + m_pos = pos; + } + + public int get(int i) { + return m_A[m_pos + i]; + } + + public void set(int i, int val) { + m_A[m_pos + i] = val; + } + + public int update(int i, int val) { + return m_A[m_pos + i] += val; + } + } + + private static final class StringArray implements BaseArray { + private String m_A; + private int m_pos; + + StringArray(String A, int pos) { + m_A = A; + m_pos = pos; + } + + public int get(int i) { + return (int) (m_A.charAt(m_pos + i) & 0xffff); + } + + public void set(int i, int val) { + } + + public int update(int i, int val) { + return 0; + } + } + + /* find the start or end of each bucket */ + private static void getCounts(BaseArray T, BaseArray C, int n, int k) { + for (int i = 0; i < k; ++i) { + C.set(i, 0); + } + for (int i = 0; i < n; ++i) { + C.update(T.get(i), 1); + } + } + + private static void getBuckets(BaseArray C, BaseArray B, int k, boolean end) { + int i, sum = 0; + if (end != false) { + for (i = 0; i < k; ++i) { + sum += C.get(i); + B.set(i, sum); + } + } else { + for (i = 0; i < k; ++i) { + sum += C.get(i); + B.set(i, sum - C.get(i)); + } + } + } + + /* compute SA and BWT */ + private static void induceSA(BaseArray T, int[] SA, BaseArray C, BaseArray B, int n, int k) { + int b, i, j; + int c0, c1; /* compute SAl */ - if(C == B) { getCounts(T, C, n, k); } - getBuckets(C, B, k, false); /* find starts of buckets */ - j = n - 1; - b = B.get(c1 = T.get(j)); 
- SA[b++] = ((0 < j) && (T.get(j - 1) < c1)) ? ~j : j; - for(i = 0; i < n; ++i) { - j = SA[i]; SA[i] = ~j; - if(0 < j) { - if((c0 = T.get(--j)) != c1) { B.set(c1, b); b = B.get(c1 = c0); } + if (C == B) { + getCounts(T, C, n, k); + } + getBuckets(C, B, k, false); /* find starts of buckets */ + j = n - 1; + b = B.get(c1 = T.get(j)); SA[b++] = ((0 < j) && (T.get(j - 1) < c1)) ? ~j : j; - } - } + for (i = 0; i < n; ++i) { + j = SA[i]; + SA[i] = ~j; + if (0 < j) { + if ((c0 = T.get(--j)) != c1) { + B.set(c1, b); + b = B.get(c1 = c0); + } + SA[b++] = ((0 < j) && (T.get(j - 1) < c1)) ? ~j : j; + } + } /* compute SAs */ - if(C == B) { getCounts(T, C, n, k); } - getBuckets(C, B, k, true); /* find ends of buckets */ - for(i = n - 1, b = B.get(c1 = 0); 0 <= i; --i) { - if(0 < (j = SA[i])) { - if((c0 = T.get(--j)) != c1) { B.set(c1, b); b = B.get(c1 = c0); } - SA[--b] = ((j == 0) || (T.get(j - 1) > c1)) ? ~j : j; - } else { - SA[i] = ~j; - } + if (C == B) { + getCounts(T, C, n, k); + } + getBuckets(C, B, k, true); /* find ends of buckets */ + for (i = n - 1, b = B.get(c1 = 0); 0 <= i; --i) { + if (0 < (j = SA[i])) { + if ((c0 = T.get(--j)) != c1) { + B.set(c1, b); + b = B.get(c1 = c0); + } + SA[--b] = ((j == 0) || (T.get(j - 1) > c1)) ? ~j : j; + } else { + SA[i] = ~j; + } + } } - } - - private static int computeBWT(BaseArray T, int[] SA, BaseArray C, BaseArray B, int n, int k) { - int b, i, j, pidx = -1; - int c0, c1; + + private static int computeBWT(BaseArray T, int[] SA, BaseArray C, BaseArray B, int n, int k) { + int b, i, j, pidx = -1; + int c0, c1; /* compute SAl */ - if(C == B) { getCounts(T, C, n, k); } - getBuckets(C, B, k, false); /* find starts of buckets */ - j = n - 1; - b = B.get(c1 = T.get(j)); - SA[b++] = ((0 < j) && (T.get(j - 1) < c1)) ? 
~j : j; - for(i = 0; i < n; ++i) { - if(0 < (j = SA[i])) { - SA[i] = ~(c0 = T.get(--j)); - if(c0 != c1) { B.set(c1, b); b = B.get(c1 = c0); } + if (C == B) { + getCounts(T, C, n, k); + } + getBuckets(C, B, k, false); /* find starts of buckets */ + j = n - 1; + b = B.get(c1 = T.get(j)); SA[b++] = ((0 < j) && (T.get(j - 1) < c1)) ? ~j : j; - } else if(j != 0) { - SA[i] = ~j; - } - } + for (i = 0; i < n; ++i) { + if (0 < (j = SA[i])) { + SA[i] = ~(c0 = T.get(--j)); + if (c0 != c1) { + B.set(c1, b); + b = B.get(c1 = c0); + } + SA[b++] = ((0 < j) && (T.get(j - 1) < c1)) ? ~j : j; + } else if (j != 0) { + SA[i] = ~j; + } + } /* compute SAs */ - if(C == B) { getCounts(T, C, n, k); } - getBuckets(C, B, k, true); /* find ends of buckets */ - for(i = n - 1, b = B.get(c1 = 0); 0 <= i; --i) { - if(0 < (j = SA[i])) { - SA[i] = (c0 = T.get(--j)); - if(c0 != c1) { B.set(c1, b); b = B.get(c1 = c0); } - SA[--b] = ((0 < j) && (T.get(j - 1) > c1)) ? ~((int)T.get(j - 1)) : j; - } else if(j != 0) { - SA[i] = ~j; - } else { - pidx = i; - } + if (C == B) { + getCounts(T, C, n, k); + } + getBuckets(C, B, k, true); /* find ends of buckets */ + for (i = n - 1, b = B.get(c1 = 0); 0 <= i; --i) { + if (0 < (j = SA[i])) { + SA[i] = (c0 = T.get(--j)); + if (c0 != c1) { + B.set(c1, b); + b = B.get(c1 = c0); + } + SA[--b] = ((0 < j) && (T.get(j - 1) > c1)) ? 
~((int) T.get(j - 1)) : j; + } else if (j != 0) { + SA[i] = ~j; + } else { + pidx = i; + } + } + return pidx; } - return pidx; - } - /* find the suffix array SA of T[0..n-1] in {0..k-1}^n - use a working space (excluding T and SA) of at most 2n+O(1) for a constant alphabet */ - private static int SA_IS(BaseArray T, int[] SA, int fs, int n, int k, boolean isbwt) { - BaseArray C, B, RA; - int i, j, c, m, p, q, plen, qlen, name, pidx = 0; - int c0, c1; - boolean diff; + /* find the suffix array SA of T[0..n-1] in {0..k-1}^n + use a working space (excluding T and SA) of at most 2n+O(1) for a constant alphabet */ + private static int SA_IS(BaseArray T, int[] SA, int fs, int n, int k, boolean isbwt) { + BaseArray C, B, RA; + int i, j, c, m, p, q, plen, qlen, name, pidx = 0; + int c0, c1; + boolean diff; /* stage 1: reduce the problem by at least 1/2 sort all the S-substrings */ - if(k <= fs) { - C = new IntArray(SA, n); - B = (k <= (fs - k)) ? new IntArray(SA, n + k) : C; - } else { - B = C = new IntArray(new int[k], 0); - } - getCounts(T, C, n, k); getBuckets(C, B, k, true); /* find ends of buckets */ - for(i = 0; i < n; ++i) { SA[i] = 0; } - for(i = n - 2, c = 0, c1 = T.get(n - 1); 0 <= i; --i, c1 = c0) { - if((c0 = T.get(i)) < (c1 + c)) { c = 1; } - else if(c != 0) { SA[B.update(c1, -1)] = i + 1; c = 0; } - } - induceSA(T, SA, C, B, n, k); - C = null; B = null; + if (k <= fs) { + C = new IntArray(SA, n); + B = (k <= (fs - k)) ? 
new IntArray(SA, n + k) : C; + } else { + B = C = new IntArray(new int[k], 0); + } + getCounts(T, C, n, k); + getBuckets(C, B, k, true); /* find ends of buckets */ + for (i = 0; i < n; ++i) { + SA[i] = 0; + } + for (i = n - 2, c = 0, c1 = T.get(n - 1); 0 <= i; --i, c1 = c0) { + if ((c0 = T.get(i)) < (c1 + c)) { + c = 1; + } else if (c != 0) { + SA[B.update(c1, -1)] = i + 1; + c = 0; + } + } + induceSA(T, SA, C, B, n, k); + C = null; + B = null; /* compact all the sorted substrings into the first m items of SA 2*m must be not larger than n (proveable) */ - for(i = 0, m = 0; i < n; ++i) { - p = SA[i]; - if((0 < p) && (T.get(p - 1) > (c0 = T.get(p)))) { - for(j = p + 1; (j < n) && (c0 == (c1 = T.get(j))); ++j) { } - if((j < n) && (c0 < c1)) { SA[m++] = p; } - } - } - j = m + (n >> 1); - for(i = m; i < j; ++i) { SA[i] = 0; } /* init the name array buffer */ + for (i = 0, m = 0; i < n; ++i) { + p = SA[i]; + if ((0 < p) && (T.get(p - 1) > (c0 = T.get(p)))) { + for (j = p + 1; (j < n) && (c0 == (c1 = T.get(j))); ++j) { + } + if ((j < n) && (c0 < c1)) { + SA[m++] = p; + } + } + } + j = m + (n >> 1); + for (i = m; i < j; ++i) { + SA[i] = 0; + } /* init the name array buffer */ /* store the length of all substrings */ - for(i = n - 2, j = n, c = 0, c1 = T.get(n - 1); 0 <= i; --i, c1 = c0) { - if((c0 = T.get(i)) < (c1 + c)) { c = 1; } - else if(c != 0) { SA[m + ((i + 1) >> 1)] = j - i - 1; j = i + 1; c = 0; } - } + for (i = n - 2, j = n, c = 0, c1 = T.get(n - 1); 0 <= i; --i, c1 = c0) { + if ((c0 = T.get(i)) < (c1 + c)) { + c = 1; + } else if (c != 0) { + SA[m + ((i + 1) >> 1)] = j - i - 1; + j = i + 1; + c = 0; + } + } /* find the lexicographic names of all substrings */ - for(i = 0, name = 0, q = n, qlen = 0; i < m; ++i) { - p = SA[i]; plen = SA[m + (p >> 1)]; diff = true; - if(plen == qlen) { - for(j = 0; (j < plen) && (T.get(p + j) == T.get(q + j)); ++j) { } - if(j == plen) { diff = false; } - } - if(diff != false) { ++name; q = p; qlen = plen; } - SA[m + (p >> 1)] = 
name; - } + for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) { + p = SA[i]; + plen = SA[m + (p >> 1)]; + diff = true; + if (plen == qlen) { + for (j = 0; (j < plen) && (T.get(p + j) == T.get(q + j)); ++j) { + } + if (j == plen) { + diff = false; + } + } + if (diff != false) { + ++name; + q = p; + qlen = plen; + } + SA[m + (p >> 1)] = name; + } /* stage 2: solve the reduced problem recurse if names are not yet unique */ - if(name < m) { - RA = new IntArray(SA, n + fs - m); - for(i = m + (n >> 1) - 1, j = n + fs - 1; m <= i; --i) { - if(SA[i] != 0) { SA[j--] = SA[i] - 1; } - } - SA_IS(RA, SA, fs + n - m * 2, m, name, false); - RA = null; - for(i = n - 2, j = m * 2 - 1, c = 0, c1 = T.get(n - 1); 0 <= i; --i, c1 = c0) { - if((c0 = T.get(i)) < (c1 + c)) { c = 1; } - else if(c != 0) { SA[j--] = i + 1; c = 0; } /* get p1 */ - } - for(i = 0; i < m; ++i) { SA[i] = SA[SA[i] + m]; } /* get index */ - } + if (name < m) { + RA = new IntArray(SA, n + fs - m); + for (i = m + (n >> 1) - 1, j = n + fs - 1; m <= i; --i) { + if (SA[i] != 0) { + SA[j--] = SA[i] - 1; + } + } + SA_IS(RA, SA, fs + n - m * 2, m, name, false); + RA = null; + for (i = n - 2, j = m * 2 - 1, c = 0, c1 = T.get(n - 1); 0 <= i; --i, c1 = c0) { + if ((c0 = T.get(i)) < (c1 + c)) { + c = 1; + } else if (c != 0) { + SA[j--] = i + 1; + c = 0; + } /* get p1 */ + } + for (i = 0; i < m; ++i) { + SA[i] = SA[SA[i] + m]; + } /* get index */ + } /* stage 3: induce the result for the original problem */ - if(k <= fs) { - C = new IntArray(SA, n); - B = (k <= (fs - k)) ? new IntArray(SA, n + k) : C; - } else { - B = C = new IntArray(new int[k], 0); - } + if (k <= fs) { + C = new IntArray(SA, n); + B = (k <= (fs - k)) ? 
new IntArray(SA, n + k) : C; + } else { + B = C = new IntArray(new int[k], 0); + } /* put all left-most S characters into their buckets */ - getCounts(T, C, n, k); getBuckets(C, B, k, true); /* find ends of buckets */ - for(i = m; i < n; ++i) { SA[i] = 0; } /* init SA[m..n-1] */ - for(i = m - 1; 0 <= i; --i) { - j = SA[i]; SA[i] = 0; - SA[B.update(T.get(j), -1)] = j; + getCounts(T, C, n, k); + getBuckets(C, B, k, true); /* find ends of buckets */ + for (i = m; i < n; ++i) { + SA[i] = 0; + } /* init SA[m..n-1] */ + for (i = m - 1; 0 <= i; --i) { + j = SA[i]; + SA[i] = 0; + SA[B.update(T.get(j), -1)] = j; + } + if (isbwt == false) { + induceSA(T, SA, C, B, n, k); + } else { + pidx = computeBWT(T, SA, C, B, n, k); + } + C = null; + B = null; + return pidx; } - if(isbwt == false) { induceSA(T, SA, C, B, n, k); } - else { pidx = computeBWT(T, SA, C, B, n, k); } - C = null; B = null; - return pidx; - } - /** Suffixsorting **/ + /** + * Suffixsorting * + */ /* byte */ - public static - int - suffixsort(byte[] T, int[] SA, int n) { - if((T == null) || (SA == null) || (T.length < n) || (SA.length < n)) { return -1; } - if(n <= 1) { if(n == 1) { SA[0] = 0; } return 0; } - return SA_IS(new ByteArray(T, 0), SA, 0, n, 256, false); - } - /* char */ - public static - int - suffixsort(char[] T, int[] SA, int n) { - if((T == null) || (SA == null) || (T.length < n) || (SA.length < n)) { return -1; } - if(n <= 1) { if(n == 1) { SA[0] = 0; } return 0; } - return SA_IS(new CharArray(T, 0), SA, 0, n, 65536, false); - } - /* short */ - public static - int - suffixsort(short[] T, int[] SA, int n, int k) { - if((T == null) || (SA == null) || - (T.length < n) || (SA.length < n) || - (k <= 0) || (65536 < k)) { return -1; } - if(n <= 1) { if(n == 1) { SA[0] = 0; } return 0; } - return SA_IS(new ShortArray(T, 0), SA, 0, n, k, false); - } - /* int */ - public static - int - suffixsort(int[] T, int[] SA, int n, int k) { - if((T == null) || (SA == null) || - (T.length < n) || (SA.length < n) || - 
(k <= 0)) { return -1; } - if(n <= 1) { if(n == 1) { SA[0] = 0; } return 0; } - return SA_IS(new IntArray(T, 0), SA, 0, n, k, false); - } - /* String */ - public static - int - suffixsort(String T, int[] SA, int n) { - if((T == null) || (SA == null) || - (T.length() < n) || (SA.length < n)) { return -1; } - if(n <= 1) { if(n == 1) { SA[0] = 0; } return 0; } - return SA_IS(new StringArray(T, 0), SA, 0, n, 65536, false); - } - - /** Burrows-Wheeler Transform **/ + public static int + suffixsort(byte[] T, int[] SA, int n) { + if ((T == null) || (SA == null) || (T.length < n) || (SA.length < n)) { + return -1; + } + if (n <= 1) { + if (n == 1) { + SA[0] = 0; + } + return 0; + } + return SA_IS(new ByteArray(T, 0), SA, 0, n, 256, false); + } + + /* char */ + public static int + suffixsort(char[] T, int[] SA, int n) { + if ((T == null) || (SA == null) || (T.length < n) || (SA.length < n)) { + return -1; + } + if (n <= 1) { + if (n == 1) { + SA[0] = 0; + } + return 0; + } + return SA_IS(new CharArray(T, 0), SA, 0, n, 65536, false); + } + + /* short */ + public static int + suffixsort(short[] T, int[] SA, int n, int k) { + if ((T == null) || (SA == null) || + (T.length < n) || (SA.length < n) || + (k <= 0) || (65536 < k)) { + return -1; + } + if (n <= 1) { + if (n == 1) { + SA[0] = 0; + } + return 0; + } + return SA_IS(new ShortArray(T, 0), SA, 0, n, k, false); + } + + /* int */ + public static int + suffixsort(int[] T, int[] SA, int n, int k) { + if ((T == null) || (SA == null) || + (T.length < n) || (SA.length < n) || + (k <= 0)) { + return -1; + } + if (n <= 1) { + if (n == 1) { + SA[0] = 0; + } + return 0; + } + return SA_IS(new IntArray(T, 0), SA, 0, n, k, false); + } + + /* String */ + public static int + suffixsort(String T, int[] SA, int n) { + if ((T == null) || (SA == null) || + (T.length() < n) || (SA.length < n)) { + return -1; + } + if (n <= 1) { + if (n == 1) { + SA[0] = 0; + } + return 0; + } + return SA_IS(new StringArray(T, 0), SA, 0, n, 65536, false); + } 
+ + /** + * Burrows-Wheeler Transform * + */ /* byte */ - public static - int - bwtransform(byte[] T, byte[] U, int[] A, int n) { - int i, pidx; - if((T == null) || (U == null) || (A == null) || - (T.length < n) || (U.length < n) || (A.length < n)) { return -1; } - if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; } - pidx = SA_IS(new ByteArray(T, 0), A, 0, n, 256, true); - U[0] = T[n - 1]; - for(i = 0; i < pidx; ++i) { U[i + 1] = (byte)(A[i] & 0xff); } - for(i += 1; i < n; ++i) { U[i] = (byte)(A[i] & 0xff); } - return pidx + 1; - } - /* char */ - public static - int - bwtransform(char[] T, char[] U, int[] A, int n) { - int i, pidx; - if((T == null) || (U == null) || (A == null) || - (T.length < n) || (U.length < n) || (A.length < n)) { return -1; } - if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; } - pidx = SA_IS(new CharArray(T, 0), A, 0, n, 65536, true); - U[0] = T[n - 1]; - for(i = 0; i < pidx; ++i) { U[i + 1] = (char)(A[i] & 0xffff); } - for(i += 1; i < n; ++i) { U[i] = (char)(A[i] & 0xffff); } - return pidx + 1; - } - /* short */ - public static - int - bwtransform(short[] T, short[] U, int[] A, int n, int k) { - int i, pidx; - if((T == null) || (U == null) || (A == null) || - (T.length < n) || (U.length < n) || (A.length < n) || - (0 <= k) || (65536 < k)) { return -1; } - if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; } - pidx = SA_IS(new ShortArray(T, 0), A, 0, n, k, true); - U[0] = T[n - 1]; - for(i = 0; i < pidx; ++i) { U[i + 1] = (short)(A[i] & 0xffff); } - for(i += 1; i < n; ++i) { U[i] = (short)(A[i] & 0xffff); } - return pidx + 1; - } - /* int */ - public static - int - bwtransform(int[] T, int[] U, int[] A, int n, int k) { - int i, pidx; - if((T == null) || (U == null) || (A == null) || - (T.length < n) || (U.length < n) || (A.length < n) || - (0 <= k)) { return -1; } - if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; } - pidx = SA_IS(new IntArray(T, 0), A, 0, n, k, true); - U[0] = T[n - 1]; - for(i = 0; i < pidx; ++i) { U[i + 1] = A[i]; } 
- for(i += 1; i < n; ++i) { U[i] = A[i]; } - return pidx + 1; - } - - @Override - public int [] buildSuffixArray(int [] input, int start, int length) - { - // TODO: [dw] add constraints here. - final int [] SA = new int [length]; - MinMax mm = Tools.minmax(input, start, length); - suffixsort(input, SA, length, mm.max + 1); - return SA; - } + public static int + bwtransform(byte[] T, byte[] U, int[] A, int n) { + int i, pidx; + if ((T == null) || (U == null) || (A == null) || + (T.length < n) || (U.length < n) || (A.length < n)) { + return -1; + } + if (n <= 1) { + if (n == 1) { + U[0] = T[0]; + } + return n; + } + pidx = SA_IS(new ByteArray(T, 0), A, 0, n, 256, true); + U[0] = T[n - 1]; + for (i = 0; i < pidx; ++i) { + U[i + 1] = (byte) (A[i] & 0xff); + } + for (i += 1; i < n; ++i) { + U[i] = (byte) (A[i] & 0xff); + } + return pidx + 1; + } + + /* char */ + public static int + bwtransform(char[] T, char[] U, int[] A, int n) { + int i, pidx; + if ((T == null) || (U == null) || (A == null) || + (T.length < n) || (U.length < n) || (A.length < n)) { + return -1; + } + if (n <= 1) { + if (n == 1) { + U[0] = T[0]; + } + return n; + } + pidx = SA_IS(new CharArray(T, 0), A, 0, n, 65536, true); + U[0] = T[n - 1]; + for (i = 0; i < pidx; ++i) { + U[i + 1] = (char) (A[i] & 0xffff); + } + for (i += 1; i < n; ++i) { + U[i] = (char) (A[i] & 0xffff); + } + return pidx + 1; + } + + /* short */ + public static int + bwtransform(short[] T, short[] U, int[] A, int n, int k) { + int i, pidx; + if ((T == null) || (U == null) || (A == null) || + (T.length < n) || (U.length < n) || (A.length < n) || + (0 <= k) || (65536 < k)) { + return -1; + } + if (n <= 1) { + if (n == 1) { + U[0] = T[0]; + } + return n; + } + pidx = SA_IS(new ShortArray(T, 0), A, 0, n, k, true); + U[0] = T[n - 1]; + for (i = 0; i < pidx; ++i) { + U[i + 1] = (short) (A[i] & 0xffff); + } + for (i += 1; i < n; ++i) { + U[i] = (short) (A[i] & 0xffff); + } + return pidx + 1; + } + + /* int */ + public static int + 
bwtransform(int[] T, int[] U, int[] A, int n, int k) { + int i, pidx; + if ((T == null) || (U == null) || (A == null) || + (T.length < n) || (U.length < n) || (A.length < n) || + (0 <= k)) { + return -1; + } + if (n <= 1) { + if (n == 1) { + U[0] = T[0]; + } + return n; + } + pidx = SA_IS(new IntArray(T, 0), A, 0, n, k, true); + U[0] = T[n - 1]; + for (i = 0; i < pidx; ++i) { + U[i + 1] = A[i]; + } + for (i += 1; i < n; ++i) { + U[i] = A[i]; + } + return pidx + 1; + } + + @Override + public int[] buildSuffixArray(int[] input, int start, int length) { + // TODO: [dw] add constraints here. + final int[] SA = new int[length]; + MinMax mm = Tools.minmax(input, start, length); + suffixsort(input, SA, length, mm.max + 1); + return SA; + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/Skew.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/Skew.java index f7e60ad73..d0071b8e8 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/Skew.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/Skew.java @@ -6,7 +6,7 @@ *

    * Straightforward reimplementation of the recursive algorithm given in: * J. Kärkkäinen and P. Sanders. Simple linear work suffix array construction. - * In Proc. 13th International Conference on Automata, Languages and Programming, + * In Proc. 13th International Conference on Automata, Languages and Programming, * Springer, 2003 * *

    @@ -19,21 +19,18 @@ * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) */ -public final class Skew implements ISuffixArrayBuilder -{ +public final class Skew implements ISuffixArrayBuilder { /** * Lexicographic order for pairs. */ - private final static boolean leq(int a1, int a2, int b1, int b2) - { + private final static boolean leq(int a1, int a2, int b1, int b2) { return (a1 < b1 || (a1 == b1 && a2 <= b2)); } /** * Lexicographic order for triples. */ - private final static boolean leq(int a1, int a2, int a3, int b1, int b2, int b3) - { + private final static boolean leq(int a1, int a2, int a3, int b1, int b2, int b3) { return (a1 < b1 || (a1 == b1 && leq(a2, a3, b2, b3))); } @@ -41,9 +38,8 @@ private final static boolean leq(int a1, int a2, int a3, int b1, int b2, int b3) * Stably sort indexes from src[0..n-1] to dst[0..n-1] with values in 0..K from v. A * constant offset of vi is added to indexes from src. */ - private final static void radixPass(int [] src, int [] dst, int [] v, int vi, - final int n, final int K, int start, int [] cnt) - { + private final static void radixPass(int[] src, int[] dst, int[] v, int vi, + final int n, final int K, int start, int[] cnt) { // check counter array's size. assert cnt.length >= K + 1; Arrays.fill(cnt, 0, K + 1, 0); @@ -53,8 +49,7 @@ private final static void radixPass(int [] src, int [] dst, int [] v, int vi, cnt[v[start + vi + src[i]]]++; // exclusive prefix sums - for (int i = 0, sum = 0; i <= K; i++) - { + for (int i = 0, sum = 0; i <= K; i++) { final int t = cnt[i]; cnt[i] = sum; sum += t; @@ -69,16 +64,15 @@ private final static void radixPass(int [] src, int [] dst, int [] v, int vi, * Find the suffix array SA of s[0..n-1] in {1..K}^n. require s[n] = s[n+1] = s[n+2] = * 0, n >= 2. 
*/ - static final int[] suffixArray(int [] s, int [] SA, int n, final int K, int start, int [] cnt) - { + static final int[] suffixArray(int[] s, int[] SA, int n, final int K, int start, int[] cnt) { final int n0 = (n + 2) / 3, n1 = (n + 1) / 3, n2 = n / 3, n02 = n0 + n2; - final int [] s12 = new int [n02 + 3]; + final int[] s12 = new int[n02 + 3]; s12[n02] = s12[n02 + 1] = s12[n02 + 2] = 0; - final int [] SA12 = new int [n02 + 3]; + final int[] SA12 = new int[n02 + 3]; SA12[n02] = SA12[n02 + 1] = SA12[n02 + 2] = 0; - final int [] s0 = new int [n0]; - final int [] SA0 = new int [n0]; + final int[] s0 = new int[n0]; + final int[] SA0 = new int[n0]; /* * generate positions of mod 1 and mod 2 suffixes the "+(n0-n1)" adds a dummy mod @@ -95,39 +89,31 @@ static final int[] suffixArray(int [] s, int [] SA, int n, final int K, int star // find lexicographic names of triples int name = 0, c0 = -1, c1 = -1, c2 = -1; - for (int i = 0; i < n02; i++) - { + for (int i = 0; i < n02; i++) { if (s[start + SA12[i]] != c0 || s[start + SA12[i] + 1] != c1 - || s[start + SA12[i] + 2] != c2) - { + || s[start + SA12[i] + 2] != c2) { name++; c0 = s[start + SA12[i]]; c1 = s[start + SA12[i] + 1]; c2 = s[start + SA12[i] + 2]; } - if ((SA12[i] % 3) == 1) - { + if ((SA12[i] % 3) == 1) { // left half s12[SA12[i] / 3] = name; - } - else - { + } else { // right half s12[SA12[i] / 3 + n0] = name; } } // recurse if names are not yet unique - if (name < n02) - { + if (name < n02) { cnt = suffixArray(s12, SA12, n02, name, start, cnt); // store unique names in s12 using the suffix array for (int i = 0; i < n02; i++) s12[SA12[i]] = i + 1; - } - else - { + } else { // generate the suffix array of s12 directly for (int i = 0; i < n02; i++) SA12[s12[i] - 1] = i; @@ -139,8 +125,7 @@ static final int[] suffixArray(int [] s, int [] SA, int n, final int K, int star radixPass(s0, SA0, s, 0, n0, K, start, cnt); // merge sorted SA0 suffixes and sorted SA12 suffixes - for (int p = 0, t = n0 - n1, k = 0; k < n; 
k++) - { + for (int p = 0, t = n0 - n1, k = 0; k < n; k++) { // pos of current offset 12 suffix final int i = (SA12[t] < n0 ? SA12[t] * 3 + 1 : (SA12[t] - n0) * 3 + 2); // pos of current offset 0 suffix @@ -148,27 +133,21 @@ static final int[] suffixArray(int [] s, int [] SA, int n, final int K, int star if (SA12[t] < n0 ? leq(s[start + i], s12[SA12[t] + n0], s[start + j], s12[j / 3]) : leq(s[start + i], s[start + i + 1], s12[SA12[t] - n0 + 1], - s[start + j], s[start + j + 1], s12[j / 3 + n0])) - { + s[start + j], s[start + j + 1], s12[j / 3 + n0])) { // suffix from SA12 is smaller SA[k] = i; t++; - if (t == n02) - { + if (t == n02) { // done --- only SA0 suffixes left for (k++; p < n0; p++, k++) SA[k] = SA0[p]; } - } - else - { + } else { SA[k] = j; p++; - if (p == n0) - { + if (p == n0) { // done --- only SA12 suffixes left - for (k++; t < n02; t++, k++) - { + for (k++; t < n02; t++, k++) { SA[k] = (SA12[t] < n0 ? SA12[t] * 3 + 1 : (SA12[t] - n0) * 3 + 2); } } @@ -181,14 +160,12 @@ static final int[] suffixArray(int [] s, int [] SA, int n, final int K, int star /** * Ensure array is large enough or reallocate (no copying). */ - private static final int [] ensureSize(int [] tab, int length) - { - if (tab.length < length) - { + private static final int[] ensureSize(int[] tab, int length) { + if (tab.length < length) { tab = null; - tab = new int [length]; + tab = new int[length]; } - + return tab; } @@ -197,41 +174,40 @@ static final int[] suffixArray(int [] s, int [] SA, int n, final int K, int star *

    * Additional constraints enforced by Karkkainen-Sanders algorithm: *

      - *
    • non-negative (>0) symbols in the input (because of radix sort)
    • , + *
    • non-negative (>0) symbols in the input (because of radix sort)
    • *
    • input.length >= start + length + 3 (to simplify * border cases)
    • - *
    • length >= 2
    • + *
    • length >= 2
    • *
    *

    * If the input contains zero or negative values, or has no extra trailing cells, * adapters can be used in the following way: - * + *

    *

          * return new {@link DensePositiveDecorator}(
          *      new {@link ExtraTrailingCellsDecorator}(
          *          new {@link Skew}(), 3));
          * 
    - * + * * @see ExtraTrailingCellsDecorator * @see DensePositiveDecorator */ @Override - public int [] buildSuffixArray(int [] input, int start, int length) - { + public int[] buildSuffixArray(int[] input, int start, int length) { Tools.assertAlways(input != null, "input must not be null"); Tools.assertAlways(length >= 2, "input length must be >= 2"); Tools.assertAlways(input.length >= start + length + 3, "no extra space after input end"); assert Tools.allPositive(input, start, length); final int alphabetSize = Tools.max(input, start, length); - final int [] SA = new int [length + 3]; + final int[] SA = new int[length + 3]; // Preserve the tail of the input (destroyed when constructing the array). - final int [] tail = new int [3]; + final int[] tail = new int[3]; System.arraycopy(input, start + length, tail, 0, 3); Arrays.fill(input, start + length, start + length + 3, 0); - suffixArray(input, SA, length, alphabetSize, start, new int [alphabetSize + 2]); + suffixArray(input, SA, length, alphabetSize, start, new int[alphabetSize + 2]); // Reconstruct the input's tail. 
System.arraycopy(tail, 0, input, start + length, 3); diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/SuffixArrays.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/SuffixArrays.java index 7e5188e2f..a85e2e217 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/SuffixArrays.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/SuffixArrays.java @@ -4,8 +4,6 @@ import java.util.Comparator; import java.util.List; -import com.google.common.collect.Lists; - /* * TODO: ultimately, this class should be "intelligent" enough to pick the best * algorithm, depending on the distribution and properties of the input (alphabet size, @@ -28,28 +26,25 @@ * * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) - * @author Anton Olsson for friprogramvarusyndikatet.se + * @author Anton Olsson for friprogramvarusyndikatet.se */ -public final class SuffixArrays -{ +public final class SuffixArrays { /** * Maximum required trailing space in the input array (certain algorithms need it). */ final static int MAX_EXTRA_TRAILING_SPACE = DeepShallow.OVERSHOOT; /* - * + * */ - private SuffixArrays() - { + private SuffixArrays() { // no instances. } /** * Create a suffix array for a given character sequence with the default algorithm. */ - public static int [] create(CharSequence s) - { + public static int[] create(CharSequence s) { return create(s, defaultAlgorithm()); } @@ -57,40 +52,36 @@ private SuffixArrays() * Create a suffix array for a given character sequence, using the provided suffix * array building strategy. */ - public static int [] create(CharSequence s, ISuffixArrayBuilder builder) - { + public static int[] create(CharSequence s, ISuffixArrayBuilder builder) { return new CharSequenceAdapter(builder).buildSuffixArray(s); } /** * Create a suffix array and an LCP array for a given character sequence. 
- * + * * @see #computeLCP(int[], int, int, int[]) */ - public static SuffixData createWithLCP(CharSequence s) - { + public static SuffixData createWithLCP(CharSequence s) { return createWithLCP(s, defaultAlgorithm()); } /** * Create a suffix array and an LCP array for a given character sequence, use the * given algorithm for building the suffix array. - * + * * @see #computeLCP(int[], int, int, int[]) */ - public static SuffixData createWithLCP(CharSequence s, ISuffixArrayBuilder builder) - { + public static SuffixData createWithLCP(CharSequence s, ISuffixArrayBuilder builder) { final CharSequenceAdapter adapter = new CharSequenceAdapter(builder); - final int [] sa = adapter.buildSuffixArray(s); - final int [] lcp = computeLCP(adapter.input, 0, s.length(), sa); + final int[] sa = adapter.buildSuffixArray(s); + final int[] lcp = computeLCP(adapter.input, 0, s.length(), sa); return new SuffixData(sa, lcp); } /** * Create a suffix array and an LCP array for a given input sequence of symbols. */ - public static SuffixData createWithLCP(int [] input, int start, int length) - { + public static SuffixData createWithLCP(int[] input, int start, int length) { final ISuffixArrayBuilder builder = new DensePositiveDecorator( new ExtraTrailingCellsDecorator(defaultAlgorithm(), 3)); return createWithLCP(input, start, length, builder); @@ -100,11 +91,10 @@ public static SuffixData createWithLCP(int [] input, int start, int length) * Create a suffix array and an LCP array for a given input sequence of symbols and a * custom suffix array building strategy. 
*/ - public static SuffixData createWithLCP(int [] input, int start, int length, - ISuffixArrayBuilder builder) - { - final int [] sa = builder.buildSuffixArray(input, start, length); - final int [] lcp = computeLCP(input, start, length, sa); + public static SuffixData createWithLCP(int[] input, int start, int length, + ISuffixArrayBuilder builder) { + final int[] sa = builder.buildSuffixArray(input, start, length); + final int[] lcp = computeLCP(input, start, length, sa); return new SuffixData(sa, lcp); } @@ -123,7 +113,7 @@ public static SuffixData createWithLCP(T[] input, ISuffixArrayBuilder builde /** * Calculate longest prefix (LCP) array for an existing suffix array and input. Index * i of the returned array indicates the length of the common prefix - * between suffix i and i-1. The 0-th + * between suffix i and i-1. The 0-th * index has a constant value of -1. *

    * The algorithm used to compute the LCP comes from @@ -131,27 +121,21 @@ public static SuffixData createWithLCP(T[] input, ISuffixArrayBuilder builde * computation in suffix arrays and its applications. In Proc. 12th Symposium on Combinatorial * Pattern Matching (CPM ’01), pages 181–192. Springer-Verlag LNCS n. 2089, 2001. */ - public static int [] computeLCP(int [] input, final int start, final int length, - int [] sa) - { - final int [] rank = new int [length]; + public static int[] computeLCP(int[] input, final int start, final int length, + int[] sa) { + final int[] rank = new int[length]; for (int i = 0; i < length; i++) rank[sa[i]] = i; int h = 0; - final int [] lcp = new int [length]; - for (int i = 0; i < length; i++) - { + final int[] lcp = new int[length]; + for (int i = 0; i < length; i++) { int k = rank[i]; - if (k == 0) - { + if (k == 0) { lcp[k] = -1; - } - else - { + } else { final int j = sa[k - 1]; while (i + h < length && j + h < length - && input[start + i + h] == input[start + j + h]) - { + && input[start + i + h] == input[start + j + h]) { h++; } lcp[k] = h; @@ -166,20 +150,17 @@ public static SuffixData createWithLCP(T[] input, ISuffixArrayBuilder builde * @return Return a new instance of the default algorithm for use in other methods. At * the moment {@link QSufSort} is used. */ - private static ISuffixArrayBuilder defaultAlgorithm() - { + private static ISuffixArrayBuilder defaultAlgorithm() { return new QSufSort(); } /** * Utility method converting all suffixes of a given sequence to a list of strings. 
*/ - public static List toString(CharSequence input, int [] suffixes) - { + public static List toString(CharSequence input, int[] suffixes) { final String full = input.toString(); - final ArrayList result = Lists.newArrayList(); - for (int i = 0; i < input.length(); i++) - { + final ArrayList result = new ArrayList<>(); + for (int i = 0; i < input.length(); i++) { result.add(full.subSequence(suffixes[i], full.length())); } return result; diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/SuffixData.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/SuffixData.java index 91de3df8b..6d44d1810 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/SuffixData.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/SuffixData.java @@ -2,29 +2,25 @@ /** * A holder structure for a suffix array and longest common prefix array of - * a given sequence. + * a given sequence. * * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) */ -public final class SuffixData -{ - private final int [] suffixArray; - private final int [] lcp; +public final class SuffixData { + private final int[] suffixArray; + private final int[] lcp; - SuffixData(int [] sa, int [] lcp) - { + SuffixData(int[] sa, int[] lcp) { this.suffixArray = sa; this.lcp = lcp; } - public int [] getSuffixArray() - { + public int[] getSuffixArray() { return suffixArray; } - public int [] getLCP() - { + public int[] getLCP() { return lcp; } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/Tools.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/Tools.java index fea26f726..00c135265 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/Tools.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/Tools.java @@ -6,10 +6,8 @@ * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) */ -final class 
Tools -{ - private Tools() - { +final class Tools { + private Tools() { // No instances. } @@ -17,12 +15,9 @@ private Tools() * Check if all symbols in the given range are greater than 0, return * true if so, false otherwise. */ - static final boolean allPositive(int [] input, int start, int length) - { - for (int i = length - 1, index = start; i >= 0; i--, index++) - { - if (input[index] <= 0) - { + static final boolean allPositive(int[] input, int start, int length) { + for (int i = length - 1, index = start; i >= 0; i--, index++) { + if (input[index] <= 0) { return false; } } @@ -33,16 +28,13 @@ static final boolean allPositive(int [] input, int start, int length) /** * Determine the maximum value in a slice of an array. */ - static final int max(int [] input, int start, int length) - { + static final int max(int[] input, int start, int length) { assert length >= 1; int max = input[start]; - for (int i = length - 2, index = start + 1; i >= 0; i--, index++) - { + for (int i = length - 2, index = start + 1; i >= 0; i--, index++) { final int v = input[index]; - if (v > max) - { + if (v > max) { max = v; } } @@ -53,16 +45,13 @@ static final int max(int [] input, int start, int length) /** * Determine the minimum value in a slice of an array. */ - static final int min(int [] input, int start, int length) - { + static final int min(int[] input, int start, int length) { assert length >= 1; int min = input[start]; - for (int i = length - 2, index = start + 1; i >= 0; i--, index++) - { + for (int i = length - 2, index = start + 1; i >= 0; i--, index++) { final int v = input[index]; - if (v < min) - { + if (v < min) { min = v; } } @@ -73,19 +62,15 @@ static final int min(int [] input, int start, int length) /** * Calculate minimum and maximum value for a slice of an array. 
*/ - static MinMax minmax(int [] input, final int start, final int length) - { + static MinMax minmax(int[] input, final int start, final int length) { int max = input[start]; int min = max; - for (int i = length - 2, index = start + 1; i >= 0; i--, index++) - { + for (int i = length - 2, index = start + 1; i >= 0; i--, index++) { final int v = input[index]; - if (v > max) - { + if (v > max) { max = v; } - if (v < min) - { + if (v < min) { min = v; } } @@ -96,14 +81,12 @@ static MinMax minmax(int [] input, final int start, final int length) /** * Throw {@link AssertionError} if a condition is false. This should * be called when the assertion must be always verified (as in the case of verifying - * the algorithm's preconditions). For other, internal assertions, one should use + * the algorithm's preconditions). For other, internal assertions, one should use * assert keyword so that such assertions can be disabled at run-time (for * performance reasons). */ - static final void assertAlways(boolean condition, String msg) - { - if (!condition) - { + static final void assertAlways(boolean condition, String msg) { + if (!condition) { throw new AssertionError(msg); } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/Traversals.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/Traversals.java index 1cdc99b19..6371431cb 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/Traversals.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixarray/Traversals.java @@ -10,20 +10,18 @@ * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) */ -public final class Traversals -{ +public final class Traversals { /** * Visitor interface for post-order traversal methods in {@link Traversals}. 
*/ - public interface IPostOrderVisitor - { + public interface IPostOrderVisitor { /** * Visits a node in the (virtual) suffix tree, labeled with length * objects starting at start in the input sequence. - * - * @param start The node label's starting offset in the input sequence. + * + * @param start The node label's starting offset in the input sequence. * @param length The node label's length (number of symbols). - * @param leaf true if this node is a leaf. + * @param leaf true if this node is a leaf. */ public void visitNode(int start, int length, boolean leaf); } @@ -32,8 +30,7 @@ public interface IPostOrderVisitor * Visitor interface for post-order traversal methods that compute an aggregated value * during the traversal. */ - public interface IPostOrderComputingVisitor - { + public interface IPostOrderComputingVisitor { /** * Aggregate two values into the result. The aggregation function should be * symmetric, that is: value1 + value2 = value2 + value1. @@ -42,10 +39,10 @@ public interface IPostOrderComputingVisitor /** * Compute the initial value for a leaf node. - * - * @param saIndex Index of the leaf node in the suffix array. + * + * @param saIndex Index of the leaf node in the suffix array. * @param symbolIndex The node label's starting offset in the input sequence. - * @param length The node label's length (number of symbols). + * @param length The node label's length (number of symbols). * @return Returns the initial function value for the leaf node. */ public E leafValue(int saIndex, int symbolIndex, int length); @@ -53,11 +50,11 @@ public interface IPostOrderComputingVisitor /** * Visits a node in the (virtual) suffix tree, labeled with length * objects starting at start in the input sequence. - * - * @param start The node label's starting offset in the input sequence. + * + * @param start The node label's starting offset in the input sequence. * @param length The node label's length (number of symbols). - * @param leaf true if this node is a leaf. 
- * @param value Aggregated value for all sub-nodes of the given node. + * @param leaf true if this node is a leaf. + * @param value Aggregated value for all sub-nodes of the given node. */ public void visitNode(int start, int length, boolean leaf, E value); } @@ -72,15 +69,14 @@ public interface IPostOrderComputingVisitor * The algorithm implemented here is from Efficient Substring Traversal with Suffix * Arrays by Toru Kasai, Hiroki Arimura and Setsuo Arikawa, Dept of Informatics, * Kyushu University, Japan. - * + * * @param sequenceLength Input sequence length for the suffix array and LCP array. - * @param sa Suffix array. - * @param lcp Corresponding LCP array for a given suffix array. - * @param visitor Callback visitor. + * @param sa Suffix array. + * @param lcp Corresponding LCP array for a given suffix array. + * @param visitor Callback visitor. */ - public static void postorder(final int sequenceLength, int [] sa, int [] lcp, - IPostOrderVisitor visitor) - { + public static void postorder(final int sequenceLength, int[] sa, int[] lcp, + IPostOrderVisitor visitor) { assert sequenceLength <= sa.length && sequenceLength <= lcp.length : "Input sequence length larger than suffix array or the LCP."; final Deque stack = new ArrayDeque(); @@ -91,12 +87,10 @@ public static void postorder(final int sequenceLength, int [] sa, int [] lcp, // Process every leaf. int top_h; - for (int i = 0; i <= sequenceLength; i++) - { + for (int i = 0; i <= sequenceLength; i++) { final int h = (sequenceLength == i ? -1 : lcp[i]); - while (true) - { + while (true) { top_h = stack.peek(); if (top_h <= h) break; stack.pop(); @@ -108,14 +102,12 @@ public static void postorder(final int sequenceLength, int [] sa, int [] lcp, visitor.visitNode(sa[leaf ? -(top_i + 1) : top_i], top_h, leaf); } - if (top_h < h) - { + if (top_h < h) { stack.push(i); stack.push(h); } - if (i < sequenceLength) - { + if (i < sequenceLength) { // Mark leaf nodes in the stack. 
stack.push(-(i + 1)); stack.push(sequenceLength - sa[i]); @@ -133,17 +125,16 @@ public static void postorder(final int sequenceLength, int [] sa, int [] lcp, * The algorithm implemented here is from Efficient Substring Traversal with Suffix * Arrays by Toru Kasai, Hiroki Arimura and Setsuo Arikawa, Dept of Informatics, * Kyushu University, Japan. - * + * * @param sequenceLength Input sequence length for the suffix array and LCP array. - * @param sa Suffix array. - * @param lcp Corresponding LCP array for a given suffix array. - * @param visitor Callback visitor computing aggregate values when traversing the - * tree. - * @param epsilon "Zero" value (epsilon) for computations. + * @param sa Suffix array. + * @param lcp Corresponding LCP array for a given suffix array. + * @param visitor Callback visitor computing aggregate values when traversing the + * tree. + * @param epsilon "Zero" value (epsilon) for computations. */ - public static void postorder(final int sequenceLength, int [] sa, int [] lcp, - E epsilon, IPostOrderComputingVisitor visitor) - { + public static void postorder(final int sequenceLength, int[] sa, int[] lcp, + E epsilon, IPostOrderComputingVisitor visitor) { assert sequenceLength <= sa.length && sequenceLength <= lcp.length : "Input sequence length larger than suffix array or the LCP."; final Deque stack = new ArrayDeque(); @@ -157,13 +148,11 @@ public static void postorder(final int sequenceLength, int [] sa, int [] lcp // Process every leaf. int top_h; E top_c; - for (int i = 0; i <= sequenceLength; i++) - { + for (int i = 0; i <= sequenceLength; i++) { final int h = (sequenceLength == i ? 
-1 : lcp[i]); E ci = epsilon; - while (true) - { + while (true) { top_h = stack.peek(); if (top_h <= h) break; stack.pop(); @@ -179,21 +168,17 @@ public static void postorder(final int sequenceLength, int [] sa, int [] lcp top_c = values.get(values.size() - 1); } - if (top_h < h) - { + if (top_h < h) { stack.push(i); stack.push(h); values.add(ci); - } - else - { + } else { assert top_h == h; final int index = values.size() - 1; values.set(index, visitor.aggregate(ci, values.get(index))); } - if (i < sequenceLength) - { + if (i < sequenceLength) { // Mark leaf nodes in the stack. stack.push(-(i + 1)); stack.push(sequenceLength - sa[i]); diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/ActivePoint.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/ActivePoint.java index 9c81f8f62..b9d8a67bd 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/ActivePoint.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/ActivePoint.java @@ -4,202 +4,196 @@ * Represents the Active Point used in Ukonnen's algorithm. This consists of the * triple active node, active edge and active length, which is used to identify * the point at which the next insertion should be considered. - * - * @author Max Garfinkel - * + * * @param + * @author Max Garfinkel */ -class ActivePoint> { - - private Node activeNode; - private Edge activeEdge; - private int activeLength; - private final Node root; - - /** - * Initialize the active point to the root of a suffix tree. This sets the - * active point to {root,null,0} - * - * @param root - */ - ActivePoint(Node root) { - activeNode = root; - activeEdge = null; - activeLength = 0; - this.root = root; - } - - /** - * Sets the active point to a new node, edge, length tripple. 
- * - * @param node - * @param edge - * @param length - */ - void setPosition(Node node, Edge edge, int length) { - activeNode = node; - activeEdge = edge; - activeLength = length; - } - - /** - * Sets the active edge. - * - * @param edge - * The edge to which we set the active edge. - */ - void setEdge(Edge edge) { - activeEdge = edge; - } - - /** - * Increments the active length. - */ - void incrementLength() { - activeLength++; - resetActivePointToTerminal(); - } - - /** - * Decrements the active length. - */ - void decrementLength() { - if (activeLength > 0) - activeLength--; - resetActivePointToTerminal(); - } - - /** - * - * @return True if the active point is the root node. False if not. - */ - boolean isRootNode() { - return activeNode.equals(root) && activeEdge == null - && activeLength == 0; - } - - /** - * - * @return True if active point is on a node. False if not. - */ - boolean isNode() { - return activeEdge == null && activeLength == 0; - } - - /** - * Retrieves the active node. - * - * @return The active node. - */ - Node getNode() { - return activeNode; - } - - /** - * - * @return True if the active point is on an edge. False if not. - */ - boolean isEdge() { - return activeEdge != null; - } - - /** - * Retrieves the current active edge. - * - * @return The active edge. - */ - Edge getEdge() { - return activeEdge; - } - - /** - * Retrieves the current active length. - * - * @return The active length. - */ - int getLength() { - return activeLength; - } - - /** - * Resets the active point after an insert. - * - * @param suffix - * The remaining suffix to be inserted. 
- */ - public void updateAfterInsert(Suffix suffix) { - if (activeNode == root && suffix.isEmpty()) { - activeNode = root; - activeEdge = null; - activeLength = 0; - } else if (activeNode == root) { - Object item = suffix.getStart(); - activeEdge = root.getEdgeStarting(item); - decrementLength(); - fixActiveEdgeAfterSuffixLink(suffix); - if (activeLength == 0) - activeEdge = null; - } else if (activeNode.hasSuffixLink()) { - activeNode = activeNode.getSuffixLink(); - findTrueActiveEdge(); - fixActiveEdgeAfterSuffixLink(suffix); - if (activeLength == 0) - activeEdge = null; - } else{ - activeNode = root; - findTrueActiveEdge(); - fixActiveEdgeAfterSuffixLink(suffix); - if (activeLength == 0) - activeEdge = null; - } - } - - /** - * Deal with the case when we follow a suffix link but the active length is - * greater than the new active edge length. In this situation we must walk - * down the tree updating the entire active point. - */ - private void fixActiveEdgeAfterSuffixLink(Suffix suffix) { - while (activeEdge != null && activeLength > activeEdge.getLength()) { - activeLength = activeLength - activeEdge.getLength(); - activeNode = activeEdge.getTerminal(); - Object item = suffix.getItemXFromEnd(activeLength + 1); - activeEdge = activeNode.getEdgeStarting(item); - } - resetActivePointToTerminal(); - } - - /** - * Finds the edge instance who's start item matches the current active edge - * start item but comes from the current active node. - */ - private void findTrueActiveEdge() { - if (activeEdge != null) { - Object item = activeEdge.getStartItem(); - activeEdge = activeNode.getEdgeStarting(item); - } - } - - /** - * Resizes the active length in the case where we are sitting on a terminal. - * - * @return true if reset occurs false otherwise. 
- */ - private boolean resetActivePointToTerminal() { - if (activeEdge != null && activeEdge.getLength() == activeLength - && activeEdge.isTerminating()) { - activeNode = activeEdge.getTerminal(); - activeEdge = null; - activeLength = 0; - return true; - } else { - return false; - } - } - - @Override - public String toString() { - return "{" + activeNode.toString() + ", " + activeEdge + ", " - + activeLength + "}"; - } +class ActivePoint> { + + private Node activeNode; + private Edge activeEdge; + private int activeLength; + private final Node root; + + /** + * Initialize the active point to the root of a suffix tree. This sets the + * active point to {root,null,0} + * + * @param root + */ + ActivePoint(Node root) { + activeNode = root; + activeEdge = null; + activeLength = 0; + this.root = root; + } + + /** + * Sets the active point to a new node, edge, length tripple. + * + * @param node + * @param edge + * @param length + */ + void setPosition(Node node, Edge edge, int length) { + activeNode = node; + activeEdge = edge; + activeLength = length; + } + + /** + * Sets the active edge. + * + * @param edge The edge to which we set the active edge. + */ + void setEdge(Edge edge) { + activeEdge = edge; + } + + /** + * Increments the active length. + */ + void incrementLength() { + activeLength++; + resetActivePointToTerminal(); + } + + /** + * Decrements the active length. + */ + void decrementLength() { + if (activeLength > 0) + activeLength--; + resetActivePointToTerminal(); + } + + /** + * @return True if the active point is the root node. False if not. + */ + boolean isRootNode() { + return activeNode.equals(root) && activeEdge == null + && activeLength == 0; + } + + /** + * @return True if active point is on a node. False if not. + */ + boolean isNode() { + return activeEdge == null && activeLength == 0; + } + + /** + * Retrieves the active node. + * + * @return The active node. 
+ */ + Node getNode() { + return activeNode; + } + + /** + * @return True if the active point is on an edge. False if not. + */ + boolean isEdge() { + return activeEdge != null; + } + + /** + * Retrieves the current active edge. + * + * @return The active edge. + */ + Edge getEdge() { + return activeEdge; + } + + /** + * Retrieves the current active length. + * + * @return The active length. + */ + int getLength() { + return activeLength; + } + + /** + * Resets the active point after an insert. + * + * @param suffix The remaining suffix to be inserted. + */ + public void updateAfterInsert(Suffix suffix) { + if (activeNode == root && suffix.isEmpty()) { + activeNode = root; + activeEdge = null; + activeLength = 0; + } else if (activeNode == root) { + Object item = suffix.getStart(); + activeEdge = root.getEdgeStarting(item); + decrementLength(); + fixActiveEdgeAfterSuffixLink(suffix); + if (activeLength == 0) + activeEdge = null; + } else if (activeNode.hasSuffixLink()) { + activeNode = activeNode.getSuffixLink(); + findTrueActiveEdge(); + fixActiveEdgeAfterSuffixLink(suffix); + if (activeLength == 0) + activeEdge = null; + } else { + activeNode = root; + findTrueActiveEdge(); + fixActiveEdgeAfterSuffixLink(suffix); + if (activeLength == 0) + activeEdge = null; + } + } + + /** + * Deal with the case when we follow a suffix link but the active length is + * greater than the new active edge length. In this situation we must walk + * down the tree updating the entire active point. 
+ */ + private void fixActiveEdgeAfterSuffixLink(Suffix suffix) { + while (activeEdge != null && activeLength > activeEdge.getLength()) { + activeLength = activeLength - activeEdge.getLength(); + activeNode = activeEdge.getTerminal(); + Object item = suffix.getItemXFromEnd(activeLength + 1); + activeEdge = activeNode.getEdgeStarting(item); + } + resetActivePointToTerminal(); + } + + /** + * Finds the edge instance who's start item matches the current active edge + * start item but comes from the current active node. + */ + private void findTrueActiveEdge() { + if (activeEdge != null) { + Object item = activeEdge.getStartItem(); + activeEdge = activeNode.getEdgeStarting(item); + } + } + + /** + * Resizes the active length in the case where we are sitting on a terminal. + * + * @return true if reset occurs false otherwise. + */ + private boolean resetActivePointToTerminal() { + if (activeEdge != null && activeEdge.getLength() == activeLength + && activeEdge.isTerminating()) { + activeNode = activeEdge.getTerminal(); + activeEdge = null; + activeLength = 0; + return true; + } else { + return false; + } + } + + @Override + public String toString() { + return "{" + activeNode.toString() + ", " + activeEdge + ", " + + activeLength + "}"; + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Cursor.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Cursor.java index 339ebef32..cffc929a8 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Cursor.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Cursor.java @@ -5,100 +5,98 @@ import java.util.HashSet; /** - * * @param * @param - * * @author Max Garfinkel */ -public class Cursor> { +public class Cursor> { + + private final SuffixTree tree; + private Node node; + private Edge edge; + private int length; + + + Cursor(SuffixTree tree) { + this.tree = tree; + node = tree.getRoot(); + edge = null; + length = 0; + } + + boolean 
proceedTo(T item) { + if (edge == null) { + Edge tmpEdge = node.getEdgeStarting(item); + if (tmpEdge != null) { + edge = tmpEdge; + length = 1; + return true; + } else { + return false; + } + } else if (edge.getLength() > length) { + T nextItem = edge.getItemAt(length); + if (nextItem != null && item.equals(nextItem)) { + length++; + return true; + } else { + return false; + } + } else { + Node terminal = edge.getTerminal(); + if (terminal == null) + return false; + else { + Edge tmpEdge = terminal.getEdgeStarting(item); + if (tmpEdge != null) { + edge = tmpEdge; + length = 1; + node = terminal; + return true; + } else { + return false; + } + } + } + } + + Collection> getSequenceTerminals() { + if (edge == null) { + return node.getSuffixTerminals(); + } else { + if ((edge.getLength() - 1 == length && !edge.isTerminating()) + || (edge.getItemAt(length).getClass().equals(SequenceTerminal.class)) && !edge.isTerminating()) { + Object seqTerminal = edge.getItemAt(length); + @SuppressWarnings("unchecked") + SequenceTerminal term = (SequenceTerminal) seqTerminal; + Collection> collection = new HashSet>(); + collection.add(term); + return collection; + } else { + Node terminal = edge.getTerminal(); + if (terminal == null) + return Collections.emptySet(); + else { + Collection> edges = terminal.getEdges(); + Collection> returnCollection = new HashSet>(); + for (Edge edge : edges) { + Object o = edge.getStartItem(); + if (o.getClass().equals(SequenceTerminal.class)) { + @SuppressWarnings("unchecked") + SequenceTerminal returnTerminal = (SequenceTerminal) o; + returnCollection.add(returnTerminal); + } + } + return returnCollection; + } + } + } + } + + void returnToRoot() { + node = tree.getRoot(); + edge = null; + length = 0; + } - private final SuffixTree tree; - private Node node; - private Edge edge; - private int length; - - - Cursor(SuffixTree tree){ - this.tree = tree; - node = tree.getRoot(); - edge = null; - length = 0; - } - - boolean proceedTo(T item){ - if(edge == 
null){ - Edge tmpEdge = node.getEdgeStarting(item); - if(tmpEdge != null){ - edge = tmpEdge; - length = 1; - return true; - }else{ - return false; - } - }else if(edge.getLength() > length){ - T nextItem = edge.getItemAt(length); - if(nextItem != null && item.equals(nextItem)){ - length++; - return true; - }else{ - return false; - } - }else{ - Node terminal = edge.getTerminal(); - if(terminal == null) - return false; - else{ - Edge tmpEdge = terminal.getEdgeStarting(item); - if(tmpEdge != null){ - edge = tmpEdge; - length = 1; - node = terminal; - return true; - }else{ - return false; - } - } - } - } - - Collection> getSequenceTerminals(){ - if(edge == null){ - return node.getSuffixTerminals(); - }else{ - if((edge.getLength()-1 == length && !edge.isTerminating()) - || (edge.getItemAt(length).getClass().equals(SequenceTerminal.class)) && !edge.isTerminating()){ - Object seqTerminal = edge.getItemAt(length); - @SuppressWarnings("unchecked") - SequenceTerminal term = (SequenceTerminal)seqTerminal; - Collection> collection = new HashSet>(); - collection.add(term); - return collection; - }else{ - Node terminal = edge.getTerminal(); - if(terminal == null) - return Collections.emptySet(); - else{ - Collection> edges = terminal.getEdges(); - Collection> returnCollection = new HashSet>(); - for(Edge edge : edges){ - Object o = edge.getStartItem(); - if(o.getClass().equals(SequenceTerminal.class)){ - @SuppressWarnings("unchecked") - SequenceTerminal returnTerminal = (SequenceTerminal)o; - returnCollection.add(returnTerminal); - } - } - return returnCollection; - } - } - } - } - - void returnToRoot(){ - node = tree.getRoot(); - edge = null; - length = 0; - } - } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Edge.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Edge.java index cad27f976..bbb5ba9c9 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Edge.java +++ 
b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Edge.java @@ -3,211 +3,198 @@ import java.util.Iterator; /** - * * @param * @param - * * @author Max Garfinkel */ class Edge> implements Iterable { - private final int start; - private int end = -1; - private final Node parentNode; - private final Sequence sequence; - - private Node terminal = null; - private SuffixTree tree = null; - - /** - * Create a new Edge object. - * - * @param start - * The position in the master sequence of the first item in this - * suffix. - * @param parent - * The parent {@link Node} - * @param sequence - * The master sequence which the {@link SuffixTree} indexes. - * @param tree - * The master {@link SuffixTree} containing the root element - * which this edge is a child of. - */ - Edge(int start, Node parent, Sequence sequence, SuffixTree tree) { - this.start = start; - this.parentNode = parent; - this.sequence = sequence; - this.tree = tree; - } - - /** - * Checks to see if the edge starts with the given item. - * - * @param item - * The possible start item. - * @return True if this edge starts with item. False if not. - */ - boolean isStarting(Object item) { - return sequence.getItem(start).equals(item); - } - - /** - * Insert the given suffix at the supplied active point. - * - * @param suffix - * The suffix to insert. - * @param activePoint - * The active point to insert it at. - * @return - */ - void insert(Suffix suffix, ActivePoint activePoint) { - Object item = suffix.getEndItem(); - Object nextItem = getItemAt(activePoint.getLength()); - if (item.equals(nextItem)) { - activePoint.incrementLength(); - } else { - split(suffix, activePoint); - suffix.decrement(); - activePoint.updateAfterInsert(suffix); - - if (suffix.isEmpty()) - return; - else - tree.insert(suffix); - } - } - - /** - * Splits the edge to enable the insertion of supplied suffix at the - * supplied active point. - * - * @param suffix - * The suffix to insert. 
- * @param activePoint - * The active point to insert it at. - */ - private void split(Suffix suffix, ActivePoint activePoint) { - Node breakNode = new Node(this, sequence, tree); - Edge newEdge = new Edge(suffix.getEndPosition()-1, breakNode, - sequence, tree); - breakNode.insert(newEdge); - Edge oldEdge = new Edge(start + activePoint.getLength(), - breakNode, sequence, tree); - oldEdge.end = end; - oldEdge.terminal = this.terminal; - breakNode.insert(oldEdge); - this.terminal = breakNode; - end = start + activePoint.getLength(); - tree.setSuffixLink(breakNode); - tree.incrementInsertCount(); - } - - /** - * Gets the index of the true end of the edge. - * - * @return The index of the end item, of this edge, in the original - * sequence. - */ - int getEnd() { - tree.getCurrentEnd(); - return end != -1 ? end : tree.getCurrentEnd(); - } - - /** - * Tests if this edge is terminates at a node. - * - * @return True if this edge ends at a node. False if not. - */ - boolean isTerminating() { - return terminal != null; - } - - /** - * Retrieves the length of this edge. - * - * @return - */ - int getLength() { - int realEnd = getEnd(); - return realEnd - start; - } - - /** - * Retrieves the terminating node of this edge if it has any, null if not. - * - * @return The terminating node if any exists, null otherwise. - */ - Node getTerminal() { - return terminal; - } - - /** - * Retrieves the item at given position within the current edge. - * - * @param position - * The index of the item to retrieve relative to the start of - * edge. - * @return The item at position. - * @throws IllegalArgumentException - * when the position exceeds the length of the current edge. 
- */ - @SuppressWarnings("unchecked") - T getItemAt(int position) { - if (position > getLength()) - throw new IllegalArgumentException("Index " + position - + " is greater than " + getLength() - + " - the length of this edge."); - return (T) sequence.getItem(start + position); - } - - /** - * Retrieves the starting item of this edge. - * - * @return The item at index 0 of this edge. - */ - @SuppressWarnings("unchecked") - T getStartItem() { - return (T) sequence.getItem(start); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - for (int i = start; i < getEnd(); i++) { - sb.append(sequence.getItem(i).toString()).append(", "); - if(sequence.getItem(i).getClass().equals(SequenceTerminal.class)) - break; - } - return sb.toString(); - } - - /** - * Retrieves an iterator that steps over the items in this edge. - * - * @return An iterator that walks this edge up to the end or terminating - * node. - */ - public Iterator iterator() { - return new Iterator() { - private int currentPosition = start; - private boolean hasNext = true; - - public boolean hasNext() { - return hasNext; - } - - @SuppressWarnings("unchecked") - public T next() { - if(end == -1) - hasNext = !sequence.getItem(currentPosition).getClass().equals(SequenceTerminal.class); - else - hasNext = currentPosition < getEnd()-1; - return (T) sequence.getItem(currentPosition++); - } - - public void remove() { - throw new UnsupportedOperationException( - "The remove method is not supported."); - } - }; - } + private final int start; + private int end = -1; + private final Node parentNode; + private final Sequence sequence; + + private Node terminal = null; + private SuffixTree tree = null; + + /** + * Create a new Edge object. + * + * @param start The position in the master sequence of the first item in this + * suffix. + * @param parent The parent {@link Node} + * @param sequence The master sequence which the {@link SuffixTree} indexes. 
+ * @param tree The master {@link SuffixTree} containing the root element + * which this edge is a child of. + */ + Edge(int start, Node parent, Sequence sequence, SuffixTree tree) { + this.start = start; + this.parentNode = parent; + this.sequence = sequence; + this.tree = tree; + } + + /** + * Checks to see if the edge starts with the given item. + * + * @param item The possible start item. + * @return True if this edge starts with item. False if not. + */ + boolean isStarting(Object item) { + return sequence.getItem(start).equals(item); + } + + /** + * Insert the given suffix at the supplied active point. + * + * @param suffix The suffix to insert. + * @param activePoint The active point to insert it at. + * @return + */ + void insert(Suffix suffix, ActivePoint activePoint) { + Object item = suffix.getEndItem(); + Object nextItem = getItemAt(activePoint.getLength()); + if (item.equals(nextItem)) { + activePoint.incrementLength(); + } else { + split(suffix, activePoint); + suffix.decrement(); + activePoint.updateAfterInsert(suffix); + + if (suffix.isEmpty()) + return; + else + tree.insert(suffix); + } + } + + /** + * Splits the edge to enable the insertion of supplied suffix at the + * supplied active point. + * + * @param suffix The suffix to insert. + * @param activePoint The active point to insert it at. + */ + private void split(Suffix suffix, ActivePoint activePoint) { + Node breakNode = new Node(this, sequence, tree); + Edge newEdge = new Edge(suffix.getEndPosition() - 1, breakNode, + sequence, tree); + breakNode.insert(newEdge); + Edge oldEdge = new Edge(start + activePoint.getLength(), + breakNode, sequence, tree); + oldEdge.end = end; + oldEdge.terminal = this.terminal; + breakNode.insert(oldEdge); + this.terminal = breakNode; + end = start + activePoint.getLength(); + tree.setSuffixLink(breakNode); + tree.incrementInsertCount(); + } + + /** + * Gets the index of the true end of the edge. 
+ * + * @return The index of the end item, of this edge, in the original + * sequence. + */ + int getEnd() { + tree.getCurrentEnd(); + return end != -1 ? end : tree.getCurrentEnd(); + } + + /** + * Tests if this edge is terminates at a node. + * + * @return True if this edge ends at a node. False if not. + */ + boolean isTerminating() { + return terminal != null; + } + + /** + * Retrieves the length of this edge. + * + * @return + */ + int getLength() { + int realEnd = getEnd(); + return realEnd - start; + } + + /** + * Retrieves the terminating node of this edge if it has any, null if not. + * + * @return The terminating node if any exists, null otherwise. + */ + Node getTerminal() { + return terminal; + } + + /** + * Retrieves the item at given position within the current edge. + * + * @param position The index of the item to retrieve relative to the start of + * edge. + * @return The item at position. + * @throws IllegalArgumentException when the position exceeds the length of the current edge. + */ + @SuppressWarnings("unchecked") + T getItemAt(int position) { + if (position > getLength()) + throw new IllegalArgumentException("Index " + position + + " is greater than " + getLength() + + " - the length of this edge."); + return (T) sequence.getItem(start + position); + } + + /** + * Retrieves the starting item of this edge. + * + * @return The item at index 0 of this edge. + */ + @SuppressWarnings("unchecked") + T getStartItem() { + return (T) sequence.getItem(start); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + for (int i = start; i < getEnd(); i++) { + sb.append(sequence.getItem(i).toString()).append(", "); + if (sequence.getItem(i).getClass().equals(SequenceTerminal.class)) + break; + } + return sb.toString(); + } + + /** + * Retrieves an iterator that steps over the items in this edge. + * + * @return An iterator that walks this edge up to the end or terminating + * node. 
+ */ + public Iterator iterator() { + return new Iterator() { + private int currentPosition = start; + private boolean hasNext = true; + + public boolean hasNext() { + return hasNext; + } + + @SuppressWarnings("unchecked") + public T next() { + if (end == -1) + hasNext = !sequence.getItem(currentPosition).getClass().equals(SequenceTerminal.class); + else + hasNext = currentPosition < getEnd() - 1; + return (T) sequence.getItem(currentPosition++); + } + + public void remove() { + throw new UnsupportedOperationException( + "The remove method is not supported."); + } + }; + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Node.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Node.java index f39409a86..6618a0acb 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Node.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Node.java @@ -8,165 +8,154 @@ import java.util.Set; /** - * * @param * @param - * * @author Max Garfinkel */ -class Node> implements Iterable> { - private final Map> edges = new HashMap>(); - private final Edge incomingEdge; - private Set> sequenceTerminals = new HashSet>(); - private final Sequence sequence; - private final SuffixTree tree; - private Node link = null; - - /** - * Create a new node, for the supplied tree and sequence. - * - * @param incomingEdge - * The parent edge, unless this is a root node. - * @param sequence - * The sequence this tree is indexing. - * @param tree - * The tree to which this node belongs. - */ - Node(Edge incomingEdge, Sequence sequence, SuffixTree tree) { - this.incomingEdge = incomingEdge; - this.sequence = sequence; - this.tree = tree; - } - - /** - * Inserts the suffix at the given active point. - * - * @param suffix - * The suffix to insert. - * @param activePoint - * The active point to insert it at. 
- */ - @SuppressWarnings("unchecked") - void insert(Suffix suffix, ActivePoint activePoint) { - Object item = suffix.getEndItem(); - - if (edges.containsKey(item)) { - if (tree.isNotFirstInsert() && activePoint.getNode() != tree.getRoot()) - tree.setSuffixLink(activePoint.getNode()); - activePoint.setEdge(edges.get(item)); - activePoint.incrementLength(); - } else { - saveSequenceTerminal(item); - Edge newEdge = new Edge(suffix.getEndPosition()-1, this, - sequence, tree); - edges.put((T) suffix.getEndItem(), newEdge); - suffix.decrement(); - activePoint.updateAfterInsert(suffix); - - if(tree.isNotFirstInsert() && !this.equals(tree.getRoot())){ - tree.getLastNodeInserted().setSuffixLink(this); - } - if (suffix.isEmpty()) - return; - else - tree.insert(suffix); - } - } - - private void saveSequenceTerminal(Object item) { - if(item.getClass().equals(SequenceTerminal.class)){ - @SuppressWarnings("unchecked") - SequenceTerminal terminal = (SequenceTerminal) item; - sequenceTerminals.add(terminal); - } - } - - /** - * Inserts the given edge as a child of this node. The edge must not already - * exist as child or an IllegalArgumentException will be thrown. - * - * @param edge - * The edge to be inserted. - * @throws IllegalArgumentException - * This is thrown when the edge already exists as an out bound - * edge of this node. - */ - void insert(Edge edge) { - if (edges.containsKey(edge.getStartItem())) - throw new IllegalArgumentException("Item " + edge.getStartItem() - + " already exists in node " + toString()); - edges.put(edge.getStartItem(), edge); - } - - /** - * Retrieves the edge starting with item or null if none exists. - * - * @param item - * @return The edge extending from this node starting with item. - */ - Edge getEdgeStarting(Object item) { - return edges.get(item); - } - - /** - * True if the node has a suffix link extending from it. - * - * @return True if node has suffix link. False if not. 
- */ - boolean hasSuffixLink() { - return link != null; - } - - /** - * Gets the number of edges extending from this node. - * - * @return The count of the number edges extending from this node. - */ - int getEdgeCount() { - return edges.size(); - } - - /** - * @return An iterator which iterates over the child edges. No order is - * guaranteed. - */ - public Iterator> iterator() { - return edges.values().iterator(); - } - - /** - * - * @return The node that this nodes suffix link points to if it has one. - * Null if not. - */ - Node getSuffixLink() { - return link; - } - - /** - * Sets the suffix link of this node to point to the supplied node. - * - * @param node - * The node this suffix link should point to. - */ - void setSuffixLink(Node node) { - link = node; - } - - @Override - public String toString() { - if (incomingEdge == null) - return "root"; - else { - return "end of edge [" + incomingEdge.toString() + "]"; - } - } - - public Collection> getSuffixTerminals() { - return sequenceTerminals; - } - - public Collection> getEdges(){ - return edges.values(); - } +class Node> implements Iterable> { + private final Map> edges = new HashMap>(); + private final Edge incomingEdge; + private Set> sequenceTerminals = new HashSet>(); + private final Sequence sequence; + private final SuffixTree tree; + private Node link = null; + + /** + * Create a new node, for the supplied tree and sequence. + * + * @param incomingEdge The parent edge, unless this is a root node. + * @param sequence The sequence this tree is indexing. + * @param tree The tree to which this node belongs. + */ + Node(Edge incomingEdge, Sequence sequence, SuffixTree tree) { + this.incomingEdge = incomingEdge; + this.sequence = sequence; + this.tree = tree; + } + + /** + * Inserts the suffix at the given active point. + * + * @param suffix The suffix to insert. + * @param activePoint The active point to insert it at. 
+ */ + @SuppressWarnings("unchecked") + void insert(Suffix suffix, ActivePoint activePoint) { + Object item = suffix.getEndItem(); + + if (edges.containsKey(item)) { + if (tree.isNotFirstInsert() && activePoint.getNode() != tree.getRoot()) + tree.setSuffixLink(activePoint.getNode()); + activePoint.setEdge(edges.get(item)); + activePoint.incrementLength(); + } else { + saveSequenceTerminal(item); + Edge newEdge = new Edge(suffix.getEndPosition() - 1, this, + sequence, tree); + edges.put((T) suffix.getEndItem(), newEdge); + suffix.decrement(); + activePoint.updateAfterInsert(suffix); + + if (tree.isNotFirstInsert() && !this.equals(tree.getRoot())) { + tree.getLastNodeInserted().setSuffixLink(this); + } + if (suffix.isEmpty()) + return; + else + tree.insert(suffix); + } + } + + private void saveSequenceTerminal(Object item) { + if (item.getClass().equals(SequenceTerminal.class)) { + @SuppressWarnings("unchecked") + SequenceTerminal terminal = (SequenceTerminal) item; + sequenceTerminals.add(terminal); + } + } + + /** + * Inserts the given edge as a child of this node. The edge must not already + * exist as child or an IllegalArgumentException will be thrown. + * + * @param edge The edge to be inserted. + * @throws IllegalArgumentException This is thrown when the edge already exists as an out bound + * edge of this node. + */ + void insert(Edge edge) { + if (edges.containsKey(edge.getStartItem())) + throw new IllegalArgumentException("Item " + edge.getStartItem() + + " already exists in node " + toString()); + edges.put(edge.getStartItem(), edge); + } + + /** + * Retrieves the edge starting with item or null if none exists. + * + * @param item + * @return The edge extending from this node starting with item. + */ + Edge getEdgeStarting(Object item) { + return edges.get(item); + } + + /** + * True if the node has a suffix link extending from it. + * + * @return True if node has suffix link. False if not. 
+ */ + boolean hasSuffixLink() { + return link != null; + } + + /** + * Gets the number of edges extending from this node. + * + * @return The count of the number edges extending from this node. + */ + int getEdgeCount() { + return edges.size(); + } + + /** + * @return An iterator which iterates over the child edges. No order is + * guaranteed. + */ + public Iterator> iterator() { + return edges.values().iterator(); + } + + /** + * @return The node that this nodes suffix link points to if it has one. + * Null if not. + */ + Node getSuffixLink() { + return link; + } + + /** + * Sets the suffix link of this node to point to the supplied node. + * + * @param node The node this suffix link should point to. + */ + void setSuffixLink(Node node) { + link = node; + } + + @Override + public String toString() { + if (incomingEdge == null) + return "root"; + else { + return "end of edge [" + incomingEdge.toString() + "]"; + } + } + + public Collection> getSuffixTerminals() { + return sequenceTerminals; + } + + public Collection> getEdges() { + return edges.values(); + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Sequence.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Sequence.java index f464c54d4..1ab0eff67 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Sequence.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Sequence.java @@ -10,92 +10,91 @@ * generic suffix tree implementation. This object automatically appends a * terminating item to the end of the instance which is included in all * operations. - * + * * @author Max Garfinkel - * - * @param */ public class Sequence> implements Iterable { - private List masterSequence = new ArrayList(); - - Sequence(){ - } - - /** - * Initialize the sequence. 
- * - * @param sequence - */ - Sequence(S sequence) { - for(Object item : sequence) - masterSequence.add(item); - SequenceTerminal sequenceTerminal = new SequenceTerminal(sequence); - masterSequence.add(sequenceTerminal); - } - - /** - * Retrieve the item at the position specified by index. - * - * @param index - * @return - */ - Object getItem(int index) { - return masterSequence.get(index); - } - - /** - * Adds a Sequence to the suffix tree. - * @param sequence - */ - void add(S sequence){ - for(I item : sequence){ - masterSequence.add(item); - } - SequenceTerminal terminal = new SequenceTerminal(sequence); - masterSequence.add(terminal); - } - - /** - * Retrieves an iterator for the sequence. - */ - public Iterator iterator() { - return new Iterator() { - - int currentPosition = 0; - - public boolean hasNext() { - return masterSequence.size() > currentPosition; - } - - public Object next() { - if (currentPosition <= masterSequence.size()) - return masterSequence.get(currentPosition++); - else { - return null; - } - } - - public void remove() { - throw new UnsupportedOperationException( - "Remove is not supported."); - - } - - }; - } - - int getLength(){ - return masterSequence.size(); - } - - public String toString(){ - StringBuilder sb = new StringBuilder("Sequence = ["); - for(Object i : masterSequence){ - sb.append(i).append(", "); - } - sb.append("]"); - return sb.toString(); - } - + private List masterSequence = new ArrayList(); + + Sequence() { + } + + /** + * Initialize the sequence. + * + * @param sequence + */ + Sequence(S sequence) { + for (Object item : sequence) + masterSequence.add(item); + SequenceTerminal sequenceTerminal = new SequenceTerminal(sequence); + masterSequence.add(sequenceTerminal); + } + + /** + * Retrieve the item at the position specified by index. + * + * @param index + * @return + */ + Object getItem(int index) { + return masterSequence.get(index); + } + + /** + * Adds a Sequence to the suffix tree. 
+ * + * @param sequence + */ + void add(S sequence) { + for (I item : sequence) { + masterSequence.add(item); + } + SequenceTerminal terminal = new SequenceTerminal(sequence); + masterSequence.add(terminal); + } + + /** + * Retrieves an iterator for the sequence. + */ + public Iterator iterator() { + return new Iterator() { + + int currentPosition = 0; + + public boolean hasNext() { + return masterSequence.size() > currentPosition; + } + + public Object next() { + if (currentPosition <= masterSequence.size()) + return masterSequence.get(currentPosition++); + else { + return null; + } + } + + public void remove() { + throw new UnsupportedOperationException( + "Remove is not supported."); + + } + + }; + } + + int getLength() { + return masterSequence.size(); + } + + public String toString() { + StringBuilder sb = new StringBuilder("Sequence = ["); + for (Object i : masterSequence) { + sb.append(i).append(", "); + } + sb.append("]"); + return sb.toString(); + } + } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/SequenceTerminal.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/SequenceTerminal.java index 69856a42b..1cd37536a 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/SequenceTerminal.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/SequenceTerminal.java @@ -2,37 +2,36 @@ /** * Represents the terminating item of a sequence. 
- * + * * @author Max Garfinkel - * */ class SequenceTerminal { - private final S sequence; - - SequenceTerminal(S sequence){ - this.sequence = sequence; - } - - @SuppressWarnings("unchecked") - @Override - public boolean equals(Object o) { - if(o == null || o.getClass() != this.getClass()) - return false; - return ((SequenceTerminal)o).sequence.equals(this.sequence); - } - - public int hashCode(){ - return sequence.hashCode(); - } - - @Override - public String toString() { - return "$"+sequence.toString()+"$"; - } - - public S getSequence(){ - return sequence; - } + private final S sequence; + + SequenceTerminal(S sequence) { + this.sequence = sequence; + } + + @SuppressWarnings("unchecked") + @Override + public boolean equals(Object o) { + if (o == null || o.getClass() != this.getClass()) + return false; + return ((SequenceTerminal) o).sequence.equals(this.sequence); + } + + public int hashCode() { + return sequence.hashCode(); + } + + @Override + public String toString() { + return "$" + sequence.toString() + "$"; + } + + public S getSequence() { + return sequence; + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Suffix.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Suffix.java index 4da0441e3..f80a2f0e6 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Suffix.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Suffix.java @@ -2,156 +2,153 @@ /** * Represents the remaining suffix to be inserted during suffix tree - * construction. This is essentially a start and end pointer into the + * construction. This is essentially a start and end pointer into the * underlying sequence. This is like a kind of sliding window where the head * can never fall behind the tail, and the tail can never fall behind the head. 
- * - * @author Max Garfinkel - * + * * @param + * @author Max Garfinkel */ -class Suffix> { - private int start; - private int end; - private Sequence sequence; +class Suffix> { + private int start; + private int end; + private Sequence sequence; + + /** + * Construct a subsequence of sequence. The subsequence will be a suffix of + * the sequence UP TO the point in the sequence we have reached whilst + * running Ukonnen's algorithm. In this sense it is not a true suffix of the + * sequence but only a suffix of the portion of the sequence we have so far + * parsed. + * + * @param start The start position of the suffix within the sequence + * @param end The end position of the suffix within the sequence + * @param sequence The main sequence + */ + public Suffix(int start, int end, Sequence sequence) { + testStartAndEndValues(start, end); + testStartEndAgainstSequenceLength(start, end, sequence.getLength()); + this.start = start; + this.end = end; + this.sequence = sequence; + } + + private void testStartEndAgainstSequenceLength(int start, int end, int sequenceLength) { + if (start > sequenceLength || end > sequenceLength) + throw new IllegalArgumentException("Suffix start and end must be less than or equal to sequence length"); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("[("); + sb.append(start).append(", ").append(end).append(")"); + int end = getEndPosition(); + for (int i = start; i < end; i++) { + sb.append(sequence.getItem(i)).append(","); + } + sb.append("]"); + return sb.toString(); + } + + /** + * @return The position in the master sequence of the end item in this + * suffix. This value is inclusive, thus and end of 0 implies the + * suffix contains only the item at sequence[0] + */ + int getEndPosition() { + return end; + } - /** - * Construct a subsequence of sequence. The subsequence will be a suffix of - * the sequence UP TO the point in the sequence we have reached whilst - * running Ukonnen's algorithm. 
In this sense it is not a true suffix of the - * sequence but only a suffix of the portion of the sequence we have so far - * parsed. - * @param start The start position of the suffix within the sequence - * @param end The end position of the suffix within the sequence - * @param sequence The main sequence - */ - public Suffix(int start, int end, Sequence sequence) { - testStartAndEndValues(start, end); - testStartEndAgainstSequenceLength(start, end, sequence.getLength()); - this.start = start; - this.end = end; - this.sequence = sequence; - } - - private void testStartEndAgainstSequenceLength(int start, int end, int sequenceLength){ - if(start > sequenceLength || end > sequenceLength) - throw new IllegalArgumentException("Suffix start and end must be less than or equal to sequence length"); - } + /** + * Get the end item of this suffix. + * + * @return The end item of sequence + */ + Object getEndItem() { + if (isEmpty()) + return null; + return sequence.getItem(end - 1); + } - @Override - public String toString() { - StringBuilder sb = new StringBuilder("[("); - sb.append(start).append(", ").append(end).append(")"); - int end = getEndPosition(); - for (int i = start; i < end; i++) { - sb.append(sequence.getItem(i)).append(","); - } - sb.append("]"); - return sb.toString(); - } + /** + * Get the start of this suffix. + * + * @return + */ + Object getStart() { + if (isEmpty()) + return null; + return sequence.getItem(start); + } - /** - * - * @return The position in the master sequence of the end item in this - * suffix. This value is inclusive, thus and end of 0 implies the - * suffix contains only the item at sequence[0] - */ - int getEndPosition() { - return end; - } + /** + * Decrement the length of this suffix. This is done by incrementing the + * start position. This is reducing its length from the back. + */ + void decrement() { + if (start == end) + increment(); + start++; + } - /** - * Get the end item of this suffix. 
- * - * @return The end item of sequence - */ - Object getEndItem() { - if(isEmpty()) - return null; - return sequence.getItem(end-1); - } + /** + * Increments the length of the suffix by incrementing the end position. The + * effectivly moves the suffix forward, along the master sequence. + */ + void increment() { + end++; + if (end > sequence.getLength()) + throw new IndexOutOfBoundsException("Incremented suffix beyond end of sequence"); - /** - * Get the start of this suffix. - * - * @return - */ - Object getStart() { - if(isEmpty()) - return null; - return sequence.getItem(start); - } + } - /** - * Decrement the length of this suffix. This is done by incrementing the - * start position. This is reducing its length from the back. - */ - void decrement() { - if(start==end) - increment(); - start++; - } + /** + * Indicates if the suffix is empty. + * + * @return + */ + boolean isEmpty() { + return start >= end || end > sequence.getLength(); + } - /** - * Increments the length of the suffix by incrementing the end position. The - * effectivly moves the suffix forward, along the master sequence. - */ - void increment() { - end++; - if(end > sequence.getLength()) - throw new IndexOutOfBoundsException("Incremented suffix beyond end of sequence"); - - } + /** + * Retrieves the count of remaining items in the suffix. + * + * @return The number of items in the suffix. + */ + int getRemaining() { + if (isEmpty()) + return 0; + else + return (end - start); + } - /** - * Indicates if the suffix is empty. - * - * @return - */ - boolean isEmpty() { - return start >= end || end > sequence.getLength(); - } + /** + * Retrieves the item the given distance from the end of the suffix. + * + * @param distanceFromEnd The distance from the end. + * @return The item the given distance from the end. + * @throws IllegalArgumentException if the distance from end is greater than the length of the + * suffix. 
+ */ + public Object getItemXFromEnd(int distanceFromEnd) { + if ((end - (distanceFromEnd)) < start) { + throw new IllegalArgumentException(distanceFromEnd + + " extends before the start of this suffix: "); + } + return sequence.getItem(end - distanceFromEnd); + } - /** - * Retrieves the count of remaining items in the suffix. - * - * @return The number of items in the suffix. - */ - int getRemaining() { - if(isEmpty()) - return 0; - else - return (end - start); - } + void reset(int start, int end) { + testStartAndEndValues(start, end); + this.start = start; + this.end = end; + } - /** - * Retrieves the item the given distance from the end of the suffix. - * - * @param distanceFromEnd - * The distance from the end. - * @return The item the given distance from the end. - * @throws IllegalArgumentException - * if the distance from end is greater than the length of the - * suffix. - */ - public Object getItemXFromEnd(int distanceFromEnd) { - if ((end - (distanceFromEnd)) < start){ - throw new IllegalArgumentException(distanceFromEnd - + " extends before the start of this suffix: "); - } - return sequence.getItem(end - distanceFromEnd); - } - - void reset(int start, int end){ - testStartAndEndValues(start, end); - this.start = start; - this.end = end; - } - - private void testStartAndEndValues(int start, int end){ - if(start < 0 || end < 0) - throw new IllegalArgumentException("You cannot set a suffix start or end to less than zero."); - if(end < start) - throw new IllegalArgumentException("A suffix end position cannot be less than its start position."); - } + private void testStartAndEndValues(int start, int end) { + if (start < 0 || end < 0) + throw new IllegalArgumentException("You cannot set a suffix start or end to less than zero."); + if (end < start) + throw new IllegalArgumentException("A suffix end position cannot be less than its start position."); + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/SuffixTree.java 
b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/SuffixTree.java index ab237da00..bd3c451a5 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/SuffixTree.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/SuffixTree.java @@ -2,171 +2,164 @@ /** * A suffix tree implementation using Ukkonen's algorithm capable of generating a generialised suffix tree. - * - * The type of both character and the word can be specified, and we call these items + *

    + * The type of both character and the word can be specified, and we call these items * and sequences respectively. - * + * + * @param The type of the item within the sequence. + * @param The sequence type, which must iterate over items of type I * @author Max Garfinkel - * - * @param - * The type of the item within the sequence. - * @param - * The sequence type, which must iterate over items of type I */ -public class SuffixTree> { - - private final Node root; - private final Sequence sequence; - - private Suffix suffix; - private final ActivePoint activePoint; - private int currentEnd = 0; - private int insertsThisStep = 0; - private Node lastNodeInserted = null; - - /** - * Constructs an empty suffix tree. - */ - public SuffixTree(){ - sequence = new Sequence(); - root = new Node(null, this.sequence, this); - activePoint = new ActivePoint(root); - } - - /** - * Construct and represent a suffix tree representation of the given - * sequence using Ukkonen's algorithm. - * - * @param sequenceArray - * the array of items for which we are going to generate a suffix - * tree. - * @throws Exception - */ - public SuffixTree(S sequenceArray) { - sequence = new Sequence(sequenceArray); - root = new Node(null, this.sequence, this); - activePoint = new ActivePoint(root); - suffix = new Suffix(0, 0, this.sequence); - extendTree(0,sequence.getLength()); - } - - /** - * Add a sequence to the suffix tree. It is immediately processed - * and added to the tree. - * @param sequence A sequence to be added. - */ - public void add(S sequence){ - int start = currentEnd; - this.sequence.add(sequence); - suffix = new Suffix(currentEnd,currentEnd,this.sequence); - activePoint.setPosition(root, null, 0); - extendTree(start, this.sequence.getLength()); - } - - private void extendTree(int from, int to) { - for (int i = from; i < to; i++){ - suffix.increment(); - insertsThisStep = 0; - insert(suffix); - currentEnd++; - } - } - - - /** - * Inserts the given suffix into this tree. 
- * - * @param suffix - * The suffix to insert. - */ - void insert(Suffix suffix) { - if (activePoint.isNode()) { - Node node = activePoint.getNode(); - node.insert(suffix, activePoint); - } else if (activePoint.isEdge()) { - Edge edge = activePoint.getEdge(); - edge.insert(suffix, activePoint); - } - } - - /** - * Retrieves the point in the sequence for which all proceeding item have - * been inserted into the tree. - * - * @return The index of the current end point of tree. - */ - int getCurrentEnd() { - return currentEnd; - } - - /** - * Retrieves the root node for this tree. - * - * @return The root node of the tree. - */ - Node getRoot() { - return root; - } - - /** - * Increments the inserts counter for this step. - */ - void incrementInsertCount() { - insertsThisStep++; - } - - /** - * Indecates if there have been inserts during the current step. - * - * @return - */ - boolean isNotFirstInsert() { - return insertsThisStep > 0; - } - - /** - * Retrieves the last node to be inserted, null if none has. - * - * @return The last node inserted or null. - */ - Node getLastNodeInserted() { - return lastNodeInserted; - } - - /** - * Sets the last node inserted to the supplied node. - * - * @param node - * The node representing the last node inserted. - */ - void setLastNodeInserted(Node node) { - lastNodeInserted = node; - } - - /** - * Sets the suffix link of the last inserted node to point to the supplied - * node. This method checks the state of the step and only applies the - * suffix link if there is a previous node inserted during this step. This - * method also set the last node inserted to the supplied node after - * applying any suffix linking. - * - * @param node - * The node to which the last node inserted's suffix link should - * point to. 
- */ - void setSuffixLink(Node node) { - if (isNotFirstInsert()) { - lastNodeInserted.setSuffixLink(node); - } - lastNodeInserted = node; - } - - @Override - public String toString() { - return Utils.printTreeForGraphViz(this); - } - - Sequence getSequence(){ - return sequence; - } +public class SuffixTree> { + + private final Node root; + private final Sequence sequence; + + private Suffix suffix; + private final ActivePoint activePoint; + private int currentEnd = 0; + private int insertsThisStep = 0; + private Node lastNodeInserted = null; + + /** + * Constructs an empty suffix tree. + */ + public SuffixTree() { + sequence = new Sequence<>(); + root = new Node<>(null, this.sequence, this); + activePoint = new ActivePoint<>(root); + } + + /** + * Construct and represent a suffix tree representation of the given + * sequence using Ukkonen's algorithm. + * + * @param sequenceArray the array of items for which we are going to generate a suffix + * tree. + */ + public SuffixTree(S sequenceArray) { + sequence = new Sequence<>(sequenceArray); + root = new Node<>(null, this.sequence, this); + activePoint = new ActivePoint<>(root); + suffix = new Suffix<>(0, 0, this.sequence); + extendTree(0, sequence.getLength()); + } + + /** + * Add a sequence to the suffix tree. It is immediately processed + * and added to the tree. + * + * @param sequence A sequence to be added. + */ + public void add(S sequence) { + int start = currentEnd; + this.sequence.add(sequence); + suffix = new Suffix<>(currentEnd, currentEnd, this.sequence); + activePoint.setPosition(root, null, 0); + extendTree(start, this.sequence.getLength()); + } + + private void extendTree(int from, int to) { + for (int i = from; i < to; i++) { + suffix.increment(); + insertsThisStep = 0; + insert(suffix); + currentEnd++; + } + } + + + /** + * Inserts the given suffix into this tree. + * + * @param suffix The suffix to insert. 
+ */ + void insert(Suffix suffix) { + if (activePoint.isNode()) { + Node node = activePoint.getNode(); + node.insert(suffix, activePoint); + } else if (activePoint.isEdge()) { + Edge edge = activePoint.getEdge(); + edge.insert(suffix, activePoint); + } + } + + /** + * Retrieves the point in the sequence for which all proceeding item have + * been inserted into the tree. + * + * @return The index of the current end point of tree. + */ + int getCurrentEnd() { + return currentEnd; + } + + /** + * Retrieves the root node for this tree. + * + * @return The root node of the tree. + */ + Node getRoot() { + return root; + } + + /** + * Increments the inserts counter for this step. + */ + void incrementInsertCount() { + insertsThisStep++; + } + + /** + * Indecates if there have been inserts during the current step. + * + * @return + */ + boolean isNotFirstInsert() { + return insertsThisStep > 0; + } + + /** + * Retrieves the last node to be inserted, null if none has. + * + * @return The last node inserted or null. + */ + Node getLastNodeInserted() { + return lastNodeInserted; + } + + /** + * Sets the last node inserted to the supplied node. + * + * @param node The node representing the last node inserted. + */ + void setLastNodeInserted(Node node) { + lastNodeInserted = node; + } + + /** + * Sets the suffix link of the last inserted node to point to the supplied + * node. This method checks the state of the step and only applies the + * suffix link if there is a previous node inserted during this step. This + * method also set the last node inserted to the supplied node after + * applying any suffix linking. + * + * @param node The node to which the last node inserted's suffix link should + * point to. 
+ */ + void setSuffixLink(Node node) { + if (isNotFirstInsert()) { + lastNodeInserted.setSuffixLink(node); + } + lastNodeInserted = node; + } + + @Override + public String toString() { + return Utils.printTreeForGraphViz(this); + } + + Sequence getSequence() { + return sequence; + } } \ No newline at end of file diff --git a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Utils.java b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Utils.java index 4906fefba..1ad26c7d3 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Utils.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/suffixtree/Utils.java @@ -3,7 +3,6 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedList; -import java.util.List; import java.util.Map; /** @@ -11,94 +10,91 @@ */ public class Utils { - /** - * Appends a SequenceTerminal element to a supplied array. - * - * @param sequence - * The sequence to which we are applying the terminating object. - * @param terminatingObject - * The instance of the terminating object. - * @return A new sequence with an extra element at the end containing the - * terminating object. - */ - static > Object[] addTerminalToSequence(S sequence, - SequenceTerminal terminatingObject) { - - ArrayList list = new ArrayList(); - for(I item : sequence) - list.add(item); - - Object[] newSequence = new Object[list.size() + 1]; - - int i = 0; - for (; i < list.size(); i++) - newSequence[i] = list.get(i); - newSequence[i] = terminatingObject; - return newSequence; - } + /** + * Appends a SequenceTerminal element to a supplied array. + * + * @param sequence The sequence to which we are applying the terminating object. + * @param terminatingObject The instance of the terminating object. + * @return A new sequence with an extra element at the end containing the + * terminating object. 
+ */ + static > Object[] addTerminalToSequence(S sequence, + SequenceTerminal terminatingObject) { - static > String printTreeForGraphViz(SuffixTree tree) { - return printTreeForGraphViz(tree, true); - } - - /** - * Generates a .dot format string for visualizing a suffix tree. - * - * @param tree - * The tree for which we are generating a dot file. - * @return A string containing the contents of a .dot representation of the - * tree. - */ - static > String printTreeForGraphViz(SuffixTree tree, boolean printSuffixLinks) { - LinkedList> stack = new LinkedList>(); - stack.add(tree.getRoot()); - Map, Integer> nodeMap = new HashMap, Integer>(); - nodeMap.put(tree.getRoot(), 0); - int nodeId = 1; + ArrayList list = new ArrayList(); + for (I item : sequence) + list.add(item); - StringBuilder sb = new StringBuilder( - "\ndigraph suffixTree{\n node [shape=circle, label=\"\", fixedsize=true, width=0.1, height=0.1]\n"); + Object[] newSequence = new Object[list.size() + 1]; - while (stack.size() > 0) { - LinkedList> childNodes = new LinkedList>(); - for (Node node : stack) { + int i = 0; + for (; i < list.size(); i++) + newSequence[i] = list.get(i); + newSequence[i] = terminatingObject; + return newSequence; + } - // List edges = node.getEdges(); - for (Edge edge : node) { - int id = nodeId++; - if (edge.isTerminating()) { - childNodes.push(edge.getTerminal()); - nodeMap.put(edge.getTerminal(), id); - } + static > String printTreeForGraphViz(SuffixTree tree) { + return printTreeForGraphViz(tree, true); + } - sb.append(nodeMap.get(node)).append(" -> ").append(id) - .append(" [label=\""); - - for (T item : edge) { - //if(item != null) - sb.append(item.toString()); - } - sb.append("\"];\n"); - } - } - stack = childNodes; - } - if(printSuffixLinks){ - // loop again to find all suffix links. 
- sb.append("edge [color=red]\n"); - for (Map.Entry, Integer> entry : nodeMap.entrySet()) { - Node n1 = entry.getKey(); - int id1 = entry.getValue(); - - if (n1.hasSuffixLink()) { - Node n2 = n1.getSuffixLink(); - Integer id2 = nodeMap.get(n2); - // if(id2 != null) - sb.append(id1).append(" -> ").append(id2).append(" ;\n"); - } - } - } - sb.append("}"); - return (sb.toString()); - } + /** + * Generates a .dot format string for visualizing a suffix tree. + * + * @param tree The tree for which we are generating a dot file. + * @return A string containing the contents of a .dot representation of the + * tree. + */ + static > String printTreeForGraphViz(SuffixTree tree, boolean printSuffixLinks) { + LinkedList> stack = new LinkedList<>(); + stack.add(tree.getRoot()); + Map, Integer> nodeMap = new HashMap<>(); + nodeMap.put(tree.getRoot(), 0); + int nodeId = 1; + + StringBuilder sb = new StringBuilder( + "\ndigraph suffixTree{\n node [shape=circle, label=\"\", fixedsize=true, width=0.1, height=0.1]\n"); + + while (stack.size() > 0) { + LinkedList> childNodes = new LinkedList<>(); + for (Node node : stack) { + + // List edges = node.getEdges(); + for (Edge edge : node) { + int id = nodeId++; + if (edge.isTerminating()) { + childNodes.push(edge.getTerminal()); + nodeMap.put(edge.getTerminal(), id); + } + + sb.append(nodeMap.get(node)).append(" -> ").append(id) + .append(" [label=\""); + + for (T item : edge) { + //if(item != null) + sb.append(item.toString()); + } + sb.append("\"];\n"); + } + } + stack = childNodes; + } + if (printSuffixLinks) { + // loop again to find all suffix links. 
+ sb.append("edge [color=red]\n"); + for (Map.Entry, Integer> entry : nodeMap.entrySet()) { + Node n1 = entry.getKey(); + int id1 = entry.getValue(); + + if (n1.hasSuffixLink()) { + Node n2 = n1.getSuffixLink(); + Integer id2 = nodeMap.get(n2); + // if(id2 != null) + sb.append(id1).append(" -> ").append(id2).append(" ;\n"); + } + } + } + sb.append("}"); + return (sb.toString()); + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/util/GreedyStringTilingAlgorithm.java b/collatex-core/src/main/java/eu/interedition/collatex/util/GreedyStringTilingAlgorithm.java index 65b574f3c..bc610e6ed 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/util/GreedyStringTilingAlgorithm.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/util/GreedyStringTilingAlgorithm.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -19,8 +19,6 @@ package eu.interedition.collatex.util; -import com.google.common.base.Objects; -import com.google.common.collect.Iterables; import eu.interedition.collatex.CollationAlgorithm; import eu.interedition.collatex.Token; import eu.interedition.collatex.VariantGraph; @@ -35,11 +33,12 @@ import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; +import java.util.stream.StreamSupport; /** * Greedy String Tiling. - *

    + *

    * Implements the Greedy String Tiling algorithm as proposed by Michael J. Wise in his paper: * "String Similarity via Greedy String Tiling and Running Karp-Rabin Matching" * @@ -49,143 +48,143 @@ */ public class GreedyStringTilingAlgorithm extends CollationAlgorithm.Base { - private final Comparator comparator; - private final int minimumTileLength; + private final Comparator comparator; + private final int minimumTileLength; - private final Equality equality = new Equality() { + private final Equality equality = new Equality() { - @Override - public boolean isEqual(VariantGraph.Vertex[] a, Token b) { - for (VariantGraph.Vertex vertex : a) { - final Set tokens = vertex.tokens(); - if (!tokens.isEmpty() && comparator.compare(Iterables.getFirst(tokens, null), b) == 0) { - return true; + @Override + public boolean isEqual(VariantGraph.Vertex[] a, Token b) { + for (VariantGraph.Vertex vertex : a) { + final Set tokens = vertex.tokens(); + if (!tokens.isEmpty() && comparator.compare(tokens.stream().findFirst().get(), b) == 0) { + return true; + } + } + return false; } - } - return false; - } - }; - - public GreedyStringTilingAlgorithm(Comparator comparator, int minimumTileLength) { - this.comparator = comparator; - this.minimumTileLength = minimumTileLength; - } - - @Override - public void collate(VariantGraph graph, Iterable witness) { - final VariantGraph.Vertex[][] vertices = VariantGraphRanking.of(graph).asArray(); - final Token[] tokens = Iterables.toArray(witness, Token.class); - - final SortedSet> matches = new TreeSet>(VertexMatch.setComparator()); - for (Match match : match(vertices, tokens, equality, minimumTileLength)) { - final SortedSet phrase = new TreeSet(); - for (int mc = 0, ml = match.length; mc < ml; mc++) { - final int rank = match.left + mc; - phrase.add(new VertexMatch.WithTokenIndex(vertices[rank][0], rank, match.right + mc)); - } - matches.add(phrase); + }; + + public GreedyStringTilingAlgorithm(Comparator comparator, int minimumTileLength) { 
+ this.comparator = comparator; + this.minimumTileLength = minimumTileLength; } - merge(graph, vertices, tokens, matches); - } - - public static SortedSet match(A[] left, B[] right, Equality equality, int minimumTileLength) { - final boolean[] markedLeft = new boolean[left.length]; - final boolean[] markedRight = new boolean[right.length]; - - Arrays.fill(markedLeft, false); - Arrays.fill(markedRight, false); - - final SortedSet matches = new TreeSet(); - final Map> matchesByLength = new HashMap>(); - - int maxMatchLength; - do { - maxMatchLength = minimumTileLength; - for (int rc = 0; rc < right.length; rc++) { - for (int lc = 0; lc < left.length; lc++) { - int matchLength = 0; - for (int tc = 0; - (tc + lc) < left.length && (tc + rc) < right.length && - !markedLeft[lc + tc] && !markedRight[rc + tc] && - equality.isEqual(left[lc + tc], right[rc + tc]); - tc++) { - matchLength++; - } - - if (matchLength >= maxMatchLength) { - List theMatches = matchesByLength.get(matchLength); - if (theMatches == null) { - matchesByLength.put(matchLength, theMatches = new ArrayList()); + @Override + public void collate(VariantGraph graph, Iterable witness) { + final VariantGraph.Vertex[][] vertices = VariantGraphRanking.of(graph).asArray(); + final Token[] tokens = StreamSupport.stream(witness.spliterator(), false).toArray(Token[]::new); + + final SortedSet> matches = new TreeSet<>(VertexMatch.setComparator()); + for (Match match : match(vertices, tokens, equality, minimumTileLength)) { + final SortedSet phrase = new TreeSet<>(); + for (int mc = 0, ml = match.length; mc < ml; mc++) { + final int rank = match.left + mc; + phrase.add(new VertexMatch.WithTokenIndex(vertices[rank][0], rank, match.right + mc)); } - theMatches.add(new Match(lc, rc)); - } - - if (matchLength > maxMatchLength) { - maxMatchLength = matchLength; - } + matches.add(phrase); } - } - for (Match match : Objects.firstNonNull(matchesByLength.get(maxMatchLength), Collections.emptyList())) { - boolean occluded = 
false; + merge(graph, vertices, tokens, matches); + } - for (int tc = 0; tc < maxMatchLength; tc++) { - if (markedLeft[match.left + tc] || markedRight[match.right + tc]) { - occluded = true; - break; - } - } + public static SortedSet match(A[] left, B[] right, Equality equality, int minimumTileLength) { + final boolean[] markedLeft = new boolean[left.length]; + final boolean[] markedRight = new boolean[right.length]; + + Arrays.fill(markedLeft, false); + Arrays.fill(markedRight, false); + + final SortedSet matches = new TreeSet<>(); + final Map> matchesByLength = new HashMap<>(); + + int maxMatchLength; + do { + maxMatchLength = minimumTileLength; + for (int rc = 0; rc < right.length; rc++) { + for (int lc = 0; lc < left.length; lc++) { + int matchLength = 0; + for (int tc = 0; + (tc + lc) < left.length && (tc + rc) < right.length && + !markedLeft[lc + tc] && !markedRight[rc + tc] && + equality.isEqual(left[lc + tc], right[rc + tc]); + tc++) { + matchLength++; + } + + if (matchLength >= maxMatchLength) { + List theMatches = matchesByLength.get(matchLength); + if (theMatches == null) { + matchesByLength.put(matchLength, theMatches = new ArrayList<>()); + } + theMatches.add(new Match(lc, rc)); + } + + if (matchLength > maxMatchLength) { + maxMatchLength = matchLength; + } + } + } - if (!occluded) { - for (int tc = 0; tc < maxMatchLength; tc++) { - markedLeft[match.left + tc] = true; - markedRight[match.right + tc] = true; - } - matches.add(new Match(match.left, match.right, maxMatchLength)); - } - } + for (Match match : matchesByLength.getOrDefault(maxMatchLength, Collections.emptyList())) { + boolean occluded = false; + + for (int tc = 0; tc < maxMatchLength; tc++) { + if (markedLeft[match.left + tc] || markedRight[match.right + tc]) { + occluded = true; + break; + } + } + + if (!occluded) { + for (int tc = 0; tc < maxMatchLength; tc++) { + markedLeft[match.left + tc] = true; + markedRight[match.right + tc] = true; + } + matches.add(new Match(match.left, 
match.right, maxMatchLength)); + } + } - } while (maxMatchLength > minimumTileLength); + } while (maxMatchLength > minimumTileLength); - return matches; - } + return matches; + } - public static interface Equality { - boolean isEqual(A a, B b); - } + public static interface Equality { + boolean isEqual(A a, B b); + } - public static class Match implements Comparable { - public final int left; - public final int right; - public final int length; + public static class Match implements Comparable { + public final int left; + public final int right; + public final int length; - public Match(int left, int right, int length) { - this.left = left; - this.right = right; - this.length = length; - } + public Match(int left, int right, int length) { + this.left = left; + this.right = right; + this.length = length; + } - public Match(int left, int right) { - this(left, right, 0); - } + public Match(int left, int right) { + this(left, right, 0); + } - @Override - public boolean equals(Object obj) { - if (obj != null && obj instanceof Match) { - return (left == ((Match) obj).left); - } - return super.equals(obj); - } + @Override + public boolean equals(Object obj) { + if (obj != null && obj instanceof Match) { + return (left == ((Match) obj).left); + } + return super.equals(obj); + } - @Override - public int hashCode() { - return left; - } + @Override + public int hashCode() { + return left; + } - @Override - public int compareTo(Match o) { - return left - o.left; + @Override + public int compareTo(Match o) { + return left - o.left; + } } - } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/util/IntegerRangeSet.java b/collatex-core/src/main/java/eu/interedition/collatex/util/IntegerRangeSet.java deleted file mode 100644 index d6042e7ae..000000000 --- a/collatex-core/src/main/java/eu/interedition/collatex/util/IntegerRangeSet.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2013 The Interedition Development Group. - * - * This file is part of CollateX. 
- * - * CollateX is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * CollateX is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with CollateX. If not, see . - */ - -package eu.interedition.collatex.util; - -import com.google.common.base.Predicate; -import com.google.common.collect.Range; - -import javax.annotation.Nullable; -import java.util.Collection; -import java.util.Collections; -import java.util.HashSet; - -/** - * @author Gregor Middell - */ -public class IntegerRangeSet extends HashSet> implements Predicate { - - public IntegerRangeSet() { - } - - public IntegerRangeSet(Range c) { - this(Collections.singleton(c)); - } - - public IntegerRangeSet(Collection> c) { - super(c); - } - - @Override - public boolean apply(@Nullable Integer input) { - for (Range range : this) { - if (range.contains(input)) { - return true; - } - } - return false; - } -} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/util/ParallelSegmentationApparatus.java b/collatex-core/src/main/java/eu/interedition/collatex/util/ParallelSegmentationApparatus.java index 7b5313287..d3feef373 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/util/ParallelSegmentationApparatus.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/util/ParallelSegmentationApparatus.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -19,85 +19,81 @@ package eu.interedition.collatex.util; -import com.google.common.collect.HashMultimap; -import com.google.common.collect.Iterables; -import com.google.common.collect.LinkedHashMultimap; -import com.google.common.collect.Maps; -import com.google.common.collect.Multimap; -import com.google.common.collect.Ordering; -import com.google.common.collect.SetMultimap; -import com.google.common.collect.Sets; import eu.interedition.collatex.Token; import eu.interedition.collatex.VariantGraph; import eu.interedition.collatex.Witness; -import javax.xml.stream.XMLStreamWriter; import java.util.Collection; import java.util.Collections; +import java.util.HashMap; import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.SortedMap; +import java.util.TreeMap; /** - * @author Gregor Middell + * @author Gregor Middell */ public class ParallelSegmentationApparatus { - public interface GeneratorCallback { + public interface GeneratorCallback { - void start(); + void start(); - void segment(SortedMap> contents); + void segment(SortedMap> contents); - void end(); - } - - public static void generate(VariantGraphRanking ranking, GeneratorCallback callback) { - - callback.start(); - - final Set allWitnesses = ranking.witnesses(); - for (Iterator>> rowIt = ranking.getByRank().asMap().entrySet().iterator(); rowIt.hasNext(); ) { - final Map.Entry> row = rowIt.next(); - final int rank = row.getKey(); - final Collection vertices = row.getValue(); - - if (vertices.size() == 1 && Iterables.getOnlyElement(vertices).tokens().isEmpty()) { - // skip start and end vertex - continue; - } - - // spreading vertices with same rank according to their registered transpositions - final Multimap verticesByTranspositionRank = HashMultimap.create(); - for (VariantGraph.Vertex v : vertices) { - int transpositionRank = 0; - for (VariantGraph.Transposition transposition : v.transpositions()) { - for 
(VariantGraph.Vertex tv : transposition) { - transpositionRank += (ranking.apply(tv).intValue() - rank); - } - } - verticesByTranspositionRank.put(transpositionRank, v); - } - - // render segments - for (Iterator transpositionRankIt = Ordering.natural().immutableSortedCopy(verticesByTranspositionRank.keySet()).iterator(); transpositionRankIt.hasNext() ;) { - final Multimap tokensByWitness = HashMultimap.create(); - for (VariantGraph.Vertex v : verticesByTranspositionRank.get(transpositionRankIt.next())) { - for (Token token : v.tokens()) { - tokensByWitness.put(token.getWitness(), token); - } - } + void end(); + } - final SortedMap> cellContents = Maps.newTreeMap(Witness.SIGIL_COMPARATOR); - for (Witness witness : allWitnesses) { - cellContents.put(witness, tokensByWitness.containsKey(witness) ? Iterables.unmodifiableIterable(tokensByWitness.get(witness)) : Collections.emptySet()); + public static void generate(VariantGraphRanking ranking, GeneratorCallback callback) { + + callback.start(); + + final Set allWitnesses = ranking.witnesses(); + for (Iterator>> rowIt = ranking.getByRank().entrySet().iterator(); rowIt.hasNext(); ) { + final Map.Entry> row = rowIt.next(); + final int rank = row.getKey(); + final Collection verticesOfRank = row.getValue(); + + + if (verticesOfRank.size() == 1 && verticesOfRank.stream().findFirst().map(VariantGraph.Vertex::tokens).map(Set::isEmpty).orElse(false)) { + // skip start and end vertex + continue; + } + + // spreading vertices with same rank according to their registered transpositions + final SortedMap> verticesByTranspositionRank = new TreeMap<>(); + for (VariantGraph.Vertex v : verticesOfRank) { + int transpositionRank = 0; + for (Set transposition : v.transpositions()) { + for (VariantGraph.Vertex tv : transposition) { + transpositionRank += (ranking.apply(tv).intValue() - rank); + } + } + verticesByTranspositionRank.computeIfAbsent(transpositionRank, r -> new LinkedList<>()).add(v); + } + + // render segments + 
verticesByTranspositionRank.values().forEach(vertices -> { + final Map> tokensByWitness = new HashMap<>(); + for (VariantGraph.Vertex v : vertices) { + for (Token token : v.tokens()) { + tokensByWitness.computeIfAbsent(token.getWitness(), w -> new LinkedList<>()).add(token); + } + } + + final SortedMap> cellContents = new TreeMap<>(Witness.SIGIL_COMPARATOR); + for (Witness witness : allWitnesses) { + cellContents.put(witness, Collections.unmodifiableCollection(tokensByWitness.getOrDefault(witness, Collections.emptyList()))); + } + + callback.segment(cellContents); + }); } - callback.segment(cellContents); - } + callback.end(); } - - callback.end(); - } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/util/VariantGraphRanking.java b/collatex-core/src/main/java/eu/interedition/collatex/util/VariantGraphRanking.java index 7bd280d30..5693b3c31 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/util/VariantGraphRanking.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/util/VariantGraphRanking.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -19,152 +19,112 @@ package eu.interedition.collatex.util; +import eu.interedition.collatex.Token; +import eu.interedition.collatex.VariantGraph; +import eu.interedition.collatex.VariantGraph.Vertex; +import eu.interedition.collatex.Witness; + import java.util.Collections; import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; +import java.util.List; import java.util.Map; import java.util.Set; -import java.util.SortedSet; - -import javax.annotation.Nullable; - -import com.google.common.base.Function; -import com.google.common.base.Objects; -import com.google.common.base.Preconditions; -import com.google.common.collect.AbstractIterator; -import com.google.common.collect.Maps; -import com.google.common.collect.Multimaps; -import com.google.common.collect.Ordering; -import com.google.common.collect.RowSortedTable; -import com.google.common.collect.Sets; -import com.google.common.collect.SortedSetMultimap; -import com.google.common.collect.TreeBasedTable; -import com.google.common.collect.TreeMultimap; - -import eu.interedition.collatex.Token; -import eu.interedition.collatex.VariantGraph; -import eu.interedition.collatex.VariantGraph.Vertex; -import eu.interedition.collatex.Witness; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.function.Function; +import java.util.stream.Collectors; /** - * @author Gregor Middell + * @author Gregor Middell * @author Ronald Haentjens Dekker */ -public class VariantGraphRanking implements Iterable>, Function, Comparator { - - private final Map byVertex = Maps.newHashMap(); - private final SortedSetMultimap byRank = TreeMultimap.create(Ordering.natural(), Ordering.arbitrary()); - private final VariantGraph graph; - private final Set witnesses; - - VariantGraphRanking(VariantGraph graph, Set witnesses) { - this.graph = graph; - this.witnesses = witnesses; - } - - public static VariantGraphRanking of(VariantGraph graph) { - return of(graph, null); - } - 
- public static VariantGraphRanking of(VariantGraph graph, Set witnesses) { - final VariantGraphRanking ranking = new VariantGraphRanking(graph, witnesses); - for (VariantGraph.Vertex v : graph.vertices(witnesses)) { - int rank = -1; - for (VariantGraph.Edge e : v.incoming(witnesses)) { - rank = Math.max(rank, ranking.byVertex.get(e.from())); - } - rank++; - ranking.byVertex.put(v, rank); - ranking.byRank.put(rank, v); +public class VariantGraphRanking implements Iterable>, Function { + + private final Map byVertex = new HashMap<>(); + private final SortedMap> byRank = new TreeMap<>(); + private final VariantGraph graph; + + VariantGraphRanking(VariantGraph graph) { + this.graph = graph; } - return ranking; - } - - public static VariantGraphRanking ofOnlyCertainVertices(VariantGraph graph, Set witnesses, Set vertices) { - final VariantGraphRanking ranking = new VariantGraphRanking(graph, witnesses); - for (VariantGraph.Vertex v : graph.vertices(witnesses)) { - int rank = -1; - for (VariantGraph.Edge e : v.incoming(witnesses)) { - rank = Math.max(rank, ranking.byVertex.get(e.from())); - } - if (vertices.contains(v)) { - rank++; - } - ranking.byVertex.put(v, rank); - ranking.byRank.put(rank, v); + + public static VariantGraphRanking of(VariantGraph graph) { + final VariantGraphRanking ranking = new VariantGraphRanking(graph); + for (VariantGraph.Vertex v : graph.vertices()) { + int rank = -1; + for (VariantGraph.Vertex incoming : v.incoming().keySet()) { + rank = Math.max(rank, ranking.byVertex.get(incoming)); + } + rank++; + ranking.byVertex.put(v, rank); + ranking.byRank.computeIfAbsent(rank, r -> new HashSet<>()).add(v); + } + return ranking; } - return ranking; - } - - public Set witnesses() { - return Objects.firstNonNull(witnesses, graph.witnesses()); - } - - public Map getByVertex() { - return Collections.unmodifiableMap(byVertex); - } - - public SortedSetMultimap getByRank() { - return Multimaps.unmodifiableSortedSetMultimap(byRank); - } - - public int size() 
{ - return byRank.keySet().size(); - } - - @Override - public Iterator> iterator() { - return new AbstractIterator>() { - private final Iterator it = byRank.keySet().iterator(); - - @Override - protected Set computeNext() { - return (it.hasNext() ? byRank.get(it.next()) : endOfData()); - } - }; - } - - public RowSortedTable> asTable() { - final TreeBasedTable> table = TreeBasedTable.create(Ordering.natural(), Witness.SIGIL_COMPARATOR); - for (Map.Entry rank : byVertex.entrySet()) { - final int row = rank.getValue(); - for (Token token : rank.getKey().tokens(witnesses)) { - final Witness column = token.getWitness(); - - Set cell = table.get(row, column); - if (cell == null) { - table.put(row, column, cell = Sets.newHashSet()); + + public static VariantGraphRanking ofOnlyCertainVertices(VariantGraph graph, Set vertices) { + final VariantGraphRanking ranking = new VariantGraphRanking(graph); + for (VariantGraph.Vertex v : graph.vertices()) { + int rank = -1; + for (VariantGraph.Vertex incoming : v.incoming().keySet()) { + rank = Math.max(rank, ranking.byVertex.get(incoming)); + } + if (vertices.contains(v)) { + rank++; + } + ranking.byVertex.put(v, rank); + ranking.byRank.computeIfAbsent(rank, r -> new HashSet<>()).add(v); } - cell.add(token); - } + return ranking; + } + + public Set witnesses() { + return graph.witnesses(); + } + + public Map getByVertex() { + return Collections.unmodifiableMap(byVertex); + } + + public Map> getByRank() { + return Collections.unmodifiableMap(byRank); } - return table; - } - - public VariantGraph.Vertex[][] asArray() { - final Set ranks = byRank.keySet(); - final VariantGraph.Vertex[][] arr = new VariantGraph.Vertex[ranks.size()][]; - for (final Iterator it = ranks.iterator(); it.hasNext(); ) { - final Integer rank = it.next(); - final SortedSet vertices = byRank.get(rank); - arr[rank] = vertices.toArray(new Vertex[vertices.size()]); + + public int size() { + return byRank.keySet().size(); } - return arr; - } - @Override - public 
Integer apply(@Nullable VariantGraph.Vertex vertex) { - return byVertex.get(vertex); - } + @Override + public Iterator> iterator() { + return byRank.values().iterator(); + } - @Override - public int compare(VariantGraph.Vertex o1, VariantGraph.Vertex o2) { - final Integer o1Rank = byVertex.get(o1); - final Integer o2Rank = byVertex.get(o2); + public List>> asTable() { + return byRank.values().stream() + .filter(rank -> rank.stream().anyMatch(v -> !v.tokens().isEmpty())) + .map(vertices -> { + final SortedMap> row = new TreeMap<>(Witness.SIGIL_COMPARATOR); + vertices.stream().flatMap(v -> v.tokens().stream()).forEach(token -> row.computeIfAbsent(token.getWitness(), w -> new HashSet<>()).add(token)); + return row; + }) + .collect(Collectors.toList()); + } - Preconditions.checkState(o1Rank != null, o1); - Preconditions.checkState(o2Rank != null, o2); + public VariantGraph.Vertex[][] asArray() { + final VariantGraph.Vertex[][] arr = new VariantGraph.Vertex[byRank.size()][]; + byRank.forEach((rank, vertices) -> arr[rank] = vertices.toArray(new Vertex[vertices.size()])); + return arr; + } - return (o1Rank.intValue() - o2Rank.intValue()); - } + @Override + public Integer apply(VariantGraph.Vertex vertex) { + return byVertex.get(vertex); + } + + public Comparator comparator() { + return Comparator.comparingInt(byVertex::get); + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/util/VariantGraphTraversal.java b/collatex-core/src/main/java/eu/interedition/collatex/util/VariantGraphTraversal.java index 1480b885f..0bc93e284 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/util/VariantGraphTraversal.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/util/VariantGraphTraversal.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. 
* @@ -19,92 +19,73 @@ package eu.interedition.collatex.util; -import com.google.common.base.Objects; -import com.google.common.collect.AbstractIterator; -import com.google.common.collect.Iterables; -import com.google.common.collect.Maps; import eu.interedition.collatex.VariantGraph; import eu.interedition.collatex.Witness; import java.util.ArrayDeque; +import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import java.util.Optional; import java.util.Queue; import java.util.Set; -import static java.util.Collections.singleton; - /** - * @author Gregor Middell + * @author Gregor Middell */ public class VariantGraphTraversal implements Iterable { - private final VariantGraph graph; - private final Set witnesses; - - private VariantGraphTraversal(VariantGraph graph, Set witnesses) { - this.graph = graph; - this.witnesses = witnesses; - } - - public static VariantGraphTraversal of(VariantGraph graph, Set witnesses) { - return new VariantGraphTraversal(graph, witnesses); - } + private final VariantGraph graph; + private final Set witnesses; - public static VariantGraphTraversal of(VariantGraph graph) { - return new VariantGraphTraversal(graph, null); - } + private VariantGraphTraversal(VariantGraph graph, Set witnesses) { + this.graph = graph; + this.witnesses = witnesses; + } - @Override - public Iterator iterator() { - return new AbstractIterator() { - private final Map encountered = Maps.newHashMap(); - private final Queue queue = new ArrayDeque(singleton(graph.getStart())); + public static VariantGraphTraversal of(VariantGraph graph, Set witnesses) { + return new VariantGraphTraversal(graph, witnesses); + } - @Override - protected VariantGraph.Vertex computeNext() { - if (queue.isEmpty()) { - return endOfData(); - } - final VariantGraph.Vertex next = queue.remove(); - for (VariantGraph.Edge edge : next.outgoing(witnesses)) { - final VariantGraph.Vertex end = edge.to(); + public static VariantGraphTraversal of(VariantGraph graph) { + return new 
VariantGraphTraversal(graph, null); + } - final int endEncountered = Objects.firstNonNull(encountered.get(end), 0); - final int endIncoming = Iterables.size(end.incoming(witnesses)); + @Override + public Iterator iterator() { + return new Iterator() { - if (endIncoming == endEncountered) { - throw new IllegalStateException(String.format("Encountered cycle traversing %s to %s", edge, end)); - } else if ((endIncoming - endEncountered) == 1) { - queue.add(end); - } + private final Map encountered = new HashMap<>(); + private final Queue queue = new ArrayDeque<>(); + private Optional next = Optional.of(graph.getStart()); - encountered.put(end, endEncountered + 1); - } - return next; - } - }; - } - - public Iterable edges() { - return new Iterable() { - - @Override - public Iterator iterator() { - return new AbstractIterator() { - private final Iterator vertexIt = VariantGraphTraversal.this.iterator(); - private final Queue queue = new ArrayDeque(); + @Override + public boolean hasNext() { + return next.isPresent(); + } - @Override - protected VariantGraph.Edge computeNext() { - if (queue.isEmpty()) { - if (vertexIt.hasNext()) { - Iterables.addAll(queue, vertexIt.next().outgoing(witnesses)); - } + @Override + public VariantGraph.Vertex next() { + final VariantGraph.Vertex next = this.next.get(); + for (Map.Entry> edge : next.outgoing().entrySet()) { + if (witnesses != null && !edge.getValue().stream().anyMatch(witnesses::contains)) { + continue; + } + final VariantGraph.Vertex end = edge.getKey(); + + final long endEncountered = Optional.ofNullable(encountered.get(end)).orElse(0L); + final long endIncoming = end.incoming().entrySet().stream().filter(e -> witnesses == null || e.getValue().stream().anyMatch(witnesses::contains)).count(); + + if (endIncoming == endEncountered) { + throw new IllegalStateException(String.format("Encountered cycle traversing %s to %s", edge, end)); + } else if ((endIncoming - endEncountered) == 1) { + queue.add(end); + } + + 
encountered.put(end, endEncountered + 1); + } + this.next = Optional.ofNullable(queue.poll()); + return next; } - return (queue.isEmpty() ? endOfData() : queue.remove()); - } }; - } - }; - } + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/util/VertexMatch.java b/collatex-core/src/main/java/eu/interedition/collatex/util/VertexMatch.java index 8712ef99b..0b633a3fc 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/util/VertexMatch.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/util/VertexMatch.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * @@ -19,110 +19,90 @@ package eu.interedition.collatex.util; -import com.google.common.base.Function; -import com.google.common.base.Predicate; import eu.interedition.collatex.Token; import eu.interedition.collatex.VariantGraph; -import javax.annotation.Nullable; +import java.util.BitSet; import java.util.Comparator; import java.util.SortedSet; +import java.util.function.Function; +import java.util.function.Predicate; /** -* @author Gregor Middell -*/ + * @author Gregor Middell + */ public abstract class VertexMatch implements Comparable { - public final VariantGraph.Vertex vertex; - public final int vertexRank; - - VertexMatch(VariantGraph.Vertex vertex, int vertexRank) { - this.vertex = vertex; - this.vertexRank = vertexRank; - } - - @Override - public int compareTo(VertexMatch o) { - return (vertexRank - o.vertexRank); - } - - @Override - public boolean equals(Object obj) { - if (obj != null && obj instanceof VertexMatch) { - return vertexRank == ((VertexMatch)obj).vertexRank; - } - return super.equals(obj); - } - - @Override - public int hashCode() { - return vertexRank; - } - - public static Comparator> setComparator() { - return new Comparator>() { - @Override - public int compare(SortedSet o1, SortedSet o2) { - return 
o1.first().compareTo(o2.first()); - } - }; - } - - /** - * @author Gregor Middell - */ - public static class WithToken extends VertexMatch { - - public final Token token; - - public WithToken(VariantGraph.Vertex vertex, int vertexRank, Token token) { - super(vertex, vertexRank); - this.token = token; + public final VariantGraph.Vertex vertex; + public final int vertexRank; + + VertexMatch(VariantGraph.Vertex vertex, int vertexRank) { + this.vertex = vertex; + this.vertexRank = vertexRank; } @Override - public String toString() { - return "{" + vertex + " -> " + token + "}"; + public int compareTo(VertexMatch o) { + return (vertexRank - o.vertexRank); } - } - /** - * @author Gregor Middell - */ - public static class WithTokenIndex extends VertexMatch { + @Override + public boolean equals(Object obj) { + if (obj != null && obj instanceof VertexMatch) { + return vertexRank == ((VertexMatch) obj).vertexRank; + } + return super.equals(obj); + } - public final int token; + @Override + public int hashCode() { + return vertexRank; + } - public WithTokenIndex(VariantGraph.Vertex vertex, int vertexRank, int token) { - super(vertex, vertexRank); - this.token = token; + public static Comparator> setComparator() { + return (o1, o2) -> o1.first().compareTo(o2.first()); } - @Override - public String toString() { - return "{" + vertex + " -> " + token + "}"; + /** + * @author Gregor Middell + */ + public static class WithToken extends VertexMatch { + + public final Token token; + + public WithToken(VariantGraph.Vertex vertex, int vertexRank, Token token) { + super(vertex, vertexRank); + this.token = token; + } + + @Override + public String toString() { + return "{" + vertex + " -> " + token + "}"; + } } - } - - public static Function tokenResolver(final Token[] tokens) { - return new Function() { - @Override - public WithToken apply(@Nullable WithTokenIndex input) { - return new WithToken(input.vertex, input.vertexRank, tokens[input.token]); - } - }; - } - - public static final 
Predicate> filter(final IntegerRangeSet rankFilter, final IntegerRangeSet tokenFilter) { - return new Predicate>() { - @Override - public boolean apply(@Nullable SortedSet input) { - for (WithTokenIndex match : input) { - if (tokenFilter.apply(match.token) || rankFilter.apply(match.vertexRank)) { - return true; - } + + /** + * @author Gregor Middell + */ + public static class WithTokenIndex extends VertexMatch { + + public final int token; + + public WithTokenIndex(VariantGraph.Vertex vertex, int vertexRank, int token) { + super(vertex, vertexRank); + this.token = token; + } + + @Override + public String toString() { + return "{" + vertex + " -> " + token + "}"; } - return false; - } - }; - } + } + + public static Function tokenResolver(final Token[] tokens) { + return input -> new WithToken(input.vertex, input.vertexRank, tokens[input.token]); + } + + public static Predicate> filter(final BitSet rankFilter, final BitSet tokenFilter) { + return input -> input.stream().anyMatch(match -> tokenFilter.get(match.token) || rankFilter.get(match.vertexRank)); + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/util/package-info.java b/collatex-core/src/main/java/eu/interedition/collatex/util/package-info.java index ea2d0f32b..e3d780285 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/util/package-info.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/util/package-info.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The Interedition Development Group. + * Copyright (c) 2015 The Interedition Development Group. * * This file is part of CollateX. * diff --git a/collatex-core/src/main/javadoc/overview.html b/collatex-core/src/main/javadoc/overview.html index d2c72b872..29e7765d5 100644 --- a/collatex-core/src/main/javadoc/overview.html +++ b/collatex-core/src/main/javadoc/overview.html @@ -1,5 +1,5 @@