diff --git a/dkpro-c4corpus-boilerplate/src/main/java/de/tudarmstadt/ukp/dkpro/c4corpus/boilerplate/impl/JusTextBoilerplateRemoval.java b/dkpro-c4corpus-boilerplate/src/main/java/de/tudarmstadt/ukp/dkpro/c4corpus/boilerplate/impl/JusTextBoilerplateRemoval.java index f139695..c1c22cc 100644 --- a/dkpro-c4corpus-boilerplate/src/main/java/de/tudarmstadt/ukp/dkpro/c4corpus/boilerplate/impl/JusTextBoilerplateRemoval.java +++ b/dkpro-c4corpus-boilerplate/src/main/java/de/tudarmstadt/ukp/dkpro/c4corpus/boilerplate/impl/JusTextBoilerplateRemoval.java @@ -32,11 +32,11 @@ import java.io.IOException; import java.io.PrintWriter; import java.io.StringWriter; +import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.LinkedHashSet; -import java.util.LinkedList; import java.util.List; import java.util.Locale; import java.util.Map; @@ -113,7 +113,7 @@ private Document convertHtmlToDoc(String html) * Initialize the Paragraph explorer class in order to convert a document to * a list of blocks (paragraphs) */ - private LinkedList makeParagraphs(Node node) + private ArrayList makeParagraphs(Node node) { ParagraphsExplorer pe = new ParagraphsExplorer(); node.traverse(pe); //begin the traversal of the doc @@ -274,12 +274,11 @@ private PARAGRAPH_TYPE getNextNeighbourOptimized(int i, List paragrap *
  • postprocessing of header blocks * * - * FIXME: This can behave pathologically in the presence of large lists of "paragraphs" - * with no textual content. In this case the maxHeadingDistance parameter isn't adequate - * to short circuit large amounts of processing. We may need to max number of elements - * to search (10? 20?). + * NOTE: Normally we'd use List in the definition, but this method makes extensive + * use of List.get() which is very inefficient with other list implementations such as LinkedList. + * This could be rewritten to use ListIterators to generalize it, but I don't see the point. */ - private void reclassifyContextSensitive(List paragraphs, int maxHeadingDistance) + private void reclassifyContextSensitive(ArrayList paragraphs, int maxHeadingDistance) { // Default classification is the same as the context-free classification for (Paragraph p : paragraphs) { @@ -408,7 +407,7 @@ private List classify(String htmlText, Set stopwordsSet, int } Document jSoupDoc = convertHtmlToDoc(htmlText); - LinkedList paragraphs = makeParagraphs(jSoupDoc); + ArrayList paragraphs = makeParagraphs(jSoupDoc); //context-free classification classifyContextFree(paragraphs, stopwordsSet, lengthLow, lengthHigh, stopwordsLow, stopwordsHigh, maxLinkDensity); diff --git a/dkpro-c4corpus-boilerplate/src/main/java/de/tudarmstadt/ukp/dkpro/c4corpus/boilerplate/impl/ParagraphsExplorer.java b/dkpro-c4corpus-boilerplate/src/main/java/de/tudarmstadt/ukp/dkpro/c4corpus/boilerplate/impl/ParagraphsExplorer.java index c637864..8ecd4cb 100644 --- a/dkpro-c4corpus-boilerplate/src/main/java/de/tudarmstadt/ukp/dkpro/c4corpus/boilerplate/impl/ParagraphsExplorer.java +++ b/dkpro-c4corpus-boilerplate/src/main/java/de/tudarmstadt/ukp/dkpro/c4corpus/boilerplate/impl/ParagraphsExplorer.java @@ -23,10 +23,10 @@ import org.jsoup.nodes.TextNode; import org.jsoup.select.NodeVisitor; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; -import java.util.LinkedList; import java.util.Set; import java.util.regex.Pattern; @@ -48,7 +48,7 @@ public class ParagraphsExplorer new HashSet(Arrays.asList(new String[] { "blockquote", "caption", "center", "col", "colgroup", "dd", "div", "dl", "dt", "fieldset", "form", "legend", "optgroup", "option", "p", "pre", "table", "td", "textarea", "tfoot", "th", "thead", "tr", "ul", "li", "h1", "h2", "h3", "h4", "h5", "h6" }))); - private final LinkedList paragraphs; + private final ArrayList paragraphs; private Paragraph currentParagraph = null; private boolean lastBR = false; private boolean inHeading = false; @@ -62,7 +62,7 @@ public enum AncestorState public ParagraphsExplorer() { - this.paragraphs = new LinkedList<>(); + this.paragraphs = new ArrayList<>(); } @Override @@ -106,7 +106,7 @@ public void tail(Node node, int depth) * * @return paragraphs */ - public LinkedList getParagraphs() + public ArrayList getParagraphs() { return paragraphs; }