Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix O(n!) in tag depth issue #28

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions dkpro-c4corpus-boilerplate/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@
<artifactId>commons-io</artifactId>
</dependency>

<dependency>
<groupId>org.dkpro.c4corpus</groupId>
<artifactId>dkpro-c4corpus-language</artifactId>
</dependency>
</dependencies>

<!-- for a standalone application -->
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ public static Node nearestCommonAncestor(Node node1, Node node2)
{
Node ancestor = node1;
while (ancestor != null) {
// FIXME: Inefficient!
if (isAncestor(ancestor, node2)) {
return ancestor;
}
Expand All @@ -65,9 +66,6 @@ public static Node nearestCommonAncestor(Node node1, Node node2)
*/
public static boolean isAncestor(Node node1, Node node2)
{
if (node1 == node2) {
return true;
}
Node ancestor = node2;

while (ancestor != null) {
Expand All @@ -88,6 +86,7 @@ public static boolean isAncestor(Node node1, Node node2)
*/
public static boolean isLink(Node node)
{
// TODO: This is continually traversing the tree & recomputing stuff
Node ancestor = node;

while (ancestor != null) {
Expand All @@ -100,9 +99,8 @@ public static boolean isLink(Node node)
return false;
}

public enum TagType
private enum TagType
{

IGNORABLE, INNER_TEXT, BLOCK_LEVEL, BLOCK_LEVEL_CONTENT, BLOCK_LEVEL_TITLE
}

Expand All @@ -117,11 +115,11 @@ public enum TagType
TAGS_TYPE.put("applet", TagType.IGNORABLE);
TAGS_TYPE.put("link", TagType.IGNORABLE);
TAGS_TYPE.put("button", TagType.IGNORABLE);
TAGS_TYPE.put("select", TagType.IGNORABLE);
TAGS_TYPE.put("inTAGS_TYPE.put", TagType.IGNORABLE);
TAGS_TYPE.put("textarea", TagType.IGNORABLE);
TAGS_TYPE.put("keygen", TagType.IGNORABLE);

TAGS_TYPE.put("select", TagType.BLOCK_LEVEL);
TAGS_TYPE.put("blockquote", TagType.BLOCK_LEVEL);
TAGS_TYPE.put("caption", TagType.BLOCK_LEVEL);
TAGS_TYPE.put("center", TagType.BLOCK_LEVEL);
Expand Down Expand Up @@ -156,30 +154,33 @@ public enum TagType
TAGS_TYPE.put("b", TagType.INNER_TEXT); //count as text inside block
TAGS_TYPE.put("u", TagType.INNER_TEXT); //count as text inside block
TAGS_TYPE.put("i", TagType.INNER_TEXT);//count as text inside block
TAGS_TYPE.put("em", TagType.INNER_TEXT);
TAGS_TYPE.put("strong", TagType.INNER_TEXT);
TAGS_TYPE.put("span", TagType.INNER_TEXT);
TAGS_TYPE.put("a", TagType.INNER_TEXT);
//the <br><br> is a paragraph separator and should
TAGS_TYPE.put("br", TagType.INNER_TEXT); //count as text inside block
}

public static boolean isInnerText(Node tag)
{
return !(tag == null || !(tag instanceof Element))
&& TAGS_TYPE.get(tag.nodeName()) == TagType.INNER_TEXT;
return tag instanceof Element && TAGS_TYPE.get(tag.nodeName()) == TagType.INNER_TEXT;
}

public static boolean isBlockTag(Node tag)
{
return !(tag == null || !(tag instanceof Element)) && ((Element) tag).isBlock();
// FIXME: This doesn't use the tag list above
return tag instanceof Element && ((Element) tag).isBlock();
}

public static boolean isInlineTag(Node tag)
{
return !(tag == null || !(tag instanceof Element)) && ((Element) tag).tag().isInline();
return tag instanceof Element && ((Element) tag).tag().isInline();
}

public static boolean isLinkTag(Node elem)
{
return !(elem == null || !(elem instanceof Element)) && (
"a".equalsIgnoreCase(elem.nodeName()) || "link".equalsIgnoreCase(elem.nodeName()));
return elem instanceof Element && !"".equals(((Element) elem).attr("href"));
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -18,26 +18,28 @@

package de.tudarmstadt.ukp.dkpro.c4corpus.boilerplate.impl;

import de.tudarmstadt.ukp.dkpro.c4corpus.boilerplate.impl.Paragraph.PARAGRAPH_TYPE;

/**
* Data structure representing a pair of integer and string
* Data structure representing a pair of an integer ID and a paragraph type
*
* @author Omnia Zayed
*/
public class Pair {

public final Integer id;
public final String classType;
public final PARAGRAPH_TYPE classType;

public Pair(Integer id, String classType) {
public Pair(Integer id, PARAGRAPH_TYPE c) {
this.id = id;
this.classType = classType;
this.classType = c;
}

public Integer getID() {
return this.id;
}

public String getClassType() {
public PARAGRAPH_TYPE getClassType() {
return this.classType;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,59 +35,61 @@
public class Paragraph
extends LinkedList<Node>
{
private static final long serialVersionUID = 1L;


public enum PARAGRAPH_TYPE {UNKNOWN, SHORT, GOOD, NEAR_GOOD, BAD};

// private ArrayList<String> textNodes;
int charsCountInLinks = 0;
private String classType = "";
private String contextFreeClass = "";
private PARAGRAPH_TYPE classType = PARAGRAPH_TYPE.UNKNOWN;
private PARAGRAPH_TYPE contextFreeClass = PARAGRAPH_TYPE.UNKNOWN;
private String tagName = "";
private String rawText = "";
private boolean isHeading = false;

public Paragraph(Node firstNode)
public Paragraph(Node firstNode, boolean heading)
{
add(firstNode);
}

public void initRawInfo()
{
StringBuilder sb = new StringBuilder();
for (Node n : this) {
// NodeHelper.cleanEmptyElements(n);
if (n instanceof TextNode) {
this.setTagName(getPath(n));
String nodeRawText = ((TextNode) n).text();
sb.append(Utils.normalizeBreaks(nodeRawText).trim());
Node node = firstNode;
while (NodeHelper.isInnerText(node) || node instanceof TextNode) {
node = node.parent();
}
if (node != null) {
this.tagName = node.nodeName();
}
this.isHeading = heading;
if (firstNode instanceof TextNode) {
String nodeRawText = ((TextNode) firstNode).text();
this.rawText = nodeRawText.trim();

if (NodeHelper.isLink(n)) {
charsCountInLinks += nodeRawText.length();
}
if (NodeHelper.isLink(firstNode)) {
charsCountInLinks += nodeRawText.length();
}
}

rawText = sb.toString();
}


public int getLinksLength()
{
return this.charsCountInLinks;
}

public String getClassType()
public PARAGRAPH_TYPE getClassType()
{
return this.classType;
}

public void setClassType(String classType)
public void setClassType(PARAGRAPH_TYPE classType)
{
this.classType = classType;
}

public String getContextFreeClass()
public PARAGRAPH_TYPE getContextFreeClass()
{
return this.contextFreeClass;
}

public void setContextFreeClass(String contextFreeClass)
public void setContextFreeClass(PARAGRAPH_TYPE contextFreeClass)
{
this.contextFreeClass = contextFreeClass;
}
Expand All @@ -97,29 +99,6 @@ public String getTagName()
return this.tagName;
}

public String getPath(Node n)
{
String nodePath = "";
while (n != null) {
if (n instanceof TextNode) {
n = n.parent();
}
if (NodeHelper.isInnerText(n)) {
n = n.parent();
}
String parentNodeName = n.nodeName();
nodePath = parentNodeName + "." + nodePath;

if (!parentNodeName.equalsIgnoreCase("html")) {
n = n.parent();
}
else {
break;
}
}

return nodePath;
}

public void setTagName(String name)
{
Expand All @@ -128,24 +107,22 @@ public void setTagName(String name)

public boolean isHeading()
{
return this.getTagName().matches(".*\\.h\\d\\.");
return isHeading;
}

public boolean isBoilerplate()
{
return !this.getClassType().equalsIgnoreCase("good");
return this.getClassType() != PARAGRAPH_TYPE.GOOD;
}

public String getRawText()
{

return Utils.normalizeBreaks(rawText.trim());
return rawText;
}

public void setRawText(String rawText)
{
this.rawText = Utils.normalizeBreaks(rawText.trim());

this.rawText = rawText;
}

public int getWordsCount()
Expand Down
Loading