From f001d8157034e4a9505694f6ae50a19f2fc5c67f Mon Sep 17 00:00:00 2001 From: Erich Schubert Date: Tue, 1 Aug 2017 14:58:44 +0200 Subject: [PATCH] Reduce code duplication. --- .../heideltime/utilities/ContextAnalyzer.java | 117 +++++++++--------- 1 file changed, 56 insertions(+), 61 deletions(-) diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/utilities/ContextAnalyzer.java b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/ContextAnalyzer.java index 8ae28638..25d22f5b 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/utilities/ContextAnalyzer.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/ContextAnalyzer.java @@ -1,8 +1,8 @@ package de.unihd.dbs.uima.annotator.heideltime.utilities; +import java.util.ArrayList; +import java.util.Comparator; import java.util.List; -import java.util.Map; -import java.util.TreeMap; import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -21,11 +21,9 @@ import de.unihd.dbs.uima.types.heideltime.Token; /** - * * This class contains methods that work with the dependence of a subject with its surrounding data; namely via the jcas element or a subset list. * * @author jannik stroetgen - * */ public class ContextAnalyzer { /** Class logger */ @@ -331,30 +329,12 @@ public static Tense getClosestTense(Timex3 timex, JCas jcas, Language language) int lastid = 0, nextid = 0; int tid = 0; - // Get the sentence - AnnotationIndex sentences = jcas.getAnnotationIndex(Sentence.type); - Sentence s = null; - for (FSIterator iterSentence = sentences.iterator(); iterSentence.hasNext();) { - s = iterSentence.next(); - if (s.getBegin() <= timex.getBegin() && s.getEnd() >= timex.getEnd()) { - break; - } - } - - // Get the tokens - TreeMap tmToken = new TreeMap(); - AnnotationIndex tokens = jcas.getAnnotationIndex(Token.type); - FSIterator iter = (s != null) ? 
tokens.subiterator(s) : tokens.iterator(); - while (iter.hasNext()) { - Token token = iter.next(); - tmToken.put(token.getEnd(), token); - } + ArrayList tmToken = getCloseTokens(timex, jcas); // Get the last VERB token - for (Map.Entry ent : tmToken.entrySet()) { + for (Token token : tmToken) { tokenCounter++; - if (ent.getKey() < timex.getBegin()) { - Token token = ent.getValue(); + if (token.getEnd() < timex.getBegin()) { String pos = token.getPos(); if (pos == null) continue; // POS not available? @@ -385,11 +365,10 @@ public static Tense getClosestTense(Timex3 timex, JCas jcas, Language language) } } tokenCounter = 0; - for (Map.Entry ent : tmToken.entrySet()) { + for (Token token : tmToken) { tokenCounter++; if (nextTense == null) { - if (ent.getKey() > timex.getEnd()) { - Token token = ent.getValue(); + if (token.getEnd() > timex.getEnd()) { String pos = token.getPos(); if (pos == null) continue; // No POS available? @@ -453,27 +432,11 @@ public static Tense getLastTense(Timex3 timex, JCas jcas, Language language) { Tense lastTense = null; // Get the sentence - AnnotationIndex sentences = jcas.getAnnotationIndex(Sentence.type); - Sentence s = null; - for (FSIterator iterSentence = sentences.iterator(); iterSentence.hasNext();) { - s = iterSentence.next(); - if (s.getBegin() <= timex.getBegin() && s.getEnd() >= timex.getEnd()) - break; - } - - // Get the tokens - TreeMap tmToken = new TreeMap(); - AnnotationIndex tokens = jcas.getAnnotationIndex(Token.type); - FSIterator iter = (s != null) ? 
tokens.subiterator(s) : tokens.iterator(); - while (iter.hasNext()) { - Token token = iter.next(); - tmToken.put(token.getEnd(), token); - } + ArrayList tmToken = getCloseTokens(timex, jcas); // Get the last VERB token - for (Map.Entry ent : tmToken.entrySet()) { - if (ent.getKey() < timex.getBegin()) { - Token token = ent.getValue(); + for (Token token : tmToken) { + if (token.getEnd() < timex.getBegin()) { String coveredText = token.getCoveredText(); String pos = token.getPos(); if (pos == null) @@ -500,8 +463,7 @@ public static Tense getLastTense(Timex3 timex, JCas jcas, Language language) { lastTense = Tense.PAST; } } - if (lastTense == null && ent.getKey() > timex.getEnd()) { - Token token = ent.getValue(); + if (lastTense == null && token.getEnd() > timex.getEnd()) { String pos = token.getPos(); if (LOG.isTraceEnabled()) { @@ -524,16 +486,15 @@ public static Tense getLastTense(Timex3 timex, JCas jcas, Language language) { } } if (lastTense != null) - LOG.trace("this tense: {} {}", ent.getValue().getCoveredText(), lastTense); + LOG.trace("this tense: {} {}", token.getCoveredText(), lastTense); } // check for double POS Constraints (not included in the rule language, yet) TODO // VHZ VNN and VHZ VNN and VHP VNN and VBP VVN String prevPos = ""; Tense longTense = null; if (lastTense == Tense.PRESENTFUTURE) { - for (Map.Entry ent : tmToken.entrySet()) { - if (ent.getKey() < timex.getBegin()) { - Token token = ent.getValue(); + for (Token token : tmToken) { + if (token.getEnd() < timex.getBegin()) { String pos = token.getPos(); if ("VHZ".equals(prevPos) || "VBZ".equals(prevPos) || "VHP".equals(prevPos) || "VBP".equals(prevPos) || prevPos.equals("VER:pres")) { if ("VVN".equals(pos) || "VER:pper".equals(pos)) { @@ -546,8 +507,7 @@ public static Tense getLastTense(Timex3 timex, JCas jcas, Language language) { } prevPos = pos; } - if (longTense == null && ent.getKey() > timex.getEnd()) { - Token token = ent.getValue(); + if (longTense == null && token.getEnd() > 
timex.getEnd()) { if ("VHZ".equals(prevPos) || "VBZ".equals(prevPos) || "VHP".equals(prevPos) || "VBP".equals(prevPos) || "VER:pres".equals(prevPos)) { if ("VVN".equals(token.getPos()) || "VER:pper".equals(token.getPos())) { String covered = token.getCoveredText(); @@ -563,9 +523,8 @@ public static Tense getLastTense(Timex3 timex, JCas jcas, Language language) { } // French: VER:pres VER:pper if (lastTense == Tense.PAST) { - for (Map.Entry ent : tmToken.entrySet()) { - if (ent.getKey() < timex.getBegin()) { - Token token = ent.getValue(); + for (Token token : tmToken) { + if (token.getEnd() < timex.getBegin()) { String pos = token.getPos(); if ("VER:pres".equals(prevPos) && "VER:pper".equals(pos)) { if (PREVUE_ENVISAGEE.matcher(token.getCoveredText()).matches()) { @@ -576,8 +535,7 @@ public static Tense getLastTense(Timex3 timex, JCas jcas, Language language) { prevPos = pos; } if (longTense == null) { - if (ent.getKey() > timex.getEnd()) { - Token token = ent.getValue(); + if (token.getEnd() > timex.getEnd()) { String pos = token.getPos(); if ("VER:pres".equals(prevPos) && "VER:pper".equals(pos)) { if (PREVUE_ENVISAGEE.matcher(token.getCoveredText()).matches()) { @@ -591,7 +549,44 @@ public static Tense getLastTense(Timex3 timex, JCas jcas, Language language) { } } LOG.trace("TENSE: {}", lastTense); - return lastTense; } + + /** + * Get the tokens close to the given timex (i.e. the same sentence). + * + * @param timex + * Timex + * @param jcas + * Cas + * @return Tokens, sorted by end. 
+	 */
+	private static ArrayList getCloseTokens(Timex3 timex, JCas jcas) {
+		// Scan for the sentence covering the timex. NOTE(review): s is assigned before the test, so if no sentence covers the timex it stays at the last sentence, not null - confirm intended.
+		AnnotationIndex sentences = jcas.getAnnotationIndex(Sentence.type);
+		Sentence s = null;
+		for (FSIterator iterSentence = sentences.iterator(); iterSentence.hasNext();) {
+			s = iterSentence.next();
+			if (s.getBegin() <= timex.getBegin() && s.getEnd() >= timex.getEnd())
+				break;
+		}
+
+		// Gather the tokens: those of s via subiterator, or the whole token index when s is null (the sentence index was empty).
+		AnnotationIndex tokens = jcas.getAnnotationIndex(Token.type);
+		FSIterator iter = (s != null) ? tokens.subiterator(s) : tokens.iterator();
+		ArrayList tmToken = new ArrayList();
+		while (iter.hasNext())
+			tmToken.add(iter.next());
+		tmToken.sort(SORT_TOKENS);
+		return tmToken;
+	}
+
+	/**
+	 * Comparator that orders tokens ascending by their end offset ({@code getEnd()}).
+	 */
+	private static final Comparator SORT_TOKENS = new Comparator() {
+		public int compare(Token o1, Token o2) {
+			return Integer.compare(o1.getEnd(), o2.getEnd());
+		}
+	};
 }