Fix 3rdparty detector bugs #45

Open · wants to merge 2 commits into master
11 changes: 8 additions & 3 deletions nab/labeler.py
@@ -106,14 +106,15 @@ class CorpusLabel(object):
  benchmark corpus.
  """

-  def __init__(self, path, corpus):
+  def __init__(self, path, corpus, remove_duplicates=False):
    """
    Initializes a CorpusLabel object by getting the anomaly windows and labels.
    When this is done for combining raw user labels, we skip getLabels()
    because labels are not yet created.

    @param path (string) Name of file containing the set of labels.
    @param corpus (nab.Corpus) Corpus object.
+   @param remove_duplicates (bool) Whether to remove duplicate rows in the label data
    """
    self.path = path

@@ -125,7 +126,7 @@ def __init__(self, path, corpus):

    if "raw" not in self.path:
      # Do not get labels from files in the path nab/labels/raw
-     self.getLabels()
+     self.getLabels(remove_duplicates)


  def getWindows(self):

@@ -192,7 +193,7 @@ def validateLabels(self):
          raise ValueError("In the label file %s, windows overlap." % self.path)


-  def getLabels(self):
+  def getLabels(self, remove_duplicates=False):
    """
    Get Labels as a dictionary of key-value pairs of a relative path and its
    corresponding binary vector of anomaly labels. Labels are simply a more

@@ -213,6 +214,10 @@ def getLabels(self):
          indices = betweenT1AndT2.loc[:,"label"].index
          labels["label"].values[indices.values] = 1

+       # remove duplicate rows (somehow they snuck in for certain datasets)
+       if remove_duplicates:
+         labels = labels.drop_duplicates(subset=['timestamp'])
+
        self.labels[relativePath] = labels

      else:
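For context, here is a minimal sketch (not part of this PR) of what the new `remove_duplicates` path does to one per-file label frame; the timestamps and the duplicated row are invented for illustration:

```python
import pandas as pd

# A label frame shaped like the ones getLabels() builds, with one
# duplicated timestamp row.
labels = pd.DataFrame({
    "timestamp": pd.to_datetime([
        "2014-04-01 00:00:00",
        "2014-04-01 00:05:00",
        "2014-04-01 00:05:00",  # the kind of duplicate that "snuck in"
        "2014-04-01 00:10:00",
    ]),
    "label": [0, 1, 1, 0],
})

# Same call as the new code path: keep one row per timestamp.
deduped = labels.drop_duplicates(subset=["timestamp"])
print(len(labels), len(deduped))  # 4 3
```

`drop_duplicates` keeps the first occurrence of each timestamp by default and drops the rest, so a timestamp already marked as anomalous stays marked.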
4 changes: 2 additions & 2 deletions nab/runner.py
@@ -87,10 +87,10 @@ def __init__(self,
    self.profiles = None


-  def initialize(self):
+  def initialize(self, remove_duplicate_labels=False):
    """Initialize all the relevant objects for the run."""
    self.corpus = Corpus(self.dataDir)
-   self.corpusLabel = CorpusLabel(path=self.labelPath, corpus=self.corpus)
+   self.corpusLabel = CorpusLabel(path=self.labelPath, corpus=self.corpus, remove_duplicates=remove_duplicate_labels)

    with open(self.profilesPath) as p:
      self.profiles = json.load(p)
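For callers that drive the runner directly instead of going through run.py, the change is backwards compatible. A small sketch, assuming `runner` is a Runner built the same way run.py builds it:

```python
# Default: behaves exactly as before, duplicate rows in the label data are kept.
runner.initialize()

# Opt in to the new behaviour: duplicate label rows are dropped when
# CorpusLabel loads the labels (remove_duplicates=True is forwarded).
runner.initialize(remove_duplicate_labels=True)
```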
5 changes: 3 additions & 2 deletions nab/sweeper.py
@@ -301,17 +301,18 @@ def scoreDataSet(
      scores (list) List of per-row scores, to be saved in score file
      matchingRow (ThresholdScore)
    """
+   threshold = float(threshold)
    anomalyList = self.calcSweepScore(
      timestamps, anomalyScores, windowLimits, dataSetName)
    scoresByThreshold = self.calcScoreByThreshold(anomalyList)

    matchingRow = None
    prevRow = None
    for thresholdScore in scoresByThreshold:
-     if thresholdScore.threshold == threshold:
+     if float(thresholdScore.threshold) == threshold:
        matchingRow = thresholdScore
        break
-     elif thresholdScore.threshold < threshold:
+     elif float(thresholdScore.threshold) < threshold:
        matchingRow = prevRow
        break
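The explicit coercion matters because the threshold can arrive with a different type on each side of the comparison (for example as a string parsed from a thresholds file; the exact source is an assumption, it is not visible in this diff). A small illustration:

```python
# Mimics the old comparison in scoreDataSet when the incoming threshold
# is a string while the value stored on a ThresholdScore row is a float.
threshold = "0.5"        # e.g. a threshold read back from a results/config file
row_threshold = 0.5      # value stored on the ThresholdScore row

print(row_threshold == threshold)                 # False: no matching row is found
print(float(row_threshold) == float(threshold))   # True: the fixed comparison
```

Under Python 2 the old `<` comparison would not even raise on mixed types (numbers sort before strings), so `matchingRow` could silently be taken from the wrong row; under Python 3 it would raise a TypeError. Coercing both sides to float handles either case.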
7 changes: 6 additions & 1 deletion run.py
@@ -89,7 +89,7 @@ def main(args):
                  thresholdPath=thresholdsFile,
                  numCPUs=numCPUs)

- runner.initialize()
+ runner.initialize(args.removeDuplicateLabels)

  if args.detect:
    detectorConstructors = getDetectorClassConstructors(args.detectors)

@@ -142,6 +142,11 @@ def main(args):
                      default=False,
                      action="store_true")

+ parser.add_argument("--removeDuplicateLabels",
+                     help="If specified will remove any duplicate rows from the labeled NAB data",
+                     default=False,
+                     action="store_true")
+
  parser.add_argument("--dataDir",
                      default="data",
                      help="This holds all the label windows for the corpus.")
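A minimal, self-contained sketch of how the new flag behaves thanks to `action="store_true"`, independent of the rest of run.py:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--removeDuplicateLabels", default=False, action="store_true")

# Existing command lines are unaffected: the flag defaults to False.
print(parser.parse_args([]).removeDuplicateLabels)                           # False

# Passing the flag switches it on; run.py then forwards it via
# runner.initialize(args.removeDuplicateLabels).
print(parser.parse_args(["--removeDuplicateLabels"]).removeDuplicateLabels)  # True
```

So an invocation such as `python run.py --detect --score --removeDuplicateLabels` opts in to the label dedup while existing commands keep their current behaviour (`--detect` is visible in this diff; any other flags in that example are assumed from the existing run.py interface).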