Fix 3rdparty detector bugs #45

Open · wants to merge 2 commits into master
11 changes: 8 additions & 3 deletions nab/labeler.py
@@ -106,14 +106,15 @@ class CorpusLabel(object):
  benchmark corpus.
  """

-  def __init__(self, path, corpus):
+  def __init__(self, path, corpus, remove_duplicates=False):
    """
    Initializes a CorpusLabel object by getting the anomaly windows and labels.
    When this is done for combining raw user labels, we skip getLabels()
    because labels are not yet created.

    @param path (string) Name of file containing the set of labels.
    @param corpus (nab.Corpus) Corpus object.
+   @param remove_duplicates (bool) Whether to remove duplicate rows in the label data
    """
    self.path = path

@@ -125,7 +126,7 @@ def __init__(self, path, corpus):

    if "raw" not in self.path:
      # Do not get labels from files in the path nab/labels/raw
-     self.getLabels()
+     self.getLabels(remove_duplicates)


  def getWindows(self):

@@ -192,7 +193,7 @@ def validateLabels(self):
          raise ValueError("In the label file %s, windows overlap." % self.path)


-  def getLabels(self):
+  def getLabels(self, remove_duplicates=False):
    """
    Get Labels as a dictionary of key-value pairs of a relative path and its
    corresponding binary vector of anomaly labels. Labels are simply a more

@@ -213,6 +214,10 @@ def getLabels(self):
          indices = betweenT1AndT2.loc[:,"label"].index
          labels["label"].values[indices.values] = 1

+       # remove duplicate rows (somehow they snuck in for certain datasets)
+       if remove_duplicates:
+         labels = labels.drop_duplicates(subset=['timestamp'])
+
        self.labels[relativePath] = labels

      else:
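For context, here is a minimal sketch (not part of this PR) of what the new `remove_duplicates` path does to one per-file label frame; the timestamps and the duplicated row are invented for illustration:

```python
import pandas as pd

# A label frame shaped like the ones getLabels() builds, with one
# duplicated timestamp row.
labels = pd.DataFrame({
    "timestamp": pd.to_datetime([
        "2014-04-01 00:00:00",
        "2014-04-01 00:05:00",
        "2014-04-01 00:05:00",  # the kind of duplicate that "snuck in"
        "2014-04-01 00:10:00",
    ]),
    "label": [0, 1, 1, 0],
})

# Same call as the new code path: keep one row per timestamp.
deduped = labels.drop_duplicates(subset=["timestamp"])
print(len(labels), len(deduped))  # 4 3
```

`drop_duplicates` keeps the first occurrence of each timestamp by default and drops the rest, so a timestamp already marked as anomalous stays marked.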
4 changes: 2 additions & 2 deletions nab/runner.py
@@ -87,10 +87,10 @@ def __init__(self,
    self.profiles = None


-  def initialize(self):
+  def initialize(self, remove_duplicate_labels=False):
    """Initialize all the relevant objects for the run."""
    self.corpus = Corpus(self.dataDir)
-   self.corpusLabel = CorpusLabel(path=self.labelPath, corpus=self.corpus)
+   self.corpusLabel = CorpusLabel(path=self.labelPath, corpus=self.corpus, remove_duplicates=remove_duplicate_labels)

    with open(self.profilesPath) as p:
      self.profiles = json.load(p)
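For callers that drive the runner directly instead of going through run.py, the change is backwards compatible. A small sketch, assuming `runner` is a Runner built the same way run.py builds it:

```python
# Default: behaves exactly as before, duplicate rows in the label data are kept.
runner.initialize()

# Opt in to the new behaviour: duplicate label rows are dropped when
# CorpusLabel loads the labels (remove_duplicates=True is forwarded).
runner.initialize(remove_duplicate_labels=True)
```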
5 changes: 3 additions & 2 deletions nab/sweeper.py
@@ -301,17 +301,18 @@ def scoreDataSet(
      scores (list) List of per-row scores, to be saved in score file
      matchingRow (ThresholdScore)
    """
+   threshold = float(threshold)
    anomalyList = self.calcSweepScore(
      timestamps, anomalyScores, windowLimits, dataSetName)
    scoresByThreshold = self.calcScoreByThreshold(anomalyList)

    matchingRow = None
    prevRow = None
    for thresholdScore in scoresByThreshold:
-     if thresholdScore.threshold == threshold:
+     if float(thresholdScore.threshold) == threshold:
        matchingRow = thresholdScore
        break
-     elif thresholdScore.threshold < threshold:
+     elif float(thresholdScore.threshold) < threshold:
        matchingRow = prevRow
        break
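The explicit coercion matters because the threshold can arrive with a different type on each side of the comparison (for example as a string parsed from a thresholds file; the exact source is an assumption, it is not visible in this diff). A small illustration:

```python
# Mimics the old comparison in scoreDataSet when the incoming threshold
# is a string while the value stored on a ThresholdScore row is a float.
threshold = "0.5"        # e.g. a threshold read back from a results/config file
row_threshold = 0.5      # value stored on the ThresholdScore row

print(row_threshold == threshold)                 # False: no matching row is found
print(float(row_threshold) == float(threshold))   # True: the fixed comparison
```

Under Python 2 the old `<` comparison would not even raise on mixed types (numbers sort before strings), so `matchingRow` could silently be taken from the wrong row; under Python 3 it would raise a TypeError. Coercing both sides to float handles either case.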
7 changes: 6 additions & 1 deletion run.py
@@ -89,7 +89,7 @@ def main(args):
                  thresholdPath=thresholdsFile,
                  numCPUs=numCPUs)

- runner.initialize()
+ runner.initialize(args.removeDuplicateLabels)

  if args.detect:
    detectorConstructors = getDetectorClassConstructors(args.detectors)

@@ -142,6 +142,11 @@ def main(args):
                      default=False,
                      action="store_true")

+ parser.add_argument("--removeDuplicateLabels",
+                     help="If specified will remove any duplicate rows from the labeled NAB data",
+                     default=False,
+                     action="store_true")
+
  parser.add_argument("--dataDir",
                      default="data",
                      help="This holds all the label windows for the corpus.")
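A minimal, self-contained sketch of how the new flag behaves thanks to `action="store_true"`, independent of the rest of run.py:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--removeDuplicateLabels", default=False, action="store_true")

# Existing command lines are unaffected: the flag defaults to False.
print(parser.parse_args([]).removeDuplicateLabels)                           # False

# Passing the flag switches it on; run.py then forwards it via
# runner.initialize(args.removeDuplicateLabels).
print(parser.parse_args(["--removeDuplicateLabels"]).removeDuplicateLabels)  # True
```

So an invocation such as `python run.py --detect --score --removeDuplicateLabels` opts in to the label dedup while existing commands keep their current behaviour (`--detect` is visible in this diff; any other flags in that example are assumed from the existing run.py interface).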