From 54bd25ddfabcc0bc08d87b557b2fc41c08eae9f4 Mon Sep 17 00:00:00 2001
From: Harold
Date: Sat, 24 Feb 2024 05:53:40 -0700
Subject: [PATCH] `[column-parsers]` Add an option to disable na/NA as missing (#399)

* `[column-parsers]` Add an option to disable na/NA as missing

- Not 100% sure this is a great idea or the best implementation
  - c.f., this slack thread: https://clojurians.slack.com/archives/C0BQDEJ8M/p1708666522549399
- Not sure about the option name, feel free to change
- Slightly sketched by this similar-looking macro: https://github.com/techascent/tech.ml.dataset/blob/7b819dd81ccd58812ed189009cede6fe3ec7204b/src/tech/v3/dataset/impl/column_data_process.clj#L19-L26

* Slightly faster formulation - avoid keyword lookups in hot paths.

---------

Co-authored-by: Chris Nuernberger
---
Review notes (not part of the commit message):
- The last column_parsers.clj hunk originally passed the whole `options` map
  as the `disable-na-as-missing?` flag. Any non-nil map is truthy, so "NA"
  stopped being treated as missing even when the option was not set. It now
  reads the flag with `(get options :disable-na-as-missing?)`.
- Line breaks and indentation of this patch were reconstructed; leading
  whitespace on context lines may differ from the target files
  (apply with `git apply --ignore-whitespace` if needed).

 src/tech/v3/dataset/io/column_parsers.clj | 17 ++++++++++-------
 test/tech/v3/dataset_test.clj             |  7 +++++++
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/tech/v3/dataset/io/column_parsers.clj b/src/tech/v3/dataset/io/column_parsers.clj
index 004b8783..d02dc04f 100644
--- a/src/tech/v3/dataset/io/column_parsers.clj
+++ b/src/tech/v3/dataset/io/column_parsers.clj
@@ -172,7 +172,7 @@
 
 (defn- missing-value?
   "Is this a missing value coming from a CSV file"
-  [value]
+  [value disable-na-as-missing?]
   ;;fastpath for numbers
   (cond
     (or (instance? Double value) (instance? Float value))
@@ -181,7 +181,8 @@
     (or (nil? value)
         (.equals "" value)
         (identical? value :tech.v3.dataset/missing)
-        (and (string? value) (.equalsIgnoreCase ^String value "na")))))
+        (and (not disable-na-as-missing?)
+             (string? value) (.equalsIgnoreCase ^String value "na")))))
 
 
 (deftype FixedTypeParser [^IMutList container
@@ -210,7 +211,7 @@
     ;;be in the space of the container or it could require the parse-fn
     ;;to make it.
     (let [parsed-value (cond
-                         (missing-value? value)
+                         (missing-value? value false)
                          :tech.v3.dataset/missing
                          (and (identical? (dtype/datatype value) container-dtype)
                               (not (instance? String value)))
@@ -380,7 +381,8 @@
                                   ^List promotion-list
                                   column-name
                                   ^:unsynchronized-mutable ^long last-idx
-                                  options]
+                                  options
+                                  disable-na-as-missing?]
   dtype-proto/PECount
   (ecount [_this] (inc last-idx))
   Indexed
@@ -395,7 +397,7 @@
   (addValue [_p idx value]
     (let [parsed-value
           (cond
-            (missing-value? value)
+            (missing-value? value disable-na-as-missing?)
             :tech.v3.dataset/missing
 
 
@@ -467,7 +469,8 @@
                                    parser-datatype-sequence)
                              column-name
                              -1
-                             options)))
+                             options
+                             (get options :disable-na-as-missing?))))
   (^PParser [column-name options]
    (promotional-string-parser column-name default-parser-datatype-sequence options)))
 
@@ -494,7 +497,7 @@
   PParser
   (addValue [_p idx value]
     (set! max-idx idx)
-    (when-not (missing-value? value)
+    (when-not (missing-value? value (get options :disable-na-as-missing?))
       (let [val-dtype (fast-dtype value)]
         ;;setup container for new data
         (when-not (identical? container-dtype val-dtype)
diff --git a/test/tech/v3/dataset_test.clj b/test/tech/v3/dataset_test.clj
index 030d4599..c6db978d 100644
--- a/test/tech/v3/dataset_test.clj
+++ b/test/tech/v3/dataset_test.clj
@@ -1733,6 +1733,13 @@
              (-> (ds/select ds :all vec-of-bools)
                  :a)))))
 
+(deftest disable-na-as-missing
+  (let [expected-column ["foo" "NA"]
+        ds1 (ds/->dataset {:a expected-column} {:disable-na-as-missing? true})
+        ds2 (ds/->dataset (for [v expected-column] {:a v}) {:disable-na-as-missing? true})]
+    (is (= expected-column (:a ds1)))
+    (is (= expected-column (:a ds2)))))
+
 (comment
   (require '[criterium.core :as crit])
   (def data (vec (repeatedly 100000 (fn [] {:a (rand-int 20) :b (rand) :c (rand)}))))