Skip to content

Commit

Permalink
[column-parsers] Add an option to disable na/NA as missing
Browse files Browse the repository at this point in the history
 - Not 100% sure this is a great idea or the best implementation
 - c.f., this slack thread: https://clojurians.slack.com/archives/C0BQDEJ8M/p1708666522549399
 - Not sure about the option name, feel free to change
 - Slightly sketched by this similar-looking macro: https://github.com/techascent/tech.ml.dataset/blob/7b819dd81ccd58812ed189009cede6fe3ec7204b/src/tech/v3/dataset/impl/column_data_process.clj#L19-L26
  • Loading branch information
harold committed Feb 23, 2024
1 parent daaf470 commit 69f71f6
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 5 deletions.
11 changes: 6 additions & 5 deletions src/tech/v3/dataset/io/column_parsers.clj
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@

(defn- missing-value?
"Is this a missing value coming from a CSV file"
[value]
[value options]
;;fastpath for numbers
(cond
(or (instance? Double value) (instance? Float value))
Expand All @@ -181,7 +181,8 @@
(or (nil? value)
(.equals "" value)
(identical? value :tech.v3.dataset/missing)
(and (string? value) (.equalsIgnoreCase ^String value "na")))))
(and (not (:disable-na-as-missing? options))
(string? value) (.equalsIgnoreCase ^String value "na")))))


(deftype FixedTypeParser [^IMutList container
Expand Down Expand Up @@ -210,7 +211,7 @@
;;be in the space of the container or it could require the parse-fn
;;to make it.
(let [parsed-value (cond
(missing-value? value)
(missing-value? value {})
:tech.v3.dataset/missing
(and (identical? (dtype/datatype value) container-dtype)
(not (instance? String value)))
Expand Down Expand Up @@ -395,7 +396,7 @@
(addValue [_p idx value]
(let [parsed-value
(cond
(missing-value? value)
(missing-value? value options)
:tech.v3.dataset/missing


Expand Down Expand Up @@ -494,7 +495,7 @@
PParser
(addValue [_p idx value]
(set! max-idx idx)
(when-not (missing-value? value)
(when-not (missing-value? value options)
(let [val-dtype (fast-dtype value)]
;;setup container for new data
(when-not (identical? container-dtype val-dtype)
Expand Down
7 changes: 7 additions & 0 deletions test/tech/v3/dataset_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -1733,6 +1733,13 @@
(-> (ds/select ds :all vec-of-bools)
:a)))))

(deftest disable-na-as-missing
(let [expected-column ["foo" "NA"]
ds1 (ds/->dataset {:a expected-column} {:disable-na-as-missing? true})
ds2 (ds/->dataset (for [v expected-column] {:a v}) {:disable-na-as-missing? true})]
(is (= expected-column (:a ds1)))
(is (= expected-column (:a ds2)))))

(comment
(require '[criterium.core :as crit])
(def data (vec (repeatedly 100000 (fn [] {:a (rand-int 20) :b (rand) :c (rand)}))))
Expand Down

0 comments on commit 69f71f6

Please sign in to comment.