-
-
Notifications
You must be signed in to change notification settings - Fork 35
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[column-parsers]
Add an option to disable na/NA as missing (#399)
* `[column-parsers]` Add an option to disable na/NA as missing
  - Not 100% sure this is a great idea or the best implementation - cf. this Slack thread: https://clojurians.slack.com/archives/C0BQDEJ8M/p1708666522549399
  - Not sure about the option name; feel free to change it
  - Slightly sketched out by this similar-looking macro: https://github.com/techascent/tech.ml.dataset/blob/7b819dd81ccd58812ed189009cede6fe3ec7204b/src/tech/v3/dataset/impl/column_data_process.clj#L19-L26
* Slightly faster formulation - avoid keyword lookups in hot paths.

---------

Co-authored-by: Chris Nuernberger <[email protected]>
- Loading branch information
Showing
2 changed files
with
17 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -172,7 +172,7 @@ | |
|
||
(defn- missing-value? | ||
"Is this a missing value coming from a CSV file" | ||
[value] | ||
[value disable-na-as-missing?] | ||
;;fastpath for numbers | ||
(cond | ||
(or (instance? Double value) (instance? Float value)) | ||
|
@@ -181,7 +181,8 @@ | |
(or (nil? value) | ||
(.equals "" value) | ||
(identical? value :tech.v3.dataset/missing) | ||
(and (string? value) (.equalsIgnoreCase ^String value "na"))))) | ||
(and (not disable-na-as-missing?) | ||
(string? value) (.equalsIgnoreCase ^String value "na"))))) | ||
|
||
|
||
(deftype FixedTypeParser [^IMutList container | ||
|
@@ -210,7 +211,7 @@ | |
;;be in the space of the container or it could require the parse-fn | ||
;;to make it. | ||
(let [parsed-value (cond | ||
(missing-value? value) | ||
(missing-value? value false) | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
cnuernber
Author
Collaborator
|
||
:tech.v3.dataset/missing | ||
(and (identical? (dtype/datatype value) container-dtype) | ||
(not (instance? String value))) | ||
|
@@ -380,7 +381,8 @@ | |
^List promotion-list | ||
column-name | ||
^:unsynchronized-mutable ^long last-idx | ||
options] | ||
options | ||
disable-na-as-missing?] | ||
dtype-proto/PECount | ||
(ecount [_this] (inc last-idx)) | ||
Indexed | ||
|
@@ -395,7 +397,7 @@ | |
(addValue [_p idx value] | ||
(let [parsed-value | ||
(cond | ||
(missing-value? value) | ||
(missing-value? value disable-na-as-missing?) | ||
:tech.v3.dataset/missing | ||
|
||
|
||
|
@@ -467,7 +469,8 @@ | |
parser-datatype-sequence) | ||
column-name | ||
-1 | ||
options))) | ||
options | ||
(get options :disable-na-as-missing?)))) | ||
(^PParser [column-name options] | ||
(promotional-string-parser column-name default-parser-datatype-sequence options))) | ||
|
||
|
@@ -494,7 +497,7 @@ | |
PParser | ||
(addValue [_p idx value] | ||
(set! max-idx idx) | ||
(when-not (missing-value? value) | ||
(when-not (missing-value? value options) | ||
(let [val-dtype (fast-dtype value)] | ||
;;setup container for new data | ||
(when-not (identical? container-dtype val-dtype) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
I do think it's potentially confusing that the new option won't work on this path, but I'm more than happy to wait for someone to hit that - this is all pretty rarefied stuff to begin with.