From 878a29f59be3a2ace641bccd3aa336f7c412fb8b Mon Sep 17 00:00:00 2001
From: Justin Bousquin
Date: Fri, 2 Aug 2024 19:04:03 -0500
Subject: [PATCH] Py open sci review (#89)

* Implementing suggested ruff rules
* isort
* Fix whitespace (many of these were copied from docs example execution - need to confirm it passes docs tests)
* Run test.yml on push to this branch
* Whitespace
* F401 (redundant alias)
* Missed whitespace
* First attempt w/ pre-commit
* Fix indent
* indent/drop name
* Rename .pre-commit-config.yml to .pre-commit-config.yaml yAml
* Update .pre-commit-config.yaml fix file structure
* Reduce .pre-commit-config.yaml Reduce what files it is run on
* Update domains.py Doesn't need to be raw string (see Batalex https://github.com/pyOpenSci/software-submission/issues/157#issuecomment-2256694336)
* Dict doc strings as module level attributes
* Update to main (#88)
* Update domains.py 'Field' -> 'Field***'
* 62 r test ci (#86) Update test_r.yaml to install conda outside r, specifically miniforge, then run on env from setup with current package (vs pip installing main)
* Update .pre-commit-config.yaml From issue: pass_filenames: false in the pre-commit config so that the file discovery is done by Ruff taking into account the includes and excludes configured by the user in their pyproject.toml
* Update .pre-commit-config.yaml Try updating to patch version and specify config in args.
* Update pyproject.toml try without 'docstring-code-format = true' as this may override other settings.
* Update pyproject.toml Try to get pre-commit to see config
* Update pyproject.toml Warning message, so it is getting these settings from the toml?
* Update conf.py E501
* Update basis.py E501
* Update basis.py Moved constant doc-string to module level
* Update clean.py E501
* Update convert.py E501
* Update conf.py lint/format edits
* Update pyproject.toml Without single checking if double is default
* Update pyproject.toml Will move to one or the other (likely default double for ease), but trying to postpone to work through diff
* lint/formatting
* linted
* W293
* black format/lint
* W605 - try pulling r str out of test doc-string and instead as a comment. Comment shouldn't cause problems but this one has in the past.
* I001 (all whitespace except test_harmonize_WQP.py)
* lint conf file
* lint
* Add white space between module doc-string and imports
* Format: add whitespace after mod doc-string
* Add assert for actual2 - where the characteristic-specific function is used instead of the generic.
* Resolved some E501
* Check if new line fails doctest
* Revert to get doc-test passing
* Spread out example df entry
* Spread out dict read out to reduce line length. White space is already normalized for doc-test so this may pass.
* Revert
* Spread out building df for wq_data.WQCharData example.
* Spread out example df for wq_data.measure_mask()
* Shorten length of dict for wq_data.replace_unit_str() & wq_data.apply_conversion() examples
* Attempt to skip E501 on this line
* skip rule on line
* Last attempt to ignore line too long in docstrings (3)
* Update pyproject.toml Drop single quote for lint
* '' -> ""
* Update test.yml Revert back to testing on main only
---
 .github/workflows/lint.yml | 26 +
 .github/workflows/test.yml | 2 +-
 .pre-commit-config.yaml | 7 +
 docs/source/conf.py | 61 +-
 harmonize_wq/__init__.py | 7 +-
 harmonize_wq/basis.py | 130 ++-
 harmonize_wq/clean.py | 170 ++--
 harmonize_wq/convert.py | 362 +++----
 harmonize_wq/domains.py | 674 ++++++-------
 harmonize_wq/harmonize.py | 171 ++--
 harmonize_wq/location.py | 99 +-
 harmonize_wq/tests/test_harmonize_WQP.py | 1113 ++++++++++++----------
 harmonize_wq/visualize.py | 88 +-
 harmonize_wq/wq_data.py | 487 +++++-----
 harmonize_wq/wrangle.py | 211 ++--
 pyproject.toml | 11 +
 16 files changed, 1923 insertions(+), 1696 deletions(-)
 create mode 100644 .github/workflows/lint.yml
 create mode 100644 .pre-commit-config.yaml

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..4e6d006 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,26 @@ +name: lint + +on: + push: + branches: + [pyOpenSci-review] + pull_request: + branches: + [pyOpenSci-review] + + +jobs: + lint: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.x + + - uses: pre-commit/action@v3.0.1 + with: + extra_args: --all-files --show-diff-on-failure \ No newline at end of file diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7ba7476..c946b0f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,7 +3,7 @@ name: test on: push: branches: - [main, dev] + [main] pull_request: branches: [main] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..1e4b8fd --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,7 @@ +repos: +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.5.1 + hooks: + - id: ruff + args: ["--fix", "--show-fixes", --config=pyproject.toml] + - id: ruff-format diff --git a/docs/source/conf.py b/docs/source/conf.py index 4c397a4..1bc7d1d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -9,28 +9,28 @@ # If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here.
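The .pre-commit-config.yaml added earlier in this patch wires Ruff linting and formatting into pre-commit, and the new lint workflow runs the same hooks in CI. As a local usage sketch (standard pre-commit CLI; these commands are not part of the patch itself):

    pip install pre-commit
    pre-commit install                                  # run the hooks on every git commit
    pre-commit run --all-files --show-diff-on-failure   # mirrors what the CI job runs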
+import doctest import os import sys -import doctest sys.path.insert(0, os.path.abspath("..")) sys.path.insert(0, os.path.abspath("../..")) -#from importlib.metadata import version -#import harmonize_wq +# from importlib.metadata import version +# import harmonize_wq # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -project = 'harmonize_wq' -copyright = '2023, US Environmental Protection Agency' -author = 'Justin Bousquin (US Environmental Protection Agency)' +project = "harmonize_wq" +copyright = "2023, US Environmental Protection Agency" +author = "Justin Bousquin (US Environmental Protection Agency)" # ToDO:single source version version = "0.4.0" -#release = version(project) -#release = harmonize_wq.__version__ -#version = '.'.join(release.split('.')[:2]) +# release = version(project) +# release = harmonize_wq.__version__ +# version = ".".join(release.split(".")[:2]) # -- General configuration --------------------------------------------------- @@ -49,10 +49,12 @@ ] autosummary_generate = True # Turn on sphinx.ext.autosummary -html_show_sourcelink = False # Remove 'view source code' from top of page (for html, not python) -templates_path = ['_templates'] -exclude_patterns = ['_build', '_templates'] +# Remove "view source code" from top of page (for html, not python) +html_show_sourcelink = False + +templates_path = ["_templates"] +exclude_patterns = ["_build", "_templates"] # -- Options for HTML output ------------------------------------------------- @@ -62,13 +64,13 @@ html_static_path = [] # Readthedocs theme (may be useful for actions) -# on_rtd is whether on readthedocs.org, this line of code grabbed from docs.readthedocs.org... 
-#on_rtd = os.environ.get("READTHEDOCS", None) == "True" -#if not on_rtd: # only import and set the theme if we're building docs locally +# This line from docs.readthedocs.org, on_rtd is whether on readthedocs.org +# on_rtd = os.environ.get("READTHEDOCS", None) == "True" +# if not on_rtd: # only import and set the theme if we're building docs locally # import sphinx_rtd_theme # html_theme = "sphinx_rtd_theme" # html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] -#html_css_files = ["readthedocs-custom.css"] # Override some CSS settings +# html_css_files = ["readthedocs-custom.css"] # Override some CSS settings # -- Options for doctest ------------------------------------------------------ @@ -76,15 +78,18 @@ doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) # Should enable IGNORE_RESULT option -IGNORE_RESULT = doctest.register_optionflag('IGNORE_RESULT') +IGNORE_RESULT = doctest.register_optionflag("IGNORE_RESULT") OutputChecker = doctest.OutputChecker + + class CustomOutputChecker(OutputChecker): def check_output(self, want, got, optionflags): if IGNORE_RESULT & optionflags: return True return OutputChecker.check_output(self, want, got, optionflags) + doctest.OutputChecker = CustomOutputChecker # -- Options for Napolean output ---------------------------------------------- @@ -99,10 +104,10 @@ def check_output(self, want, got, optionflags): # -- Options for sphinx-contrib\apidoc ---------------------------------------- # NOT currently using apidoc -#apidoc_separate_modules = True -#apidoc_module_dir = "../harmonize_wq" -#apidoc_excluded_paths = ["tests"] -#apidoc_module_first = True +# apidoc_separate_modules = True +# apidoc_module_dir = "../harmonize_wq" +# apidoc_excluded_paths = ["tests"] +# apidoc_module_first = True # -- Options for sphinxcontrib-spelling --------------------------------------- spelling_lang = "en_US" @@ -123,25 +128,25 @@ def check_output(self, want, got, optionflags): "dataretrieval": ( "https://doi-usgs.github.io/dataretrieval-python/", "https://doi-usgs.github.io/dataretrieval-python/objects.inv", - ), + ), "pint": ( "https://pint.readthedocs.io/en/stable/", "https://pint.readthedocs.io/en/stable/objects.inv", - ), + ), "geopandas": ( "https://geopandas.org/en/stable/", "https://geopandas.org/en/stable/objects.inv", - ), + ), "pandas": ( "https://pandas.pydata.org/pandas-docs/stable/", "https://pandas.pydata.org/pandas-docs/stable/objects.inv", - ), + ), "pyproj": ( "https://pyproj4.github.io/pyproj/stable/", "https://pyproj4.github.io/pyproj/stable/objects.inv", - ), + ), "python": ( "https://docs.python.org/3", "https://docs.python.org/3/objects.inv", - ), - } + ), +} diff --git a/harmonize_wq/__init__.py b/harmonize_wq/__init__.py index 3fec437..0d83601 100644 --- a/harmonize_wq/__init__.py +++ b/harmonize_wq/__init__.py @@ -1,7 +1,8 @@ -from harmonize_wq import harmonize -from importlib.metadata import version, PackageNotFoundError +from importlib.metadata import PackageNotFoundError, version + +from harmonize_wq import harmonize as harmonize try: - __version__ = version('harmonize_wq') + __version__ = version("harmonize_wq") except PackageNotFoundError: __version__ = "version-unknown" diff --git a/harmonize_wq/basis.py b/harmonize_wq/basis.py index 03926d1..123a0fe 100644 --- a/harmonize_wq/basis.py +++ b/harmonize_wq/basis.py @@ -1,36 +1,55 @@ # -*- coding: utf-8 -*- -"""Functions to process characteristic basis or return basis dictionary.""" +"""Functions to process characteristic basis or return basis dictionary. 
-import numpy -from warnings import warn -from harmonize_wq.clean import add_qa_flag +Attributes +---------- +unit_basis_dict : dict + Characteristic specific basis dictionary to define basis from units. + Notes + ----- + Dictionary with logic for determining basis from units string and + standard :mod:`pint` units to replace those with. + The structure is {Basis: {standard units: [unit strings with basis]}}. -"""Characteristic specific basis dictionary to define basis from units. + The out_col is often derived from :attr:`WQCharData.char_val`. The desired + basis can be used as a key to subset result. -The out_col is often derived from :attr:`WQCharData.char_val`. The desired -basis can be used as a key to subset result. + Examples + -------- + Get dictionary for Phosphorus and subset for 'as P': -Parameters ----------- -out_col : str - Column name where results are written. - -Returns -------- - dict - Dictionary with logic for determining basis from units string and - standard :mod:`pint` units to replace those with. - The structure is {Basis: {standard units: [unit strings with basis]}}. - -Examples --------- -Get dictionary for Phosphorus and subset for 'as P': - ->>> from harmonize_wq import basis ->>> basis.unit_basis_dict['Phosphorus']['as P'] -{'mg/l': ['mg/l as P', 'mg/l P'], 'mg/kg': ['mg/kg as P', 'mg/kg P']} + >>> from harmonize_wq import basis + >>> basis.unit_basis_dict['Phosphorus']['as P'] + {'mg/l': ['mg/l as P', 'mg/l P'], 'mg/kg': ['mg/kg as P', 'mg/kg P']} + +basis_conversion : dict + Get dictionary of conversion factors to convert basis/speciation. + For example, this is used to convert 'as PO4' to 'as P'. + Dictionary structure {basis: conversion factor}. + + See Also + -------- + :func:`convert.moles_to_mass` + + `Best Practices for Submitting Nutrient Data to the Water Quality eXchange + `_ + +stp_dict : dict + Get standard temperature and pressure to define basis from units. + Dictionary structure {'standard temp' : {'units': [values to replace]}}. + + Notes + ----- + This needs to be updated to include pressure or needs to be renamed. """ + +from warnings import warn + +import numpy + +from harmonize_wq.clean import add_qa_flag + unit_basis_dict = { "Phosphorus": { "as P": {"mg/l": ["mg/l as P", "mg/l P"], "mg/kg": ["mg/kg as P", "mg/kg P"]}, @@ -43,22 +62,6 @@ "Carbon": {}, } -"""basis.bass_conversionGet dictionary of conversion factors to convert basis/speciation. - -basis.bass_conversion. For example, this is used to convert 'as PO4' to 'as P'. - -Returns -------- -dict - Dictionary with structure {basis: conversion factor} - -See Also --------- -:func:`convert.moles_to_mass` - -`Best Practices for Submitting Nutrient Data to the Water Quality eXchange -`_ -""" basis_conversion = { "NH3": 0.822, "NH4": 0.776, @@ -67,17 +70,6 @@ "PO4": 0.326, } -"""basis.stp_dict: Get standard temperature and pressure to define basis from units. - -Notes ------ - This needs to be updated to include pressure or needs to be renamed. - -Returns -------- -dict - Dictionary with {'standard temp' : {'units': [values to replace]}}. -""" stp_dict = {"@25C": {"mg/mL": ["mg/mL @25C"]}} @@ -145,26 +137,25 @@ def basis_from_unit(df_in, basis_dict, unit_col="Units", basis_col="Speciation") """ df = df_in.copy() for base in basis_dict.keys(): - for (new_unit, old_units) in basis_dict[base].items(): + for new_unit, old_units in basis_dict[base].items(): for old_unit in old_units: # TODO: Time test if old_unit in unit_col first? 
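# Illustrative trace of one pass of this loop, using the
# unit_basis_dict['Phosphorus'] entry from the module docstring above:
# base='as P' gives new_unit='mg/l' with old_units=['mg/l as P', 'mg/l P'],
# so matching rows get Speciation set to 'as P' and Units rewritten to 'mg/l'.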
mask = df[unit_col] == old_unit # Update mask if basis_col in df.columns: # Add flags anywhere the values are updated - flag1 = f'{basis_col}: updated from ' + flag1 = f"{basis_col}: updated from " # List of unique basis values basis_list = df.loc[mask, basis_col].dropna().unique() # Loop over existing values in basis field for old_basis in basis_list: - flag = f'{flag1}{old_basis} to {base} (units)' + flag = f"{flag1}{old_basis} to {base} (units)" if old_basis != base: qa_mask = mask & (df[basis_col] == old_basis) - warn(f'Mismatched {flag}', UserWarning) + warn(f"Mismatched {flag}", UserWarning) df = add_qa_flag(df, qa_mask, flag) # Add/update basis from unit df = set_basis(df, mask, base, basis_col) - df[unit_col] = [new_unit if x == old_unit else x - for x in df[unit_col]] + df[unit_col] = [new_unit if x == old_unit else x for x in df[unit_col]] return df @@ -203,7 +194,7 @@ def basis_from_method_spec(df_in): 1 Phosphorus NaN NWIS NaN """ # Basis from MethodSpecificationName - old_col = 'MethodSpecificationName' + old_col = "MethodSpecificationName" df = df_in.copy() # TODO: this seems overly-complex to do a pop from one column to another, # consider _coerce_basis() @@ -229,8 +220,8 @@ def update_result_basis(df_in, basis_col, unit_col): Notes ----- - Rather than creating many new empty columns this function currently overwrites the original - basis_col values. The original values are noted in the QA_flag. + Currently overwrites the original basis_col values rather than create many new empty + columns. The original values are noted in the QA_flag. Parameters ---------- @@ -267,7 +258,8 @@ def update_result_basis(df_in, basis_col, unit_col): ... 'ResultTemperatureBasisText', ... 'Units') ... # doctest: +IGNORE_RESULT - UserWarning: Mismatched ResultTemperatureBasisText: updated from 25 deg C to @25C (units) + UserWarning: Mismatched ResultTemperatureBasisText: updated from 25 deg C to @25C + (units) >>> df_temp_basis[['Units']] Units 0 mg/mL @@ -281,23 +273,23 @@ def update_result_basis(df_in, basis_col, unit_col): # df = df_in.copy() # Basis from unit - if basis_col == 'ResultTemperatureBasisText': + if basis_col == "ResultTemperatureBasisText": df_out = basis_from_unit(df_in.copy(), stp_dict, unit_col, basis_col) # NOTE: in the test case 25 deg C -> @25C - elif basis_col == 'ResultParticleSizeBasisText': + elif basis_col == "ResultParticleSizeBasisText": # NOTE: These are normally 'less than x mm', no errors so far to fix df_out = df_in.copy() - elif basis_col == 'ResultWeightBasisText': + elif basis_col == "ResultWeightBasisText": df_out = df_in.copy() - elif basis_col == 'ResultTimeBasisText': + elif basis_col == "ResultTimeBasisText": df_out = df_in.copy() else: - raise ValueError(f'{basis_col} not recognized basis column') + raise ValueError(f"{basis_col} not recognized basis column") return df_out -def set_basis(df_in, mask, basis, basis_col='Speciation'): +def set_basis(df_in, mask, basis, basis_col="Speciation"): """Update or create basis_col with basis as value. 
Parameters diff --git a/harmonize_wq/clean.py b/harmonize_wq/clean.py index 4007527..08a987f 100644 --- a/harmonize_wq/clean.py +++ b/harmonize_wq/clean.py @@ -1,11 +1,14 @@ # -*- coding: utf-8 -*- """Functions to clean/correct additional columns in subset/entire dataset.""" -#from warnings import warn -from numpy import nan + +# from warnings import warn import dataretrieval.utils +from numpy import nan + from harmonize_wq.convert import convert_unit_series from harmonize_wq.domains import accepted_methods -#from harmonize_wq.wrangle import add_activities_to_df + +# from harmonize_wq.wrangle import add_activities_to_df def datetime(df_in): @@ -24,7 +27,7 @@ def datetime(df_in): Examples -------- Build pandas DataFrame for example: - + >>> from pandas import DataFrame >>> from numpy import nan >>> df = DataFrame({'ActivityStartDate': ['2004-09-01', '2004-07-01',], @@ -44,27 +47,29 @@ def datetime(df_in): [2 rows x 4 columns] """ # Expected columns - date, time, tz = ('ActivityStartDate', - 'ActivityStartTime/Time', - 'ActivityStartTime/TimeZoneCode') + date, time, tz = ( + "ActivityStartDate", + "ActivityStartTime/Time", + "ActivityStartTime/TimeZoneCode", + ) df_out = df_in.copy() # NOTE: even if date, if time is NA datetime is NaT df_out = dataretrieval.utils.format_datetime(df_out, date, time, tz) - df_out = df_out.rename(columns={'datetime': 'Activity_datetime'}) + df_out = df_out.rename(columns={"datetime": "Activity_datetime"}) return df_out -def harmonize_depth(df_in, units='meter'): +def harmonize_depth(df_in, units="meter"): """Create 'Depth' column with result depth values in consistent units. - - The new column is based on values from the 'ResultDepthHeightMeasure/MeasureValue' column and - units from the 'ResultDepthHeightMeasure/MeasureUnitCode' column. - + + New column combines values from the 'ResultDepthHeightMeasure/MeasureValue' column + with units from the 'ResultDepthHeightMeasure/MeasureUnitCode' column. + Notes ----- - If there are errors or unit registry (ureg) updates these are not currently - passed back. In the future activity depth columns may be considered if result depth missing. + Currently unit registry (ureg) updates or errors are not passed back. + In the future activity depth columns may be considered if result depth missing. Parameters ---------- @@ -77,11 +82,11 @@ def harmonize_depth(df_in, units='meter'): ------- df_out : pandas.DataFrame DataFrame with new Depth column replacing 'ResultDepthHeight' columns. - + Examples -------- Build pandas DataFrame for example: - + >>> from pandas import DataFrame >>> from numpy import nan >>> df = DataFrame({'ResultDepthHeightMeasure/MeasureValue': ['3.0', nan, 10], @@ -92,9 +97,9 @@ def harmonize_depth(df_in, units='meter'): 0 3.0 m 1 NaN NaN 2 10 ft - + Get clean 'Depth' column: - + >>> from harmonize_wq import clean >>> clean.harmonize_depth(df)[['ResultDepthHeightMeasure/MeasureValue', ... 'Depth']] @@ -105,16 +110,18 @@ def harmonize_depth(df_in, units='meter'): """ df_out = df_in.copy() # Default columns - meas_col = 'ResultDepthHeightMeasure/MeasureValue' - unit_col = 'ResultDepthHeightMeasure/MeasureUnitCode' + meas_col = "ResultDepthHeightMeasure/MeasureValue" + unit_col = "ResultDepthHeightMeasure/MeasureUnitCode" # Note: there are also 'Activity' cols for both of these & top/bottom depth df_checks(df_out, [meas_col, unit_col]) # Confirm columns in df na_mask = df_out[meas_col].notna() # Mask NA to speed up processing # TODO: if units missing? 
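# Illustrative expectation for the conversion below (values from the
# doctest above): '3.0' with unit 'm' stays 3.0 meter, while 10 with
# unit 'ft' becomes 3.048 meter.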
- params = {'quantity_series': df_out.loc[na_mask, meas_col], - 'unit_series': df_out.loc[na_mask, unit_col], - 'units': units, } + params = { + "quantity_series": df_out.loc[na_mask, meas_col], + "unit_series": df_out.loc[na_mask, unit_col], + "units": units, + } df_out.loc[na_mask, "Depth"] = convert_unit_series(**params) # TODO: where result depth is missing use activity depth? @@ -132,37 +139,39 @@ def df_checks(df_in, columns=None): columns : list, optional List of strings for column names. Default None, uses: 'ResultMeasure/MeasureUnitCode','ResultMeasureValue','CharacteristicName'. - + Examples -------- Build pandas DataFrame for example: - + >>> from pandas import DataFrame >>> df = DataFrame({'CharacteristicName': ['Phosphorus'],}) >>> df CharacteristicName 0 Phosphorus - + Check for existing column: >>> from harmonize_wq import clean >>> clean.df_checks(df, columns=['CharacteristicName']) - + If column is not in DataFrame it throws an AssertionError: - + >>> clean.df_checks(df, columns=['ResultMeasureValue']) Traceback (most recent call last): ... AssertionError: ResultMeasureValue not in DataFrame - + """ if columns is None: # Assign defaults - columns = ('ResultMeasure/MeasureUnitCode', - 'ResultMeasureValue', - 'CharacteristicName') + columns = ( + "ResultMeasure/MeasureUnitCode", + "ResultMeasureValue", + "CharacteristicName", + ) for col in columns: - assert col in df_in.columns, f'{col} not in DataFrame' + assert col in df_in.columns, f"{col} not in DataFrame" def check_precision(df_in, col, limit=3): @@ -171,7 +180,7 @@ def check_precision(df_in, col, limit=3): Notes ----- Be cautious of float type and real vs representable precision. - + Parameters ---------- df_in : pandas.DataFrame @@ -189,15 +198,15 @@ def check_precision(df_in, col, limit=3): """ df_out = df_in.copy() # Create T/F mask based on len of everything after the decimal - c_mask = [len(str(x).split('.')[1]) < limit for x in df_out[col]] - flag = f'{col}: Imprecise: lessthan{limit}decimaldigits' + c_mask = [len(str(x).split(".")[1]) < limit for x in df_out[col]] + flag = f"{col}: Imprecise: lessthan{limit}decimaldigits" df_out = add_qa_flag(df_out, c_mask, flag) # Assign flags return df_out def methods_check(df_in, char_val, methods=None): """Check methods against list of accepted methods. - + Notes ----- This is not fully implemented. @@ -222,11 +231,11 @@ def methods_check(df_in, char_val, methods=None): """ if methods is None: methods = accepted_methods - method_col = 'ResultAnalyticalMethod/MethodIdentifier' + method_col = "ResultAnalyticalMethod/MethodIdentifier" df2 = df_in.copy() # TODO: check df for method_col - char_mask = df2['CharacteristicName'] == char_val - methods = [item['Method'] for item in methods[char_val]] + char_mask = df2["CharacteristicName"] == char_val + methods = [item["Method"] for item in methods[char_val]] methods_used = list(set(df2.loc[char_mask, method_col].dropna())) accept = [method for method in methods_used if method in methods] # reject = [method for method in methods_used if method not in methods] @@ -241,7 +250,7 @@ def methods_check(df_in, char_val, methods=None): def wet_dry_checks(df_in, mask=None): """Fix suspected errors in 'ActivityMediaName' column. - + Uses the 'ResultWeightBasisText' and 'ResultSampleFractionText' columns to switch if the media is wet/dry where appropriate. 
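check_precision, earlier in this file's diff, ships without a doctest; a minimal sketch of the flag it assigns (illustrative values, default limit=3):

>>> from pandas import DataFrame
>>> from harmonize_wq import clean
>>> df = DataFrame({'ResultMeasureValue': ['1.0', '0.265']})
>>> clean.check_precision(df, 'ResultMeasureValue')['QA_flag'][0]
'ResultMeasureValue: Imprecise: lessthan3decimaldigits'

Only the first row is flagged: '1.0' carries one decimal digit, while '0.265' already has three. Note the mask expression assumes every value contains a decimal point.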
@@ -259,24 +268,24 @@ def wet_dry_checks(df_in, mask=None): """ df_out = df_in.copy() - media_col = 'ActivityMediaName' + media_col = "ActivityMediaName" # Check columns are in df - df_checks(df_out, [media_col, - 'ResultSampleFractionText', - 'ResultWeightBasisText']) + df_checks(df_out, [media_col, "ResultSampleFractionText", "ResultWeightBasisText"]) # QA - Sample Media, fix assigned 'Water' that are actually 'Sediment' - qa_flag = f'{media_col}: Water changed to Sediment' + qa_flag = f"{media_col}: Water changed to Sediment" # Create mask for bad data - media_mask = ((df_out['ResultSampleFractionText'] == 'Bed Sediment') & - (df_out['ResultWeightBasisText'] == 'Dry') & - (df_out['ActivityMediaName'] == 'Water')) + media_mask = ( + (df_out["ResultSampleFractionText"] == "Bed Sediment") + & (df_out["ResultWeightBasisText"] == "Dry") + & (df_out["ActivityMediaName"] == "Water") + ) # Use mask if user specified, else run on all rows if mask: media_mask = mask & (media_mask) # Assign QA flag where data was bad df_out = add_qa_flag(df_out, media_mask, qa_flag) # Fix the data - df_out.loc[media_mask, 'ActivityMediaName'] = 'Sediment' + df_out.loc[media_mask, "ActivityMediaName"] = "Sediment" return df_out @@ -297,11 +306,11 @@ def add_qa_flag(df_in, mask, flag): ------- df_out : pandas.DataFrame Updated copy of df_in. - + Examples -------- Build pandas DataFrame to use as input: - + >>> from pandas import DataFrame >>> df = DataFrame({'CharacteristicName': ['Carbon', 'Phosphorus', 'Carbon',], ... 'ResultMeasureValue': ['1.0', '0.265', '2.1'],}) @@ -310,12 +319,12 @@ def add_qa_flag(df_in, mask, flag): 0 Carbon 1.0 1 Phosphorus 0.265 2 Carbon 2.1 - + Assign simple flag string and mask to assign flag only to Carbon: - + >>> flag = 'words' >>> mask = df['CharacteristicName']=='Carbon' - + >>> from harmonize_wq import clean >>> clean.add_qa_flag(df, mask, flag) CharacteristicName ResultMeasureValue QA_flag @@ -324,21 +333,20 @@ def add_qa_flag(df_in, mask, flag): 2 Carbon 2.1 words """ df_out = df_in.copy() - if 'QA_flag' not in list(df_out.columns): - df_out['QA_flag'] = nan + if "QA_flag" not in list(df_out.columns): + df_out["QA_flag"] = nan # Append flag where QA_flag is not nan - cond_notna = mask & (df_out['QA_flag'].notna()) # Mask cond and not NA - existing_flags = df_out.loc[cond_notna, 'QA_flag'] # Current QA flags - df_out.loc[cond_notna, 'QA_flag'] = [f'{txt}; {flag}' for - txt in existing_flags] + cond_notna = mask & (df_out["QA_flag"].notna()) # Mask cond and not NA + existing_flags = df_out.loc[cond_notna, "QA_flag"] # Current QA flags + df_out.loc[cond_notna, "QA_flag"] = [f"{txt}; {flag}" for txt in existing_flags] # Equals flag where QA_flag is nan - df_out.loc[mask & (df_out['QA_flag'].isna()), 'QA_flag'] = flag + df_out.loc[mask & (df_out["QA_flag"].isna()), "QA_flag"] = flag return df_out -def wet_dry_drop(df_in, wet_dry='wet', char_val=None): +def wet_dry_drop(df_in, wet_dry="wet", char_val=None): """Restrict to only water or only sediment samples. 
Parameters @@ -358,34 +366,34 @@ def wet_dry_drop(df_in, wet_dry='wet', char_val=None): df2 = df_in.copy() if char_val: # Set characteristic mask - c_mask = df2['CharacteristicName'] == char_val + c_mask = df2["CharacteristicName"] == char_val # Adding activities fails on len(df)==0, a do-nothing, end it early if len(df2[c_mask]) == 0: return df2 # Set variables for columns and check they're in df - media_col = 'ActivityMediaName' -# try: + media_col = "ActivityMediaName" + # try: df_checks(df2, media_col) -# except AssertionError: -# warn(f'Warning: {media_col} missing, querying from activities...') - # Try query/join -# if char_val: -# df2 = add_activities_to_df(df2, c_mask) -# else: -# df2 = add_activities_to_df(df2) # no mask, runs on all -# df_checks(df2, [media_col]) # Check it's been added - # if ERROR? - # print('Query and join activities first') + # except AssertionError: + # warn(f'Warning: {media_col} missing, querying from activities...') + # Try query/join + # if char_val: + # df2 = add_activities_to_df(df2, c_mask) + # else: + # df2 = add_activities_to_df(df2) # no mask, runs on all + # df_checks(df2, [media_col]) # Check it's been added + # if ERROR? + # print('Query and join activities first') # Fix wet/dry columns df2 = wet_dry_checks(df2) # Changed from df_in? # Filter wet/dry rows - if wet_dry == 'wet': - media_mask = df2[media_col] == 'Water' - elif wet_dry == 'dry': - media_mask = df2[media_col] == 'Sediment' + if wet_dry == "wet": + media_mask = df2[media_col] == "Water" + elif wet_dry == "dry": + media_mask = df2[media_col] == "Sediment" # Filter characteristic rows if char_val: diff --git a/harmonize_wq/convert.py b/harmonize_wq/convert.py index c5ea6c3..87145ba 100644 --- a/harmonize_wq/convert.py +++ b/harmonize_wq/convert.py @@ -1,42 +1,45 @@ # -*- coding: utf-8 -*- -"""Functions to convert from one unit to another, sometimes using :mod:`pint` decorators. +"""Functions to convert from one unit to another, at times using :mod:`pint` decorators. Contains several unit conversion functions not in :mod:`pint`. """ -from warnings import warn + import math +from warnings import warn + import pandas import pint from numpy import nan -from harmonize_wq.domains import registry_adds_list +from harmonize_wq.domains import registry_adds_list # TODO: does this constant belong here or in domains? 
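# Consistency sketch: the molecular weights defined below reproduce the
# basis.basis_conversion factors, e.g. N/NH3 = 14.01 / 17.03 ~= 0.822 and
# P/PO4 = 30.97 / 94.97 ~= 0.326.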
-PERIODIC_MW = {'Organic carbon': 180.16, - 'C6H12O6': 180.16, - 'Phosphorus': 30.97, - 'P': 30.97, - 'PO4': 94.97, - 'Nitrogen': 14.01, - 'N': 14.01, - 'NO3': 62.01, - 'NO2': 46.01, - 'NH4': 18.04, - 'NH3': 17.03, - 'SiO3': 76.08, - } +PERIODIC_MW = { + "Organic carbon": 180.16, + "C6H12O6": 180.16, + "Phosphorus": 30.97, + "P": 30.97, + "PO4": 94.97, + "Nitrogen": 14.01, + "N": 14.01, + "NO3": 62.01, + "NO2": 46.01, + "NH4": 18.04, + "NH3": 17.03, + "SiO3": 76.08, +} # Molecular weight assumptions: Organic carbon = C6H12O6 # NOTE: for a more complete handling of MW: CalebBell/chemicals u_reg = pint.UnitRegistry() # For use in wrappers # TODO: find more elegant way to do this with all definitions -for definition in registry_adds_list('Turbidity'): +for definition in registry_adds_list("Turbidity"): u_reg.define(definition) -for definition in registry_adds_list('Salinity'): +for definition in registry_adds_list("Salinity"): u_reg.define(definition) -#timeit: 159.17 +# timeit: 159.17 # def convert_unit_series(quantity_series, unit_series, units, ureg=None): # # Convert quantities to float if they aren't already (should be) # if quantity_series.dtype=='O': @@ -52,8 +55,8 @@ # out_list = [val.to(ureg(units)) for val in val_list] # # Re-index to return series # return pandas.Series(out_list, index=quantity_series.index) -#timeit: 27.08 -def convert_unit_series(quantity_series, unit_series, units, ureg=None, errors='raise'): +# timeit: 27.08 +def convert_unit_series(quantity_series, unit_series, units, ureg=None, errors="raise"): """Convert quantities to consistent units. Convert list of quantities (quantity_list), each with a specified old unit, @@ -84,31 +87,31 @@ def convert_unit_series(quantity_series, unit_series, units, ureg=None, errors=' Examples -------- Build series to use as input: - + >>> from pandas import Series >>> quantity_series = Series([1, 10]) >>> unit_series = Series(['mg/l', 'mg/ml',]) Convert series to series of pint Quantity objects in 'mg/l': - + >>> from harmonize_wq import convert >>> convert.convert_unit_series(quantity_series, unit_series, units = 'mg/l') 0 1.0 milligram / liter 1 10000.000000000002 milligram / liter dtype: object """ - if quantity_series.dtype=='O': + if quantity_series.dtype == "O": quantity_series = pandas.to_numeric(quantity_series) # Initialize classes from pint if ureg is None: ureg = pint.UnitRegistry() Q_ = ureg.Quantity - lst_series = [pandas.Series(dtype='object')] + lst_series = [pandas.Series(dtype="object")] # Note: set of series does not preservce order and must be sorted at end for unit in list(set(unit_series)): # Filter quantity_series by unit_series where == unit - f_quant_series = quantity_series.where(unit_series==unit).dropna() + f_quant_series = quantity_series.where(unit_series == unit).dropna() unit_ = ureg(unit) # Set unit once per unit result_list = [Q_(q, unit_) for q in f_quant_series] if unit != units: @@ -116,10 +119,10 @@ def convert_unit_series(quantity_series, unit_series, units, ureg=None, errors=' try: result_list = [val.to(ureg(units)) for val in result_list] except pint.DimensionalityError as exception: - if errors=='skip': + if errors == "skip": # do nothing, leave result_list unconverted warn(f"WARNING: '{unit}' not converted") - elif errors=='ignore': + elif errors == "ignore": # convert to NaN result_list = [nan for val in result_list] warn(f"WARNING: '{unit}' converted to NaN") @@ -147,32 +150,32 @@ def mass_to_moles(ureg, char_val, Q_): ------- pint.Quantity Value in moles of substance. 
- + Examples -------- Build standard pint unit registry: - + >>> import pint >>> ureg = pint.UnitRegistry() - + Build pint quantity: >>> Q_ = 1 * ureg('g') - + >>> from harmonize_wq import convert >>> str(convert.mass_to_moles(ureg, 'Phosphorus', Q_)) '0.03228931223764934 mole' """ # TODO: Not used yet m_w = PERIODIC_MW[char_val] - return Q_.to('moles', 'chemistry', mw=m_w * ureg('g/mol')) + return Q_.to("moles", "chemistry", mw=m_w * ureg("g/mol")) def moles_to_mass(ureg, Q_, basis=None, char_val=None): """Convert moles substance to mass. Either basis or char_val must have a non-default value. - + Parameters ---------- ureg : pint.UnitRegistry @@ -194,14 +197,14 @@ def moles_to_mass(ureg, Q_, basis=None, char_val=None): Examples -------- Build standard pint unit registry: - + >>> import pint >>> ureg = pint.UnitRegistry() - + Build quantity: - + >>> Q_ = 0.265 * ureg('umol') - + >>> from harmonize_wq import convert >>> str(convert.moles_to_mass(ureg, Q_, basis='as P')) '8.20705e-06 gram' @@ -209,14 +212,14 @@ def moles_to_mass(ureg, Q_, basis=None, char_val=None): if basis: # Clean-up basis # print(basis) - if basis.startswith('as '): + if basis.startswith("as "): basis = basis[3:] m_w = PERIODIC_MW[basis] elif char_val: m_w = PERIODIC_MW[char_val] else: raise ValueError("Characteristic Name or basis (Speciation) required") - return Q_.to('g', 'chemistry', mw=m_w / ureg('mol/g')) + return Q_.to("g", "chemistry", mw=m_w / ureg("mol/g")) @u_reg.wraps(u_reg.NTU, u_reg.centimeter) @@ -232,14 +235,14 @@ def cm_to_NTU(val): ------- pint.Quantity The turbidity value in NTU. - + Examples -------- Build standard pint unit registry: - + >>> import pint >>> ureg = pint.UnitRegistry() - + Build cm units aware pint Quantity (already in standard unit registry): >>> turbidity = ureg.Quantity('cm') @@ -247,9 +250,9 @@ def cm_to_NTU(val): '1 centimeter' >>> type(turbidity) - + Convert to cm: - + >>> from harmonize_wq import convert >>> str(convert.cm_to_NTU(str(turbidity))) '3941.8 Nephelometric_Turbidity_Units' @@ -282,13 +285,13 @@ def NTU_to_cm(val): -------- NTU is not a standard pint unit and must be added to a unit registry first (normally done by WQCharData.update_ureg() method): - + >>> import pint >>> ureg = pint.UnitRegistry() >>> from harmonize_wq import domains >>> for definition in domains.registry_adds_list('Turbidity'): - ... ureg.define(definition) - + ... ureg.define(definition) + Build NTU aware pint pint Quantity: >>> turbidity = ureg.Quantity('NTU') @@ -296,9 +299,9 @@ def NTU_to_cm(val): '1 Nephelometric_Turbidity_Units' >>> type(turbidity) - + Convert to cm: - + >>> from harmonize_wq import convert >>> str(convert.NTU_to_cm('1 NTU')) '241.27 centimeter' @@ -316,7 +319,7 @@ def NTU_to_cm(val): @u_reg.wraps(u_reg.NTU, u_reg.dimensionless) def JTU_to_NTU(val): """Convert turbidity units from JTU (Jackson Turbidity Units) to NTU. - + Notes ----- This is based on linear relationship: 1 -> 19, 0.053 -> 1, 0.4 -> 7.5 @@ -330,18 +333,18 @@ def JTU_to_NTU(val): ------- NTU : pint.Quantity The turbidity value in dimensionless NTU. - + Examples -------- JTU is not a standard pint unit and must be added to a unit registry first (normally done by WQCharData.update_ureg() method): - + >>> import pint >>> ureg = pint.UnitRegistry() >>> from harmonize_wq import domains >>> for definition in domains.registry_adds_list('Turbidity'): - ... ureg.define(definition) - + ... 
ureg.define(definition) + Build JTU units aware pint Quantity: >>> turbidity = ureg.Quantity('JTU') @@ -349,9 +352,9 @@ def JTU_to_NTU(val): '1 Jackson_Turbidity_Units' >>> type(turbidity) - + Convert to NTU: - + >>> from harmonize_wq import convert >>> str(convert.JTU_to_NTU(str(turbidity))) '18.9773 Nephelometric_Turbidity_Units' @@ -362,13 +365,13 @@ def JTU_to_NTU(val): # from Maceina, M. J., & Soballe, D. M. (1990). # Wind-related limnological variation in Lake Okeechobee, Florida. # Lake and Reservoir Management, 6(1), 93-100. - return 19.025*val - 0.0477 + return 19.025 * val - 0.0477 @u_reg.wraps(u_reg.NTU, u_reg.dimensionless) def SiO2_to_NTU(val): """Convert turbidity units from SiO2 (silicon dioxide) to NTU. - + Notes ----- This is based on a linear relationship: 0.13 -> 1, 1 -> 7.5, 2.5 -> 19 @@ -382,18 +385,18 @@ def SiO2_to_NTU(val): ------- NTU : pint.Quantity.build_quantity_class The turbidity value in dimensionless NTU. - + Examples -------- SiO2 is not a standard pint unit and must be added to a unit registry first (normally done using WQCharData.update_ureg() method): - + >>> import pint >>> ureg = pint.UnitRegistry() >>> from harmonize_wq import domains >>> for definition in domains.registry_adds_list('Turbidity'): - ... ureg.define(definition) - + ... ureg.define(definition) + Build SiO2 units aware pint Quantity: >>> turbidity = ureg.Quantity('SiO2') @@ -401,9 +404,9 @@ def SiO2_to_NTU(val): '1 SiO2' >>> type(turbidity) - + Convert to NTU: - + >>> from harmonize_wq import convert >>> str(convert.SiO2_to_NTU(str(turbidity))) '7.5701 Nephelometric_Turbidity_Units' @@ -425,25 +428,26 @@ def FNU_to_NTU(val): ------- NTU : float The turbidity magnitude (NTU is dimensionless). - + Examples -------- Convert to NTU: >>> from harmonize_wq import convert >>> convert.FNU_to_NTU(8) - 10.136 + 10.136 """ return val * 1.267 -@u_reg.wraps(u_reg.gram/u_reg.kilogram, (u_reg.gram/u_reg.liter, - u_reg.standard_atmosphere, - u_reg.degree_Celsius)) -def density_to_PSU(val, - pressure=1*u_reg("atm"), - temperature=u_reg.Quantity(25, u_reg("degC"))): +@u_reg.wraps( + u_reg.gram / u_reg.kilogram, + (u_reg.gram / u_reg.liter, u_reg.standard_atmosphere, u_reg.degree_Celsius), +) +def density_to_PSU( + val, pressure=1 * u_reg("atm"), temperature=u_reg.Quantity(25, u_reg("degC")) +): """Convert salinity as density (mass/volume) to Practical Salinity Units. Parameters @@ -459,52 +463,53 @@ def density_to_PSU(val, ------- PSU : pint.Quantity.build_quantity_class The salinity value in dimensionless PSU. - + Examples -------- - PSU (Practical Salinity Units) is not a standard pint unit and must be added to a unit registry - first (normally done by WQCharData.update_ureg() method): - + PSU (Practical Salinity Units) is not a standard pint unit and must be added to a + unit registry first (normally done by WQCharData.update_ureg() method): + >>> import pint >>> ureg = pint.UnitRegistry() >>> from harmonize_wq import domains >>> for definition in domains.registry_adds_list('Salinity'): - ... ureg.define(definition) - + ... 
ureg.define(definition) + Build units aware pint Quantity, as string: - + >>> input_density = '1000 milligram / milliliter' - + Convert to Practical Salinity Units: - + >>> from harmonize_wq import convert >>> convert.density_to_PSU(input_density) """ # Standard Reference Value - ref = 35.16504/35.0 + ref = 35.16504 / 35.0 # density of pure water is ~1000 mg/mL if val > 1000: - PSU = (float(val)*ref)-1000 + PSU = (float(val) * ref) - 1000 else: - PSU = ((float(val)+1000)*ref)-1000 + PSU = ((float(val) + 1000) * ref) - 1000 # print('{} mg/ml == {} ppth'.format(val, PSU)) # multiply by 33.45 @26C, 33.44 @25C return PSU -@u_reg.wraps(u_reg.milligram/u_reg.milliliter, (u_reg.dimensionless, - u_reg.standard_atmosphere, - u_reg.degree_Celsius)) -def PSU_to_density(val, - pressure=1*u_reg("atm"), - temperature=u_reg.Quantity(25, u_reg("degC"))): +@u_reg.wraps( + u_reg.milligram / u_reg.milliliter, + (u_reg.dimensionless, u_reg.standard_atmosphere, u_reg.degree_Celsius), +) +def PSU_to_density( + val, pressure=1 * u_reg("atm"), temperature=u_reg.Quantity(25, u_reg("degC")) +): """Convert salinity as Practical Salinity Units (PSU) to density. Dimensionality changes from dimensionless Practical Salinity Units (PSU) to mass/volume density. - + Parameters ---------- val : pint.Quantity @@ -518,57 +523,64 @@ def PSU_to_density(val, ------- density : pint.Quantity.build_quantity_class The salinity value in density units (mg/ml). - + Examples -------- PSU is not a standard pint unit and must be added to a unit registry first. This can be done using the WQCharData.update_ureg method: - + >>> import pint >>> ureg = pint.UnitRegistry() >>> from harmonize_wq import domains >>> for definition in domains.registry_adds_list('Salinity'): - ... ureg.define(definition) - + ... ureg.define(definition) + Build units aware pint Quantity, as string because it is an altered unit registry: - + >>> unit = ureg.Quantity('PSU') >>> unit - + >>> type(unit) - + >>> input_psu = str(8*unit) >>> input_psu '8 Practical_Salinity_Units' - + Convert to density: >>> from harmonize_wq import convert >>> str(convert.PSU_to_density(input_psu)) '997.0540284772519 milligram / milliliter' """ - p, t = pressure, temperature + _p, t = pressure, temperature # Pure water density (see SMOW, Craig 1961) - x = [999.842594, - 6.793952e-2 * t, - -9.095290e-3 * t**2, - 1.001685e-4 * t**3, - -1.120083e-6 * t**4, - 6.536336e-9 * t**5] + x = [ + 999.842594, + 6.793952e-2 * t, + -9.095290e-3 * t**2, + 1.001685e-4 * t**3, + -1.120083e-6 * t**4, + 6.536336e-9 * t**5, + ] pure_water = sum(x) # Constants - a0 = [-4.0899e-3*t, 7.6438e-5*(t**2), -8.2467e-7*(t**3), 5.3875e-9*(t**4)] + a0 = [ + -4.0899e-3 * t, + 7.6438e-5 * (t**2), + -8.2467e-7 * (t**3), + 5.3875e-9 * (t**4), + ] a = 8.24493e-1 + sum(a0) - b0 = [-5.72466e-3, 1.0227e-4*t, -1.6546e-6*(t**2)] + b0 = [-5.72466e-3, 1.0227e-4 * t, -1.6546e-6 * (t**2)] b = sum(b0) - density = pure_water + a*val + b*(val**(3/2)) + 4.8314e-4*(val**2) + density = pure_water + a * val + b * (val ** (3 / 2)) + 4.8314e-4 * (val**2) # # UNESCO 1983 Eqn.(13) p17. 
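A quick arithmetic check of the SMOW pure-water polynomial above (a sketch, not part of the patch): at t = 25 the terms sum to the accepted density of pure water at 25 degC, ~997.048 kg/m**3:

>>> t = 25
>>> x = [999.842594, 6.793952e-2 * t, -9.095290e-3 * t**2,
...      1.001685e-4 * t**3, -1.120083e-6 * t**4, 6.536336e-9 * t**5]
>>> round(sum(x), 3)
997.048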
@@ -585,14 +597,15 @@ def PSU_to_density(val, return density -@u_reg.wraps(u_reg.milligram/u_reg.liter, (None, - u_reg.standard_atmosphere, - u_reg.degree_Celsius)) -def DO_saturation(val, - pressure=1*u_reg("atm"), - temperature=u_reg.Quantity(25, u_reg("degC"))): +@u_reg.wraps( + u_reg.milligram / u_reg.liter, + (None, u_reg.standard_atmosphere, u_reg.degree_Celsius), +) +def DO_saturation( + val, pressure=1 * u_reg("atm"), temperature=u_reg.Quantity(25, u_reg("degC")) +): """Convert Dissolved Oxygen (DO) from saturation (%) to concentration (mg/l). - + Defaults assume STP where pressure is 1 atmosphere and temperature 25C. Parameters @@ -608,13 +621,13 @@ def DO_saturation(val, ------- pint.Quantity DO value in mg/l. - + Examples -------- >>> from harmonize_wq import convert >>> convert.DO_saturation(70) - + At 2 atm (10m depth) >>> convert.DO_saturation(70, ('2 standard_atmosphere')) 11.746159340060716 milligram / liter @@ -624,15 +637,16 @@ def DO_saturation(val, cP = 8.262332418 else: cP = _DO_concentration_eq(p, t) - return float(val)/100 * cP # Divide by 100? + return float(val) / 100 * cP # Divide by 100? -@u_reg.wraps(None, (u_reg.milligram/u_reg.liter, - u_reg.standard_atmosphere, - u_reg.degree_Celsius)) -def DO_concentration(val, - pressure=1*u_reg("atm"), - temperature=u_reg.Quantity(25, u_reg("degC"))): +@u_reg.wraps( + None, + (u_reg.milligram / u_reg.liter, u_reg.standard_atmosphere, u_reg.degree_Celsius), +) +def DO_concentration( + val, pressure=1 * u_reg("atm"), temperature=u_reg.Quantity(25, u_reg("degC")) +): """Convert Dissolved Oxygen (DO) from concentration (mg/l) to saturation (%). Parameters @@ -648,13 +662,13 @@ def DO_concentration(val, ------- float Dissolved Oxygen (DO) as saturation (dimensionless). - + Examples -------- Build units aware pint Quantity, as string: - + >>> input_DO = '578 mg/l' - + >>> from harmonize_wq import convert >>> convert.DO_concentration(input_DO) 6995.603308586222 @@ -664,32 +678,37 @@ def DO_concentration(val, cP = 8.262332418 else: cP = _DO_concentration_eq(p, t) - return 100*val /cP + return 100 * val / cP def _DO_concentration_eq(p, t): - """ Equilibrium oxygen concentration at non-standard""" + """Equilibrium oxygen concentration at non-standard""" # https://www.waterontheweb.org/under/waterquality/oxygen.html#:~: # text=Oxygen%20saturation%20is%20calculated%20as, # concentration%20at%20100%25%20saturation%20decreases. 
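# Worked example (sketch) using the constants above: at the default
# 1 atm / 25 degC, cP = 8.262332418 mg/l, so DO_saturation(70) returns
# 0.70 * 8.262332418 ~= 5.784 milligram / liter.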
tk = t + 273.15 # t in kelvin (t is in C) - standard = 0.000975 - (1.426e-05*t) + (6.436e-08*(t**2)) # Theta + standard = 0.000975 - (1.426e-05 * t) + (6.436e-08 * (t**2)) # Theta # partial pressure of water vapor, atm - Pwv = math.exp(11.8571 - (3840.7/tk) - (216961/(tk**2))) + Pwv = math.exp(11.8571 - (3840.7 / tk) - (216961 / (tk**2))) # equilibrium oxygen concentration at std pres of 1 atm cStar = math.exp(7.7117 - 1.31403 * math.log(t + 45.93)) - numerator = (1-Pwv/p)*(1-(standard*p)) - denominator = (1-Pwv)*(1-standard) - - return cStar*p*(numerator/denominator) - - -@u_reg.wraps(u_reg.dimensionless, (u_reg.microsiemens / u_reg.centimeter, - u_reg.standard_atmosphere, - u_reg.degree_Celsius)) -def conductivity_to_PSU(val, - pressure=0*u_reg("atm"), - temperature=u_reg.Quantity(25, u_reg("degC"))): + numerator = (1 - Pwv / p) * (1 - (standard * p)) + denominator = (1 - Pwv) * (1 - standard) + + return cStar * p * (numerator / denominator) + + +@u_reg.wraps( + u_reg.dimensionless, + ( + u_reg.microsiemens / u_reg.centimeter, + u_reg.standard_atmosphere, + u_reg.degree_Celsius, + ), +) +def conductivity_to_PSU( + val, pressure=0 * u_reg("atm"), temperature=u_reg.Quantity(25, u_reg("degC")) +): """Estimate salinity (PSU) from conductivity. Parameters @@ -723,25 +742,25 @@ def conductivity_to_PSU(val, Alan D. Jassby and James E. Cloern (2015). wq: Some tools for exploring water quality monitoring data. R package v0.4.4. See the ec2pss function. - + Adapted from R `cond2sal_shiny `_ - + Examples -------- PSU (Practical Salinity Units) is not a standard pint unit and must be added to a unit registry first: - + >>> import pint >>> ureg = pint.UnitRegistry() >>> from harmonize_wq import domains >>> for definition in domains.registry_adds_list('Salinity'): - ... ureg.define(definition) - + ... ureg.define(definition) + Build units aware pint Quantity, as string: - + >>> input_conductivity = '111.0 uS/cm' - + Convert to Practical Salinity Units: >>> from harmonize_wq import convert @@ -760,22 +779,35 @@ def conductivity_to_PSU(val, # Csw = 42.914 K = 0.0162 Ct = round(val * (1 + 0.0191 * (t - 25)), 0) - R = (Ct/1000)/42.914 + R = (Ct / 1000) / 42.914 # Was rt c = c[0] + (c[1] * t) + (c[2] * t**2) + (c[3] * t**3) + (c[4] * t**4) - Rp = (1 + (p * e[0] + e[1] * p**2 + e[2] * p**3) / - (1 + D[0] * t + D[1] * t**2 + (D[2] + D[3] * t) * R)) - Rt1 = R/(Rp * c) - dS = ((b[0] + b[1] * Rt1**(1/2) + - b[2] * Rt1**(2/2) + - b[3] * Rt1**(3/2) + - b[4] * Rt1**(4/2) + - b[5] * Rt1**(5/2)) * - (t - 15)/(1 + K * (t - 15))) - S = (a[0] + a[1] * Rt1**(1/2) + - a[2] * Rt1**(2/2) + a[3] * Rt1**(3/2) + - a[4] * Rt1**(4/2) + a[5] * Rt1**(5/2) + dS) + Rp = 1 + (p * e[0] + e[1] * p**2 + e[2] * p**3) / ( + 1 + D[0] * t + D[1] * t**2 + (D[2] + D[3] * t) * R + ) + Rt1 = R / (Rp * c) + dS = ( + ( + b[0] + + b[1] * Rt1 ** (1 / 2) + + b[2] * Rt1 ** (2 / 2) + + b[3] * Rt1 ** (3 / 2) + + b[4] * Rt1 ** (4 / 2) + + b[5] * Rt1 ** (5 / 2) + ) + * (t - 15) + / (1 + K * (t - 15)) + ) + S = ( + a[0] + + a[1] * Rt1 ** (1 / 2) + + a[2] * Rt1 ** (2 / 2) + + a[3] * Rt1 ** (3 / 2) + + a[4] * Rt1 ** (4 / 2) + + a[5] * Rt1 ** (5 / 2) + + dS + ) # TODO: implement these two lines? Shouldn't encounter NaN. # S[is.na(S<0)]<-NA # if <0 or NA set as nan diff --git a/harmonize_wq/domains.py b/harmonize_wq/domains.py index f741199..52e60cd 100644 --- a/harmonize_wq/domains.py +++ b/harmonize_wq/domains.py @@ -4,65 +4,123 @@ These are mainly for use as filters. Small or frequently utilized domains may be hard-coded. 
A URL based method can be used to get the most up to date domain list. + +Attributes +---------- +accepted_methods : dict + Get accepted methods for each characteristic. Dictionary where key is + characteristic column name and value is list of dictionaries each with Source + and Method keys. + + Notes + ----- + Source should be in 'ResultAnalyticalMethod/MethodIdentifierContext' + column. This is not fully implemented. + +stations_rename : dict + Get shortened column names for shapefile (.shp) fields. + + Dictionary where key = WQP field name and value = short name for .shp. + + ESRI places a length restriction on shapefile (.shp) field names. This + returns a dictionary with the original water quality portal field name (as + key) and shortened column name for writing as .shp. We suggest using the + longer original name as the field alias when writing as .shp. + + Examples + -------- + Although running the function returns the full dictionary of Key:Value + pairs, here we show how the current name can be used as a key to get the + new name: + + >>> domains.stations_rename['OrganizationIdentifier'] + 'org_ID' + +xy_datum : dict + + Get dictionary of expected horizontal datums, where exhaustive: + {HorizontalCoordinateReferenceSystemDatumName: {Description:str, + EPSG:int}} + + The structure has {key as expected string: value as {"Description": string + and "EPSG": integer (4-digit code)}. + + Notes + ----- + source WQP: HorizontalCoordinateReferenceSystemDatum_CSV.zip + + Anything not in dict will be NaN, and non-integer EPSG will be missing: + "OTHER": {"Description": 'Other', "EPSG": nan}, + "UNKWN": {"Description": 'Unknown', "EPSG": nan} + + Examples + -------- + Running the function returns the full dictionary with {abbreviation: + {'Description':values, 'EPSG':values}}. 
The abbreviation key can be used to + get the EPSG code: + + >>> domains.xy_datum['NAD83'] + {'Description': 'North American Datum 1983', 'EPSG': 4269} + >>> domains.xy_datum['NAD83']['EPSG'] + 4269 """ -import requests -import pandas +import pandas +import requests -BASE_URL = 'https://cdx.epa.gov/wqx/download/DomainValues/' -TADA_DATA_URL = r'https://raw.githubusercontent.com/USEPA/EPATADA/' - -UNITS_REPLACE = {'Secchi': {}, - 'DO': {'%': 'percent'}, - 'Temperature': {}, - 'Salinity': {'ppt': 'ppth', - '0/00': 'ppth'}, - 'pH': {'None': 'dimensionless', - 'std units': 'dimensionless'}, - 'Nitrogen': {'cm3/g @STP': 'cm3/g', - 'cm3/g STP': 'cm3/g', - '%': 'percent'}, - 'Conductivity': {'uS': 'uS/cm', - 'umho': 'umho/cm'}, - 'Carbon': {'% by wt': '%', - '%': 'percent'}, - 'Chlorophyll': {'mg/cm3': 'mg/cm**3', - 'mg/m3': 'mg/m**3', - 'mg/m2': 'mg/m**3', - 'ug/cm3': 'ug/cm**3'}, - 'Turbidity': {'mg/l SiO2': 'SiO2', - 'ppm SiO2': 'SiO2'}, - 'Sediment': {'%': 'percent'}, - 'Fecal_Coliform': {'#/100ml': 'CFU/(100ml)', - 'CFU': 'CFU/(100ml)', - 'MPN': 'MPN/(100ml)'}, - 'E_coli': {'#/100ml': 'CFU/(100ml)', - 'CFU': 'CFU/(100ml)', - 'MPN': 'MPN/(100ml)'}, - 'Phosphorus': {'%': 'percent'}, - } - -OUT_UNITS = {'Secchi': 'm', - 'DO': 'mg/l', - 'Temperature': 'degC', - 'Salinity': 'PSU', - 'pH': 'dimensionless', - 'Nitrogen': 'mg/l', - 'Conductivity': 'uS/cm', - 'Carbon': 'mg/l', - 'Chlorophyll': 'mg/l', - 'Turbidity': 'NTU', - 'Sediment': 'g/kg', - 'Fecal_Coliform': 'CFU/(100ml)', - 'E_coli': 'CFU/(100ml)', - 'Phosphorus': 'mg/l' - } +BASE_URL = "https://cdx.epa.gov/wqx/download/DomainValues/" +TADA_DATA_URL = "https://raw.githubusercontent.com/USEPA/EPATADA/" + +UNITS_REPLACE = { + "Secchi": {}, + "DO": {"%": "percent"}, + "Temperature": {}, + "Salinity": {"ppt": "ppth", "0/00": "ppth"}, + "pH": {"None": "dimensionless", "std units": "dimensionless"}, + "Nitrogen": {"cm3/g @STP": "cm3/g", "cm3/g STP": "cm3/g", "%": "percent"}, + "Conductivity": {"uS": "uS/cm", "umho": "umho/cm"}, + "Carbon": {"% by wt": "%", "%": "percent"}, + "Chlorophyll": { + "mg/cm3": "mg/cm**3", + "mg/m3": "mg/m**3", + "mg/m2": "mg/m**3", + "ug/cm3": "ug/cm**3", + }, + "Turbidity": {"mg/l SiO2": "SiO2", "ppm SiO2": "SiO2"}, + "Sediment": {"%": "percent"}, + "Fecal_Coliform": { + "#/100ml": "CFU/(100ml)", + "CFU": "CFU/(100ml)", + "MPN": "MPN/(100ml)", + }, + "E_coli": {"#/100ml": "CFU/(100ml)", "CFU": "CFU/(100ml)", "MPN": "MPN/(100ml)"}, + "Phosphorus": {"%": "percent"}, +} + +OUT_UNITS = { + "Secchi": "m", + "DO": "mg/l", + "Temperature": "degC", + "Salinity": "PSU", + "pH": "dimensionless", + "Nitrogen": "mg/l", + "Conductivity": "uS/cm", + "Carbon": "mg/l", + "Chlorophyll": "mg/l", + "Turbidity": "NTU", + "Sediment": "g/kg", + "Fecal_Coliform": "CFU/(100ml)", + "E_coli": "CFU/(100ml)", + "Phosphorus": "mg/l", +} # Temporary (these are confirmed) -domain_tables = {'ActivityMedia': 'ActivityMedia_CSV', - 'SampleFraction': 'ResultSampleFraction_CSV', - 'ActivityMediaSubdivision': 'ActivityMediaSubdivision_CSV', - 'ResultValueType': 'ResultValueType_CSV'} +domain_tables = { + "ActivityMedia": "ActivityMedia_CSV", + "SampleFraction": "ResultSampleFraction_CSV", + "ActivityMediaSubdivision": "ActivityMediaSubdivision_CSV", + "ResultValueType": "ResultValueType_CSV", +} # Replaces: # get_ActivityMediaName(): # get_SampleFraction(): @@ -70,6 +128,7 @@ # get_ResultValueTypeName(): # get_domain_list(field): + def get_domain_dict(table, cols=None): """Get domain values for specified table. 
@@ -90,7 +149,7 @@ def get_domain_dict(table, cols=None): -------- Return dictionary for domain from WQP table (e.g., 'ResultSampleFraction'), The default keys ('Name') are shown as values ('Description') are long: - + >>> from harmonize_wq import domains >>> domains.get_domain_dict('ResultSampleFraction').keys() # doctest: +NORMALIZE_WHITESPACE dict_keys(['Acid Soluble', 'Bed Sediment', 'Bedload', 'Bioavailable', 'Comb Available', @@ -105,13 +164,12 @@ def get_domain_dict(table, cols=None): 'Total Recoverable', 'Total Residual', 'Total Soluble', 'Unfiltered', 'Unfiltered, field', 'Vapor', 'Volatile', 'Weak Acid Diss', 'Yield', 'non-linear function']) - - """ + """ # noqa: E501 if cols is None: - cols = ['Name', 'Description'] - if not table.endswith('_CSV'): - table += '_CSV' - url = f'{BASE_URL}{table}.zip' + cols = ["Name", "Description"] + if not table.endswith("_CSV"): + table += "_CSV" + url = f"{BASE_URL}{table}.zip" # Very limited url handling if requests.get(url).status_code != 200: status_code = requests.get(url).status_code @@ -122,26 +180,29 @@ def get_domain_dict(table, cols=None): def harmonize_TADA_dict(): """Get structured dictionary from TADA HarmonizationTemplate csv. - + Based on target column names and sample fractions. Returns ------- full_dict : dict - {'TADA.CharacteristicName': {Target.TADA.CharacteristicName: {Target.TADA.ResultSampleFractionText [Target.TADA.ResultSampleFractionText]}}} + {'TADA.CharacteristicName': + {Target.TADA.CharacteristicName: + {Target.TADA.ResultSampleFractionText : + [Target.TADA.ResultSampleFractionText]}}} """ # Note: too nested for refactor into single function w/ char_tbl_TADA # Read from github - csv = f'{TADA_DATA_URL}develop/inst/extdata/HarmonizationTemplate.csv' + csv = f"{TADA_DATA_URL}develop/inst/extdata/HarmonizationTemplate.csv" df = pandas.read_csv(csv) # Read csv url to DataFrame full_dict = {} # Setup results dict # Build dict one unique characteristicName at a time - for char, sub_df in df.groupby('TADA.CharacteristicName'): + for char, sub_df in df.groupby("TADA.CharacteristicName"): full_dict[char] = char_tbl_TADA(sub_df, char) # Build dictionary # Domains to check agaisnt - domain_list = list(get_domain_dict('ResultSampleFraction').keys()) + domain_list = list(get_domain_dict("ResultSampleFraction").keys()) # Update in/out with expected sample Fraction case for k_char, v_char in full_dict.items(): @@ -184,22 +245,31 @@ def re_case(word, domain_list): def char_tbl_TADA(df, char): """Get structured dictionary for TADA.CharacteristicName from TADA df. - + Parameters ---------- df : pandas.DataFrame Table from TADA for specific characteristic. char : str CharacteristicName. - + Returns ------- new_char_dict : dict - {Target.TADA.CharacteristicName: {Target.TADA.ResultSampleFractionText: [Target.TADA.ResultSampleFractionText]} + Returned dictionary follows general structure: + { + "Target.TADA.CharacteristicName": { + "Target.TADA.ResultSampleFractionText": [ + "Target.TADA.ResultSampleFractionText" + ] + } + } """ - cols = ['Target.TADA.CharacteristicName', - 'TADA.ResultSampleFractionText', - 'Target.TADA.ResultSampleFractionText'] + cols = [ + "Target.TADA.CharacteristicName", + "TADA.ResultSampleFractionText", + "Target.TADA.ResultSampleFractionText", + ] sub_df = df[cols].drop_duplicates() # TODO: superfluous? 
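# Illustrative shape of the return value (fraction names taken from the WQP
# domain list above; the mapping itself is hypothetical):
# {'Target char': {'Dissolved': ['Dissolved', 'Filtered, lab']}}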
# Update Output/target columns @@ -211,12 +281,12 @@ def char_tbl_TADA(df, char): # loop over new chars, getting {new_fract: [old fracts]} new_char_dict = {} for new_char in sub_df[cols[0]].unique(): - new_char_df = sub_df[sub_df[cols[0]]==new_char] # Mask by new_char + new_char_df = sub_df[sub_df[cols[0]] == new_char] # Mask by new_char new_fract_dict = {} for new_fract in new_char_df[cols[2]].unique(): # TODO: {nan: []}? Doesn't break but needs handling later # Mask by new_fract - new_fract_df = new_char_df[new_char_df[cols[2]]==new_fract] + new_fract_df = new_char_df[new_char_df[cols[2]] == new_fract] # Add a list of possible old_fract for new_fract key new_fract_dict[new_fract] = new_fract_df[cols[1]].unique() new_char_dict[new_char] = new_fract_dict @@ -226,7 +296,7 @@ def char_tbl_TADA(df, char): def registry_adds_list(out_col): """Get units to add to :mod:`pint` unit registry by out_col column. - + Typically out_col refers back to column used for a value from the 'CharacteristicName' column. @@ -243,7 +313,7 @@ def registry_adds_list(out_col): Examples -------- Generate a new pint unit registry object for e.g., Sediment: - + >>> from harmonize_wq import domains >>> domains.registry_adds_list('Sediment') # doctest: +NORMALIZE_WHITESPACE ['fraction = [] = frac', @@ -255,39 +325,43 @@ def registry_adds_list(out_col): # define is 1% (0.08s) slower than replacement (ppm->mg/l) but more robust # Standard pint unit registry additions for dimensionless portions - pct_list = ['fraction = [] = frac', - 'percent = 1e-2 frac', - 'parts_per_thousand = 1e-3 = ppth', - 'parts_per_million = 1e-6 fraction = ppm', - ] + pct_list = [ + "fraction = [] = frac", + "percent = 1e-2 frac", + "parts_per_thousand = 1e-3 = ppth", + "parts_per_million = 1e-6 fraction = ppm", + ] # Standard pint unit registry additions for dimensionless bacteria units - bacteria_list = ['Colony_Forming_Units = [] = CFU = cfu', - 'Most_Probable_Number = CFU = MPN = mpn', - ] + bacteria_list = [ + "Colony_Forming_Units = [] = CFU = cfu", + "Most_Probable_Number = CFU = MPN = mpn", + ] # characteristic based dict - ureg_adds = {'Secchi': [], - 'DO': pct_list, - 'Temperature': [], - 'Salinity': pct_list + - ['Practical_Salinity_Units = ppth = PSU = PSS'], - 'pH': [], - 'Nitrogen': [], - 'Conductivity': [], - 'Carbon': pct_list, - 'Chlorophyll': [], - 'Turbidity': ['Nephelometric_Turbidity_Units = [turbidity] = NTU', - 'Nephelometric_Turbidity_Ratio_Units = NTU = NTRU', - 'Nephelometric_Turbidity_Multibeam_Units = NTU = NTMU', - 'Formazin_Nephelometric_Units = NTU = FNU', - 'Formazin_Nephelometric_Ratio_Units = FNRU = FNU', - 'Formazin_Turbidity_Units = NTU = FNU = FTU = FAU', - 'Jackson_Turbidity_Units = [] = JTU', - 'SiO2 = []'], - 'Sediment': pct_list, - 'Fecal_Coliform': bacteria_list, - 'E_coli': bacteria_list, - 'Phosphorus': [], - } + ureg_adds = { + "Secchi": [], + "DO": pct_list, + "Temperature": [], + "Salinity": pct_list + ["Practical_Salinity_Units = ppth = PSU = PSS"], + "pH": [], + "Nitrogen": [], + "Conductivity": [], + "Carbon": pct_list, + "Chlorophyll": [], + "Turbidity": [ + "Nephelometric_Turbidity_Units = [turbidity] = NTU", + "Nephelometric_Turbidity_Ratio_Units = NTU = NTRU", + "Nephelometric_Turbidity_Multibeam_Units = NTU = NTMU", + "Formazin_Nephelometric_Units = NTU = FNU", + "Formazin_Nephelometric_Ratio_Units = FNRU = FNU", + "Formazin_Turbidity_Units = NTU = FNU = FTU = FAU", + "Jackson_Turbidity_Units = [] = JTU", + "SiO2 = []", + ], + "Sediment": pct_list, + "Fecal_Coliform": bacteria_list, + "E_coli": 
bacteria_list, + "Phosphorus": [], + } return ureg_adds[out_col] @@ -306,7 +380,7 @@ def registry_adds_list(out_col): The function returns the full dictionary {CharacteristicName: out_column_name}. It can be subset by a 'CharactisticName' column value to get the name of the column for results: - + >>> domains.out_col_lookup['Escherichia coli'] 'E_coli' """ @@ -327,7 +401,7 @@ def registry_adds_list(out_col): "Fecal Coliform": "Fecal_Coliform", "Escherichia coli": "E_coli", "Phosphorus": "Phosphorus", - } +} def characteristic_cols(category=None): @@ -349,151 +423,153 @@ def characteristic_cols(category=None): -------- Running the function without a category returns the full list of column names, including a category returns only the columns in that category: - + >>> domains.characteristic_cols('QA') # doctest: +NORMALIZE_WHITESPACE ['ResultDetectionConditionText', 'ResultStatusIdentifier', 'PrecisionValue', 'DataQuality/BiasValue', 'ConfidenceIntervalValue', 'UpperConfidenceLimitValue', 'LowerConfidenceLimitValue', 'ResultCommentText', 'ResultSamplingPointName', 'ResultDetectionQuantitationLimitUrl'] """ - cols = {'ActivityStartDate': 'activity', - 'ActivityStartTime/Time': 'activity', - 'ActivityStartTime/TimeZoneCode': 'activity', - 'DataLoggerLine': 'measure', - 'ResultDetectionConditionText': 'QA', - 'MethodSpecificationName': 'measure', - 'CharacteristicName': 'measure', - 'ResultSampleFractionText': 'measure', - 'ResultMeasureValue': 'measure', - 'ResultMeasure/MeasureUnitCode': 'measure', - 'MeasureQualifierCode': 'measure', - 'ResultStatusIdentifier': 'QA', - 'ResultIdentifier': 'measure', - 'StatisticalBaseCode': 'measure', - 'ResultValueTypeName': 'measure', - 'ResultWeightBasisText': 'Basis', - 'ResultTimeBasisText': 'Basis', - 'ResultTemperatureBasisText': 'Basis', - 'ResultParticleSizeBasisText': 'Basis', - 'PrecisionValue': 'QA', - 'DataQuality/BiasValue': 'QA', - 'ConfidenceIntervalValue': 'QA', - 'UpperConfidenceLimitValue': 'QA', - 'LowerConfidenceLimitValue': 'QA', - 'ResultCommentText': 'QA', - 'USGSPCode': 'measure', - 'ResultDepthHeightMeasure/MeasureValue': 'Depth', - 'ResultDepthHeightMeasure/MeasureUnitCode': 'Depth', - 'ResultDepthAltitudeReferencePointText': 'Depth', - 'ResultSamplingPointName': 'QA', - 'BiologicalIntentName': 'Bio', - 'BiologicalIndividualIdentifier': 'BIO', - 'SubjectTaxonomicName': 'Bio', - 'UnidentifiedSpeciesIdentifier': 'BIO', - 'SampleTissueAnatomyName': 'Bio', - 'GroupSummaryCountWeight/MeasureValue': 'Bio', - 'GroupSummaryCountWeight/MeasureUnitCode': 'Bio', - 'CellFormName': 'Bio', - 'CellShapeName': 'Bio', - 'HabitName': 'Bio', - 'VoltismName': 'Bio', - 'TaxonomicPollutionTolerance': 'Bio', - 'TaxonomicPollutionToleranceScaleText': 'Bio', - 'TrophicLevelName': 'Bio', - 'FunctionalFeedingGroupName': 'Bio', - 'TaxonomicDetailsCitation/ResourceTitleName': 'Bio', - 'TaxonomicDetailsCitation/ResourceCreatorName': 'Bio', - 'TaxonomicDetailsCitation/ResourceSubjectText': 'Bio', - 'TaxonomicDetailsCitation/ResourcePublisherName': 'Bio', - 'TaxonomicDetailsCitation/ResourceDate': 'Bio', - 'TaxonomicDetailsCitation/ResourceIdentifier': 'Bio', - 'FrequencyClassInformationUrl': 'Bio', - 'ResultAnalyticalMethod/MethodIdentifier': 'measure', - 'ResultAnalyticalMethod/MethodIdentifierContext': 'measure', - 'ResultAnalyticalMethod/MethodName': 'measure', - 'ResultAnalyticalMethod/MethodUrl': 'measure', - 'ResultAnalyticalMethod/MethodQualifierTypeName': 'measure', - 'MethodDescriptionText': 'measure', - 'LaboratoryName': 'analysis', - 
'AnalysisStartDate': 'analysis', - 'AnalysisStartTime/Time': 'analysis', - 'AnalysisStartTime/TimeZoneCode': 'analysis', - 'AnalysisEndDate': 'analysis', - 'AnalysisEndTime/Time': 'analysis', - 'AnalysisEndTime/TimeZoneCode': 'analysis', - 'ResultLaboratoryCommentCode': 'analysis', - 'ResultLaboratoryCommentText': 'analysis', - 'ResultDetectionQuantitationLimitUrl': 'QA', - 'LaboratoryAccreditationIndicator': 'analysis', - 'LaboratoryAccreditationAuthorityName': 'analysis', - 'TaxonomistAccreditationIndicator': 'analysis', - 'TaxonomistAccreditationAuthorityName': 'analysis', - 'LabSamplePreparationUrl': 'analysis', - 'ActivityTypeCode': 'activity', - 'ActivityMediaName': 'activity', - 'ActivityMediaSubdivisionName': 'activity', - 'ActivityEndDate': 'activity', - 'ActivityEndTime/Time': 'activity', - 'ActivityEndTime/TimeZoneCode': 'activity', - 'ActivityRelativeDepthName': 'depth', - 'ActivityDepthHeightMeasure/MeasureValue': 'depth', - 'ActivityDepthHeightMeasure/MeasureUnitCode': 'depth', - 'ActivityDepthAltitudeReferencePointText': 'depth', - 'ActivityTopDepthHeightMeasure/MeasureValue': 'depth', - 'ActivityTopDepthHeightMeasure/MeasureUnitCode': 'depth', - 'ActivityBottomDepthHeightMeasure/MeasureValue': 'depth', - 'ActivityBottomDepthHeightMeasure/MeasureUnitCode': 'depth', - 'ActivityConductingOrganizationText': 'activity', - 'ActivityCommentText': 'activity', - 'SampleAquifer': 'activity', - 'HydrologicCondition': 'activity', - 'HydrologicEvent': 'activity', - 'ActivityLocation/LatitudeMeasure': 'activity', - 'ActivityLocation/LongitudeMeasure': 'activity', - 'ActivityLocation/SourceMapScaleNumeric': 'activity', - 'ActivityLocation/HorizontalAccuracyMeasure/MeasureValue': 'activity', - 'ActivityLocation/HorizontalAccuracyMeasure/MeasureUnitCode': 'activity', - 'ActivityLocation/HorizontalCollectionMethodName': 'activity', - 'ActivityLocation/HorizontalCoordinateReferenceSystemDatumName': 'activity', - 'AssemblageSampledName': 'sample', - 'CollectionDuration/MeasureValue': 'sample', - 'CollectionDuration/MeasureUnitCode': 'sample', - 'SamplingComponentName': 'sample', - 'SamplingComponentPlaceInSeriesNumeric': 'sample', - 'ReachLengthMeasure/MeasureValue': 'sample', - 'ReachLengthMeasure/MeasureUnitCode': 'sample', - 'ReachWidthMeasure/MeasureValue': 'sample', - 'ReachWidthMeasure/MeasureUnitCode': 'sample', - 'PassCount': 'sample', - 'NetTypeName': 'sample', - 'NetSurfaceAreaMeasure/MeasureValue': 'sample', - 'NetSurfaceAreaMeasure/MeasureUnitCode': 'sample', - 'NetMeshSizeMeasure/MeasureValue': 'sample', - 'NetMeshSizeMeasure/MeasureUnitCode': 'sample', - 'BoatSpeedMeasure/MeasureValue': 'sample', - 'BoatSpeedMeasure/MeasureUnitCode': 'sample', - 'CurrentSpeedMeasure/MeasureValue': 'sample', - 'CurrentSpeedMeasure/MeasureUnitCode': 'sample', - 'ToxicityTestType': 'analysis', - 'SampleCollectionMethod/MethodIdentifier': 'sample', - 'SampleCollectionMethod/MethodIdentifierContext': 'sample', - 'SampleCollectionMethod/MethodName': 'sample', - 'SampleCollectionMethod/MethodQualifierTypeName': 'sample', - 'SampleCollectionMethod/MethodDescriptionText': 'sample', - 'SampleCollectionEquipmentName': 'sample', - 'SampleCollectionMethod/SampleCollectionEquipmentCommentText': 'sample', - 'SamplePreparationMethod/MethodIdentifier': 'sample', - 'SamplePreparationMethod/MethodIdentifierContext': 'sample', - 'SamplePreparationMethod/MethodName': 'sample', - 'SamplePreparationMethod/MethodQualifierTypeName': 'sample', - 'SamplePreparationMethod/MethodDescriptionText': 'sample', - 
'SampleContainerTypeName': 'sample', - 'SampleContainerColorName': 'sample', - 'ChemicalPreservativeUsedName': 'analysis', - 'ThermalPreservativeUsedName': 'analysis', - 'SampleTransportStorageDescription': 'analysis', - 'ActivityMetricUrl': 'activity', - 'PreparationStartDate': 'analysis',} + cols = { + "ActivityStartDate": "activity", + "ActivityStartTime/Time": "activity", + "ActivityStartTime/TimeZoneCode": "activity", + "DataLoggerLine": "measure", + "ResultDetectionConditionText": "QA", + "MethodSpecificationName": "measure", + "CharacteristicName": "measure", + "ResultSampleFractionText": "measure", + "ResultMeasureValue": "measure", + "ResultMeasure/MeasureUnitCode": "measure", + "MeasureQualifierCode": "measure", + "ResultStatusIdentifier": "QA", + "ResultIdentifier": "measure", + "StatisticalBaseCode": "measure", + "ResultValueTypeName": "measure", + "ResultWeightBasisText": "Basis", + "ResultTimeBasisText": "Basis", + "ResultTemperatureBasisText": "Basis", + "ResultParticleSizeBasisText": "Basis", + "PrecisionValue": "QA", + "DataQuality/BiasValue": "QA", + "ConfidenceIntervalValue": "QA", + "UpperConfidenceLimitValue": "QA", + "LowerConfidenceLimitValue": "QA", + "ResultCommentText": "QA", + "USGSPCode": "measure", + "ResultDepthHeightMeasure/MeasureValue": "Depth", + "ResultDepthHeightMeasure/MeasureUnitCode": "Depth", + "ResultDepthAltitudeReferencePointText": "Depth", + "ResultSamplingPointName": "QA", + "BiologicalIntentName": "Bio", + "BiologicalIndividualIdentifier": "BIO", + "SubjectTaxonomicName": "Bio", + "UnidentifiedSpeciesIdentifier": "BIO", + "SampleTissueAnatomyName": "Bio", + "GroupSummaryCountWeight/MeasureValue": "Bio", + "GroupSummaryCountWeight/MeasureUnitCode": "Bio", + "CellFormName": "Bio", + "CellShapeName": "Bio", + "HabitName": "Bio", + "VoltismName": "Bio", + "TaxonomicPollutionTolerance": "Bio", + "TaxonomicPollutionToleranceScaleText": "Bio", + "TrophicLevelName": "Bio", + "FunctionalFeedingGroupName": "Bio", + "TaxonomicDetailsCitation/ResourceTitleName": "Bio", + "TaxonomicDetailsCitation/ResourceCreatorName": "Bio", + "TaxonomicDetailsCitation/ResourceSubjectText": "Bio", + "TaxonomicDetailsCitation/ResourcePublisherName": "Bio", + "TaxonomicDetailsCitation/ResourceDate": "Bio", + "TaxonomicDetailsCitation/ResourceIdentifier": "Bio", + "FrequencyClassInformationUrl": "Bio", + "ResultAnalyticalMethod/MethodIdentifier": "measure", + "ResultAnalyticalMethod/MethodIdentifierContext": "measure", + "ResultAnalyticalMethod/MethodName": "measure", + "ResultAnalyticalMethod/MethodUrl": "measure", + "ResultAnalyticalMethod/MethodQualifierTypeName": "measure", + "MethodDescriptionText": "measure", + "LaboratoryName": "analysis", + "AnalysisStartDate": "analysis", + "AnalysisStartTime/Time": "analysis", + "AnalysisStartTime/TimeZoneCode": "analysis", + "AnalysisEndDate": "analysis", + "AnalysisEndTime/Time": "analysis", + "AnalysisEndTime/TimeZoneCode": "analysis", + "ResultLaboratoryCommentCode": "analysis", + "ResultLaboratoryCommentText": "analysis", + "ResultDetectionQuantitationLimitUrl": "QA", + "LaboratoryAccreditationIndicator": "analysis", + "LaboratoryAccreditationAuthorityName": "analysis", + "TaxonomistAccreditationIndicator": "analysis", + "TaxonomistAccreditationAuthorityName": "analysis", + "LabSamplePreparationUrl": "analysis", + "ActivityTypeCode": "activity", + "ActivityMediaName": "activity", + "ActivityMediaSubdivisionName": "activity", + "ActivityEndDate": "activity", + "ActivityEndTime/Time": "activity", + "ActivityEndTime/TimeZoneCode": 
"activity", + "ActivityRelativeDepthName": "depth", + "ActivityDepthHeightMeasure/MeasureValue": "depth", + "ActivityDepthHeightMeasure/MeasureUnitCode": "depth", + "ActivityDepthAltitudeReferencePointText": "depth", + "ActivityTopDepthHeightMeasure/MeasureValue": "depth", + "ActivityTopDepthHeightMeasure/MeasureUnitCode": "depth", + "ActivityBottomDepthHeightMeasure/MeasureValue": "depth", + "ActivityBottomDepthHeightMeasure/MeasureUnitCode": "depth", + "ActivityConductingOrganizationText": "activity", + "ActivityCommentText": "activity", + "SampleAquifer": "activity", + "HydrologicCondition": "activity", + "HydrologicEvent": "activity", + "ActivityLocation/LatitudeMeasure": "activity", + "ActivityLocation/LongitudeMeasure": "activity", + "ActivityLocation/SourceMapScaleNumeric": "activity", + "ActivityLocation/HorizontalAccuracyMeasure/MeasureValue": "activity", + "ActivityLocation/HorizontalAccuracyMeasure/MeasureUnitCode": "activity", + "ActivityLocation/HorizontalCollectionMethodName": "activity", + "ActivityLocation/HorizontalCoordinateReferenceSystemDatumName": "activity", + "AssemblageSampledName": "sample", + "CollectionDuration/MeasureValue": "sample", + "CollectionDuration/MeasureUnitCode": "sample", + "SamplingComponentName": "sample", + "SamplingComponentPlaceInSeriesNumeric": "sample", + "ReachLengthMeasure/MeasureValue": "sample", + "ReachLengthMeasure/MeasureUnitCode": "sample", + "ReachWidthMeasure/MeasureValue": "sample", + "ReachWidthMeasure/MeasureUnitCode": "sample", + "PassCount": "sample", + "NetTypeName": "sample", + "NetSurfaceAreaMeasure/MeasureValue": "sample", + "NetSurfaceAreaMeasure/MeasureUnitCode": "sample", + "NetMeshSizeMeasure/MeasureValue": "sample", + "NetMeshSizeMeasure/MeasureUnitCode": "sample", + "BoatSpeedMeasure/MeasureValue": "sample", + "BoatSpeedMeasure/MeasureUnitCode": "sample", + "CurrentSpeedMeasure/MeasureValue": "sample", + "CurrentSpeedMeasure/MeasureUnitCode": "sample", + "ToxicityTestType": "analysis", + "SampleCollectionMethod/MethodIdentifier": "sample", + "SampleCollectionMethod/MethodIdentifierContext": "sample", + "SampleCollectionMethod/MethodName": "sample", + "SampleCollectionMethod/MethodQualifierTypeName": "sample", + "SampleCollectionMethod/MethodDescriptionText": "sample", + "SampleCollectionEquipmentName": "sample", + "SampleCollectionMethod/SampleCollectionEquipmentCommentText": "sample", + "SamplePreparationMethod/MethodIdentifier": "sample", + "SamplePreparationMethod/MethodIdentifierContext": "sample", + "SamplePreparationMethod/MethodName": "sample", + "SamplePreparationMethod/MethodQualifierTypeName": "sample", + "SamplePreparationMethod/MethodDescriptionText": "sample", + "SampleContainerTypeName": "sample", + "SampleContainerColorName": "sample", + "ChemicalPreservativeUsedName": "analysis", + "ThermalPreservativeUsedName": "analysis", + "SampleTransportStorageDescription": "analysis", + "ActivityMetricUrl": "activity", + "PreparationStartDate": "analysis", + } if category: # List of key where value is category col_list = [key for key, value in cols.items() if value == category] @@ -502,37 +578,6 @@ def characteristic_cols(category=None): return col_list -"""Get dictionary of expected horizontal datums. - -The structure has {key as expected string: value as {"Description": string -and "EPSG": integer (4-digit code)}. 
- -Notes ------ -source WQP: HorizontalCoordinateReferenceSystemDatum_CSV.zip - -Anything not in dict will be NaN, and non-integer EPSG will be missing: -"OTHER": {"Description": 'Other', "EPSG": nan}, -"UNKWN": {"Description": 'Unknown', "EPSG": nan} - -Returns -------- -dict - Dictionary where exhaustive: - {HorizontalCoordinateReferenceSystemDatumName: {Description:str, - EPSG:int}} - -Examples --------- -Running the function returns the full dictionary with {abbreviation: -{'Description':values, 'EPSG':values}}. The abbreviation key can be used to -get the EPSG code: - ->>> domains.xy_datum['NAD83'] -{'Description': 'North American Datum 1983', 'EPSG': 4269} ->>> domains.xy_datum['NAD83']['EPSG'] -4269 -""" xy_datum = { "NAD27": {"Description": "North American Datum 1927", "EPSG": 4267}, "NAD83": {"Description": "North American Datum 1983", "EPSG": 4269}, @@ -551,31 +596,10 @@ def characteristic_cols(category=None): "HARN": { "Description": "High Accuracy Reference Network for NAD83", "EPSG": 4152, - }, - } + }, +} # Default field mapping writes full name to alias but a short name to field -"""Get shortened column names for shapefile (.shp) fields. - -ESRI places a length restriction on shapefile (.shp) field names. This -returns a dictionary with the original water quality portal field name (as -key) and shortened column name for writing as .shp. We suggest using the -longer original name as the field alias when writing as .shp. - -Returns -------- -field_mapping : dict - Dictionary where key = WQP field name and value = short name for .shp. - -Examples --------- -Although running the function returns the full dictionary of Key:Value -pairs, here we show how the current name can be used as a key to get the -new name: - ->>> domains.stations_rename['OrganizationIdentifier'] -'org_ID' -""" stations_rename = { "OrganizationIdentifier": "org_ID", "OrganizationFormalName": "org_name", @@ -615,35 +639,21 @@ def characteristic_cols(category=None): "ProviderName": "provider", "ActivityIdentifier": "activity_ID", "ResultIdentifier": "result_ID", - } - -"""Get accepted methods for each characteristic. +} -Notes ------ -Source should be in 'ResultAnalyticalMethod/MethodIdentifierContext' -column. This is not fully implemented. - -Returns -------- -dict - Dictionary where key is characteristic column name and value is list of - dictionaries each with Source and Method keys. 
- -""" accepted_methods = { "Secchi": [ {"Source": "APHA", "Method": "2320-B"}, {"Source": "ASTM", "Method": "D1889"}, {"Source": "USEPA", "Method": "NRSA09 W QUAL (BOAT)"}, {"Source": "USEPA", "Method": "841-B-11-003"}, - ], + ], "DO": [ {"Source": "USEPA", "Method": "360.2"}, {"Source": "USEPA", "Method": "130.1"}, {"Source": "APHA", "Method": "4500-O-G"}, {"Source": "USEPA", "Method": "160.3"}, - {"Source": "AOAC", "Method": "973.45"}, + {"Source": "AOAC", "Method": "973.45"}, {"Source": "USDOI/USGS", "Method": "I-1576-78"}, {"Source": "USDOI/USGS", "Method": "NFM 6.2.1-LUM"}, {"Source": "ASTM", "Method": "D888(B)"}, @@ -658,7 +668,7 @@ def characteristic_cols(category=None): {"Source": "USEPA", "Method": "841-B-11-003"}, {"Source": "ASTM", "Method": "D888-12"}, {"Source": "YSI", "Method": "EXO WQ SONDE"}, - ], + ], "Temperature": [ {"Source": "USEPA", "Method": "170.1"}, {"Source": "USEPA", "Method": "130.1"}, @@ -666,7 +676,7 @@ def characteristic_cols(category=None): {"Source": "APHA", "Method": "2550"}, {"Source": "YSI", "Method": "EXO WQ SONDE"}, {"Source": "APHA", "Method": "2550 B"}, - ], + ], "Salinity": [ {"Source": "YSI", "Method": "EXO WQ SONDE"}, {"Source": "HACH", "Method": "8160"}, @@ -674,7 +684,7 @@ def characteristic_cols(category=None): {"Source": "APHA", "Method": "2130"}, {"Source": "APHA", "Method": "3.2-B"}, {"Source": "APHA", "Method": "2520-C"}, - ], + ], "pH": [ {"Source": "ASTM", "Method": "D1293(B)"}, {"Source": "YSI", "Method": "EXO WQ SONDE"}, @@ -694,7 +704,7 @@ def characteristic_cols(category=None): {"Source": "HACH", "Method": "8156"}, {"Source": "ASTM", "Method": "D1293(A)"}, {"Source": "APHA", "Method": "4500-H+B"}, - ], + ], "Nitrogen": [ {"Source": "USEPA", "Method": "353.1"}, {"Source": "USEPA", "Method": "353.2"}, @@ -752,7 +762,7 @@ def characteristic_cols(category=None): {"Source": "APHA", "Method": "5310-B"}, {"Source": "APHA", "Method": "4500-P-J"}, {"Source": "APHA", "Method": "4500-N-C"}, - ], + ], "Conductivity": [ {"Source": "ASTM", "Method": "D1125(A)"}, {"Source": "APHA", "Method": "2510"}, @@ -766,7 +776,7 @@ def characteristic_cols(category=None): {"Source": "USEPA", "Method": "120.1"}, {"Source": "USEPA", "Method": "841-B-11-003"}, {"Source": "YSI", "Method": "EXO WQ SONDE"}, - ], + ], "Carbon": [ {"Source": "USEPA", "Method": "9060"}, {"Source": "APHA_SM20ED", "Method": "5310-B"}, @@ -785,7 +795,7 @@ def characteristic_cols(category=None): {"Source": "USEPA", "Method": "415.2"}, {"Source": "APHA", "Method": "5310-B"}, {"Source": "APHA", "Method": "4500-H+B"}, - ], + ], "Chlorophyll": [ {"Source": "YSI", "Method": "EXO WQ SONDE"}, {"Source": "USEPA", "Method": "446"}, @@ -798,7 +808,7 @@ def characteristic_cols(category=None): {"Source": "APHA", "Method": "10200H(2)"}, {"Source": "APHA", "Method": "9222B"}, {"Source": "APHA", "Method": "5310-C"}, - ], + ], "Turbidity": [ {"Source": "USEPA", "Method": "160.2_M"}, {"Source": "USDOI/USGS", "Method": "I3860"}, @@ -811,7 +821,7 @@ def characteristic_cols(category=None): {"Source": "HACH", "Method": "8195"}, {"Source": "LECK MITCHELL", "Method": "M5331"}, {"Source": "ASTM", "Method": "D1889"}, - ], + ], "Sediment": [], "Fecal_Coliform": [ {"Source": "IDEXX", "Method": "COLILERT-18"}, @@ -829,7 +839,7 @@ def characteristic_cols(category=None): {"Source": "APHA", "Method": "10200-G"}, {"Source": "APHA", "Method": "9222-E"}, {"Source": "APHA", "Method": "9221-B"}, - ], + ], "E_coli": [ {"Source": "APHA", "Method": "9221A-B-C-F"}, {"Source": "IDEXX", "Method": "COLILERT/2000"}, @@ -866,7 +876,7 
@@ def characteristic_cols(category=None): {"Source": "ASTM", "Method": "D5392"}, {"Source": "HACH", "Method": "10018"}, {"Source": "USEPA", "Method": "1600"}, - ], + ], "Phosphorus": [ {"Source": "APHA", "Method": "3125"}, {"Source": "APHA", "Method": "4500-P-C"}, @@ -959,5 +969,5 @@ def characteristic_cols(category=None): {"Source": "USDOI/USGS", "Method": "I-2601-90"}, {"Source": "USDOI/USGS", "Method": "I-6600-88"}, {"Source": "ASTM", "Method": "D515"}, - ], - } + ], +} diff --git a/harmonize_wq/harmonize.py b/harmonize_wq/harmonize.py index cb82af3..e8515cb 100644 --- a/harmonize_wq/harmonize.py +++ b/harmonize_wq/harmonize.py @@ -1,16 +1,19 @@ # -*- coding: utf-8 -*- """Functions to harmonize data retrieved from EPA's Water Quality Portal.""" + from warnings import warn + from numpy import nan -from harmonize_wq.wq_data import WQCharData + from harmonize_wq import convert from harmonize_wq.domains import OUT_UNITS, UNITS_REPLACE from harmonize_wq.visualize import print_report +from harmonize_wq.wq_data import WQCharData def dissolved_oxygen(wqp): """Standardize 'Dissolved Oxygen (DO)' characteristic. - + Uses :class:`wq_data.WQCharData` to check units, check unit dimensionality and perform appropriate unit conversions. @@ -29,23 +32,23 @@ def dissolved_oxygen(wqp): # Check/fix dimensionality issues (Type III) for unit in wqp.dimensions_list(): - if wqp.ureg(wqp.units).check({'[length]': -3, '[mass]': 1}): + if wqp.ureg(wqp.units).check({"[length]": -3, "[mass]": 1}): # Convert to density, e.g., % or ppm -> mg/l (assumes STP for now) wqp.apply_conversion(convert.DO_saturation, unit) elif wqp.ureg(wqp.units).dimensionless: # Convert to dimensionless, e.g., mg/l -> % or ppm wqp.apply_conversion(convert.DO_concentration, unit) - warn(f'Need % saturation equation for {unit}') + warn(f"Need % saturation equation for {unit}") return wqp def salinity(wqp): """Standardize 'Salinity' characteristic. - + Uses :class:`wq_data.WQCharData` to check basis, check units, check unit dimensionality and perform appropriate unit conversions. - + Notes ----- PSU=PSS=ppth and 'ppt' is picopint in :mod:`pint` so it is changed to @@ -61,20 +64,20 @@ def salinity(wqp): wqp : wq_data.WQCharData WQP Characteristic Info Object with updated attributes. """ - wqp.check_basis(basis_col='ResultTemperatureBasisText') # Moves '@25C' out + wqp.check_basis(basis_col="ResultTemperatureBasisText") # Moves '@25C' out wqp.check_units() # Replace know problem units, fix and flag missing units # Check/fix dimensionality issues (Type III) for unit in wqp.dimensions_list(): if wqp.ureg(wqp.units).dimensionless: # Convert to dimensionless - if wqp.ureg(unit).check({'[length]': -3, '[mass]': 1}): + if wqp.ureg(unit).check({"[length]": -3, "[mass]": 1}): # Density, e.g., 'mg/l' -> 'PSU'/'PSS'/'ppth' wqp.apply_conversion(convert.density_to_PSU, unit) else: # Will cause dimensionality error, kick it there for handling continue - elif wqp.ureg(wqp.units).check({'[length]': -3, '[mass]': 1}): + elif wqp.ureg(wqp.units).check({"[length]": -3, "[mass]": 1}): # Convert to density, e.g., PSU -> 'mg/l' wqp.apply_conversion(convert.PSU_to_density, unit) @@ -83,7 +86,7 @@ def salinity(wqp): def turbidity(wqp): """Standardize 'Turbidity' characteristic. - + Uses :class:`wq_data.WQCharData` to check units, check unit dimensionality and perform appropriate unit conversions @@ -111,7 +114,7 @@ def turbidity(wqp): ``_. :func:`convert.SiO2_to_NTU` linear relation from Otilia et al. 2013. 
    :func:`convert.JTU_to_NTU` linear relation from Otilia et al. 2013.
-    
+
     Otilia, Rusănescu Carmen, Rusănescu Marin, and Stoica Dorel.
     MONITORING OF PHYSICAL INDICATORS IN WATER SAMPLES.
     ``_.
@@ -126,40 +129,40 @@
     wqp : wq_data.WQCharData
         WQP Characteristic Info Object with updated attributes.
     """
-    #These units exist but have not been encountered yet
-    #formazin nephelometric multibeam unit (FNMU);
-    #formazin backscatter unit (FBU);
-    #backscatter units (BU); attenuation units (AU)
+    # These units exist but have not been encountered yet
+    # formazin nephelometric multibeam unit (FNMU);
+    # formazin backscatter unit (FBU);
+    # backscatter units (BU); attenuation units (AU)
 
     wqp.check_units()  # Replace known problem units, fix and flag missing units
 
     # Check/fix dimensionality issues (Type III)
     for unit in wqp.dimensions_list():
-        if wqp.ureg(wqp.units).check({'[turbidity]': 1}):
+        if wqp.ureg(wqp.units).check({"[turbidity]": 1}):
             if wqp.ureg(unit).dimensionless:
-                if unit=='JTU':
+                if unit == "JTU":
                     wqp.apply_conversion(convert.JTU_to_NTU, unit)
-                elif unit=='SiO2':
+                elif unit == "SiO2":
                     wqp.apply_conversion(convert.SiO2_to_NTU, unit)
                 else:
-                    #raise ValueError('Bad Turbidity unit: {}'.format(unit))
-                    warn(f'Bad Turbidity unit: {unit}')
-            elif wqp.ureg(unit).check({'[length]': 1}):
+                    # raise ValueError('Bad Turbidity unit: {}'.format(unit))
+                    warn(f"Bad Turbidity unit: {unit}")
+            elif wqp.ureg(unit).check({"[length]": 1}):
                 wqp.apply_conversion(convert.cm_to_NTU, unit)
             else:
-                #raise ValueError('Bad Turbidity unit: {}'.format(unit))
-                warn(f'Bad Turbidity unit: {unit}')
-        elif wqp.ureg(wqp.units).check({'[length]': 1}):
+                # raise ValueError('Bad Turbidity unit: {}'.format(unit))
+                warn(f"Bad Turbidity unit: {unit}")
+        elif wqp.ureg(wqp.units).check({"[length]": 1}):
             wqp.apply_conversion(convert.NTU_to_cm, unit)
         else:
-            #raise ValueError('Bad Turbidity unit: {}'.format(wqp.units))
-            warn(f'Bad Turbidity unit: {unit}')
+            # raise ValueError('Bad Turbidity unit: {}'.format(wqp.units))
+            warn(f"Bad Turbidity unit: {unit}")
 
     return wqp
 
 
 def sediment(wqp):
     """Standardize 'Sediment' characteristic.
-    
+
     Uses :class:`wq_data.WQCharData` to check basis, check units, and check
     unit dimensionality.
@@ -173,8 +176,8 @@ def sediment(wqp):
     wqp : wq_data.WQCharData
         WQP Characteristic Info Object with updated attributes.
     """
-    #'< 0.0625 mm', < 0.125 mm, < 0.25 mm, < 0.5 mm, < 1 mm, < 2 mm, < 4 mm
-    wqp.check_basis(basis_col='ResultParticleSizeBasisText')
+    # '< 0.0625 mm', < 0.125 mm, < 0.25 mm, < 0.5 mm, < 1 mm, < 2 mm, < 4 mm
+    wqp.check_basis(basis_col="ResultParticleSizeBasisText")
 
     wqp.check_units()  # Replace known problem units, fix and flag missing units
@@ -188,9 +191,9 @@
     return wqp
 
 
-def harmonize_all(df_in, errors='raise'):
+def harmonize_all(df_in, errors="raise"):
     """Harmonize all 'CharacteristicName' column values with methods.
-    
+
     All results are standardized to default units. Intermediate columns are
     not retained. See :func:`domains.out_col_lookup` for list of values with
     methods.
@@ -210,27 +213,27 @@
     -------
     df : pandas.DataFrame
         Updated copy of df_in.
-    
+
     Examples
     --------
     Build example df_in table from harmonize_wq tests to use in place of Water
-    Quality Portal query response, this table has 'Temperature, water' and 
+    Quality Portal query response; this table has 'Temperature, water' and
     'Phosphorus' results:
-    
+
     >>> import pandas
     >>> tests_url = 'https://raw.githubusercontent.com/USEPA/harmonize-wq/main/harmonize_wq/tests'
     >>> df1 = pandas.read_csv(tests_url + '/data/wqp_results.txt')
     >>> df1.shape
     (359505, 35)
-    
-    When running the function there may be read outs or warnings, as things are 
+
+    When running the function there may be read-outs or warnings as issues are
     encountered, such as unexpected nutrient sample fractions:
-    
+
     >>> from harmonize_wq import harmonize
     >>> df_result_all = harmonize.harmonize_all(df1)
     1 Phosphorus sample fractions not in frac_dict
     1 Phosphorus sample fractions not in frac_dict found in expected domains, mapped to "Other_Phosphorus"
-    
+
     >>> df_result_all
            OrganizationIdentifier  ...               Temperature
     0                21FLHILL_WQX  ...      29.93 degree_Celsius
     1                21FLHILL_WQX  ...      30.01 degree_Celsius
     2                21FLHILL_WQX  ...      30.36 degree_Celsius
     3                21FLHILL_WQX  ...                       NaN
     4                21FLHILL_WQX  ...                       NaN
     ...                       ...  ...                       ...
     359500           21FLSMRC_WQX  ...                       NaN
     359501           21FLSMRC_WQX  ...                       NaN
     359502           21FLSMRC_WQX  ...                       NaN
     359503           21FLSMRC_WQX  ...                       NaN
     359504           21FLSMRC_WQX  ...                       NaN
 
     [359505 rows x 42 columns]
-    
+
     List columns that were added:
-    
+
     >>> sorted(list(df_result_all.columns[-7:]))
     ... # doctest: +NORMALIZE_WHITESPACE
     ['Other_Phosphorus', 'Phosphorus', 'QA_flag', 'Speciation',
      'TDP_Phosphorus', 'TP_Phosphorus', 'Temperature']
-    
+
     See Also
     --------
-    See any of the 'Simple' notebooks found in 
+    See any of the 'Simple' notebooks found in
     'demos' for examples of how this function is used
     to standardize, clean, and wrangle a Water Quality Portal query response.
-    
-    """
+
+    """  # noqa: E501
     df_out = df_in.copy()
-    char_vals = list(set(df_out['CharacteristicName']))
+    char_vals = list(set(df_out["CharacteristicName"]))
     char_vals.sort()
 
     for char_val in char_vals:
@@ -271,14 +274,20 @@
     return df_out
 
 
-def harmonize(df_in, char_val, units_out=None, errors='raise',
-              intermediate_columns=False, report=False):
+def harmonize(
+    df_in,
+    char_val,
+    units_out=None,
+    errors="raise",
+    intermediate_columns=False,
+    report=False,
+):
     """Harmonize char_val rows based on methods specific to that char_val.
 
     All rows where the value in the 'CharacteristicName' column matches
     char_val will have their results harmonized based on available methods
     for that char_val.
-    
+
     Parameters
     ----------
     df_in : pandas.DataFrame
@@ -306,15 +315,15 @@
     Examples
     --------
     Build example df_in table from harmonize_wq tests to use in place of Water
-    Quality Portal query response, this table has 'Temperature, water' and 
+    Quality Portal query response; this table has 'Temperature, water' and
     'Phosphorus' results:
-    
+
     >>> import pandas
     >>> tests_url = 'https://raw.githubusercontent.com/USEPA/harmonize-wq/main/harmonize_wq/tests'
     >>> df1 = pandas.read_csv(tests_url + '/data/wqp_results.txt')
     >>> df1.shape
     (359505, 35)
-    
+
     >>> from harmonize_wq import harmonize
     >>> df_result = harmonize.harmonize(df1, 'Temperature, water')
     >>> df_result
            OrganizationIdentifier  ...               Temperature
     0                21FLHILL_WQX  ...      29.93 degree_Celsius
     ...                       ...  ...                       ...
     359504           21FLSMRC_WQX  ...                       NaN
 
     [359505 rows x 37 columns]
-    
+
     List columns that were added:
-    
+
     >>> df_result.columns[-2:]
     Index(['QA_flag', 'Temperature'], dtype='object')
-    
+
     See Also
     --------
-    See any of the 'Detailed' notebooks found in 
+    See any of the 'Detailed' notebooks found in
     'demos' for examples of how this function is used
     to standardize, clean, and wrangle a Water Quality Portal query response,
     one 'CharacteristicName' value at a time.
@@ -358,25 +367,25 @@ def harmonize(df_in, char_val, units_out=None, errors='raise',
     wqp.update_ureg()  # This is done based on out_col/char_val
 
     # Use out_col to dictate function
-    if out_col in ['pH', 'Secchi']:
+    if out_col in ["pH", "Secchi"]:
         wqp.check_units()  # Fix and flag missing units
         # NOTE: pH undefined units -> NAN -> units,
-    elif out_col in ['Conductivity', 'Chlorophyll']:
+    elif out_col in ["Conductivity", "Chlorophyll"]:
         # Replace known problem units, fix and flag missing units
         wqp.check_units()
-    elif out_col in ['Fecal_Coliform', 'E_coli']:
+    elif out_col in ["Fecal_Coliform", "E_coli"]:
         # NOTE: Ecoli ['cfu/100ml', 'MPN/100ml', '#/100ml']
         # NOTE: feca ['CFU', 'MPN/100ml', 'cfu/100ml', 'MPN/100 ml', '#/100ml']
         # Replace known special character in unit ('#' count assumed as CFU)
-        wqp.replace_unit_str('#', 'CFU')
+        wqp.replace_unit_str("#", "CFU")
         # Replace known unit problems (e.g., assume CFU/MPN is /100ml)
         wqp.replace_unit_by_dict(UNITS_REPLACE[out_col])
-        #TODO: figure out why the above must be done before replace_unit_str
+        # TODO: figure out why the above must be done before replace_unit_str
         # Replace all instances in results column
-        wqp.replace_unit_str('/100ml', '/(100ml)')
-        wqp.replace_unit_str('/100 ml', '/(100ml)')
+        wqp.replace_unit_str("/100ml", "/(100ml)")
+        wqp.replace_unit_str("/100 ml", "/(100ml)")
         wqp.check_units()  # Fix and flag missing units
-    elif out_col in ['Carbon', 'Phosphorus', 'Nitrogen']:
+    elif out_col in ["Carbon", "Phosphorus", "Nitrogen"]:
         # Set Basis from unit and MethodSpec column
         wqp.check_basis()
         # Replace known problem units, fix and flag missing units (wet/dry?)
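The string surgery in the bacteria branch above exists because pint reads unit expressions left to right, so 'CFU/100ml' and 'CFU/(100ml)' are not the same unit. A minimal sketch of the difference (the CFU definition is copied from registry_adds_list; the rest is illustrative):

    import pint

    ureg = pint.UnitRegistry()
    ureg.define("Colony_Forming_Units = [] = CFU = cfu")  # as in registry_adds_list
    q_bad = ureg.Quantity("CFU/100ml")     # parsed as (CFU / 100) * ml
    q_good = ureg.Quantity("CFU/(100ml)")  # parsed as CFU / (100 * ml)
    print(q_bad.dimensionality)   # [length] ** 3
    print(q_good.dimensionality)  # 1 / [length] ** 3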
@@ -386,17 +395,18 @@ def harmonize(df_in, char_val, units_out=None, errors='raise', # Replace units by dictionary wqp.replace_unit_by_dict(dimension_dict, wqp.measure_mask()) wqp.moles_convert(mol_list) # Fix up units/measures where moles - elif out_col == 'Temperature': + elif out_col == "Temperature": # Remove spaces from units for pint ('deg C' == degree coulomb) - wqp.update_units(wqp.units.replace(' ', '')) # No spaces in units_out - wqp.replace_unit_str(' ', '') # Replace in results column + wqp.update_units(wqp.units.replace(" ", "")) # No spaces in units_out + wqp.replace_unit_str(" ", "") # Replace in results column wqp.check_units() # Fix and flag missing units else: - harmonize_map = {'DO': dissolved_oxygen, - 'Salinity': salinity, - 'Turbidity': turbidity, - 'Sediment': sediment, - } + harmonize_map = { + "DO": dissolved_oxygen, + "Salinity": salinity, + "Turbidity": turbidity, + "Sediment": sediment, + } try: wqp = harmonize_map[out_col](wqp) except KeyError: @@ -411,18 +421,19 @@ def harmonize(df_in, char_val, units_out=None, errors='raise', # Note: just phosphorus right now # Total is TP (digested) from the whole water sample (vs total dissolved) # Dissolved is TDP (total) filtered water digested (vs undigested DIP) - if out_col in ['Phosphorus', 'Nitrogen']: + if out_col in ["Phosphorus", "Nitrogen"]: # NOTE: only top level fractions, while TADA has lower for: - #'Chlorophyll a', 'Turbidity', 'Fecal Coliform', 'Escherichia coli' - if out_col=='Phosphorus': - frac_dict = {'TP_Phosphorus': ['Total'], - 'TDP_Phosphorus': ['Dissolved'], - 'Other_Phosphorus': ['', nan],} + # 'Chlorophyll a', 'Turbidity', 'Fecal Coliform', 'Escherichia coli' + if out_col == "Phosphorus": + frac_dict = { + "TP_Phosphorus": ["Total"], + "TDP_Phosphorus": ["Dissolved"], + "Other_Phosphorus": ["", nan], + } else: - frac_dict = 'TADA' + frac_dict = "TADA" frac_dict = wqp.fraction(frac_dict) # Run sample fraction on WQP - df_out = wqp.df # TODO: add activities/detection limits and filter on quality? e.g., cols: @@ -437,5 +448,5 @@ def harmonize(df_in, char_val, units_out=None, errors='raise', if report: print_report(df_out.loc[wqp.c_mask], out_col, wqp.col.unit_in) if not intermediate_columns: - df_out = df_out.drop(['Units'], axis=1) # Drop intermediate columns + df_out = df_out.drop(["Units"], axis=1) # Drop intermediate columns return df_out diff --git a/harmonize_wq/location.py b/harmonize_wq/location.py index bc545a8..577c92f 100644 --- a/harmonize_wq/location.py +++ b/harmonize_wq/location.py @@ -1,22 +1,26 @@ # -*- coding: utf-8 -*- """Functions to clean/correct location data.""" -from pyproj import Transformer -from shapely.geometry import shape + import geopandas import pandas from dataretrieval import wqp +from pyproj import Transformer +from shapely.geometry import shape + +from harmonize_wq.clean import add_qa_flag, check_precision, df_checks from harmonize_wq.domains import xy_datum from harmonize_wq.wrangle import clip_stations -from harmonize_wq.clean import check_precision, df_checks, add_qa_flag -def infer_CRS(df_in, - out_EPSG, - out_col='EPSG', - bad_crs_val=None, - crs_col='HorizontalCoordinateReferenceSystemDatumName'): +def infer_CRS( + df_in, + out_EPSG, + out_col="EPSG", + bad_crs_val=None, + crs_col="HorizontalCoordinateReferenceSystemDatumName", +): """Replace missing or unrecognized Coordinate Reference System (CRS). - + Replaces with desired CRS and notes it was missing in 'QA_flag' column. 
    Parameters
@@ -43,14 +47,14 @@ def infer_CRS(df_in,
     --------
     Build pandas DataFrame to use in example, where crs_col name is 'Datum'
     rather than default 'HorizontalCoordinateReferenceSystemDatumName':
-    
+
     >>> from numpy import nan
     >>> df_in = pandas.DataFrame({'Datum': ['NAD83', 'WGS84', '', None, nan]})
     >>> df_in  # doctest: +NORMALIZE_WHITESPACE
        Datum
     0  NAD83
     1  WGS84
-    2       
+    2
     3   None
     4    NaN
@@ -70,11 +74,11 @@
     df_out = df_in.copy()
     if bad_crs_val:
         # QA flag for bad CRS based on bad_crs_val
-        flag = f'{crs_col}: Bad datum {bad_crs_val}, EPSG:{out_EPSG} assumed'
+        flag = f"{crs_col}: Bad datum {bad_crs_val}, EPSG:{out_EPSG} assumed"
         c_mask = df_out[crs_col] == bad_crs_val  # Mask for bad CRS value
     else:
         # QA flag for missing CRS
-        flag = f'{crs_col}: MISSING datum, EPSG:{out_EPSG} assumed'
+        flag = f"{crs_col}: MISSING datum, EPSG:{out_EPSG} assumed"
         c_mask = df_out[crs_col].isna()  # Mask for missing CRS
     df_out = add_qa_flag(df_out, c_mask, flag)  # Assign flag
     df_out.loc[c_mask, out_col] = out_EPSG  # Update with inferred EPSG
@@ -82,10 +86,9 @@
     return df_out
 
 
-def harmonize_locations(df_in, out_EPSG=4326,
-                        intermediate_columns=False, **kwargs):
+def harmonize_locations(df_in, out_EPSG=4326, intermediate_columns=False, **kwargs):
     """Create harmonized geopandas GeoDataframe from pandas DataFrame.
-    
+
     Takes a :class:`~pandas.DataFrame` with lat/lon in multiple Coordinate
     Reference Systems (CRS), transforms them to out_EPSG CRS, and converts to
     :class:`geopandas.GeoDataFrame`. A 'QA_flag' column is added to the result
@@ -122,17 +125,15 @@ def harmonize_locations(df_in, out_EPSG=4326,
     Examples
     --------
     Build pandas DataFrame to use in example:
-    
-    >>> df_in = pandas.DataFrame({'LatitudeMeasure': [27.5950355,
-    ...                                               27.52183,
-    ...                                               28.0661111],
-    ...                           'LongitudeMeasure': [-82.0300865,
-    ...                                                -82.64476,
-    ...                                                -82.3775],
-    ...                           'HorizontalCoordinateReferenceSystemDatumName': ['NAD83',
-    ...                                                                            'WGS84',
-    ...                                                                            'NAD27'],
-    ...                           })
+
+    >>> df_in = pandas.DataFrame(
+    ...     {
+    ...         "LatitudeMeasure": [27.5950355, 27.52183, 28.0661111],
+    ...         "LongitudeMeasure": [-82.0300865, -82.64476, -82.3775],
+    ...         "HorizontalCoordinateReferenceSystemDatumName":
+    ...             ["NAD83", "WGS84", "NAD27"],
+    ...     }
+    ... )
     >>> df_in
        LatitudeMeasure  ...  HorizontalCoordinateReferenceSystemDatumName
     0        27.595036  ... 
NAD83 @@ -153,10 +154,9 @@ def harmonize_locations(df_in, out_EPSG=4326, df2 = df_in.copy() # Default columns - crs_col = kwargs.get('crs_col', - "HorizontalCoordinateReferenceSystemDatumName") - lat_col = kwargs.get('lat_col', 'LatitudeMeasure') - lon_col = kwargs.get('lon_col', 'LongitudeMeasure') + crs_col = kwargs.get("crs_col", "HorizontalCoordinateReferenceSystemDatumName") + lat_col = kwargs.get("lat_col", "LatitudeMeasure") + lon_col = kwargs.get("lon_col", "LongitudeMeasure") # Check columns are in df df_checks(df2, [crs_col, lat_col, lon_col]) @@ -166,12 +166,13 @@ def harmonize_locations(df_in, out_EPSG=4326, df2 = check_precision(df2, lon_col) # Create tuple column - df2['geom_orig'] = list(zip(df2[lon_col], df2[lat_col])) + df2["geom_orig"] = list(zip(df2[lon_col], df2[lat_col])) # Create/populate EPSG column crs_mask = df2[crs_col].isin(xy_datum.keys()) # w/ known datum - df2.loc[crs_mask, 'EPSG'] = [xy_datum[crs]['EPSG'] for crs - in df2.loc[crs_mask, crs_col]] + df2.loc[crs_mask, "EPSG"] = [ + xy_datum[crs]["EPSG"] for crs in df2.loc[crs_mask, crs_col] + ] # Fix/flag missing df2 = infer_CRS(df2, out_EPSG, crs_col=crs_col) @@ -181,16 +182,17 @@ def harmonize_locations(df_in, out_EPSG=4326, df2 = infer_CRS(df2, out_EPSG, bad_crs_val=crs, crs_col=crs_col) # Transform points by vector (sub-set by datum) - for datum in set(df2['EPSG'].astype(int)): + for datum in set(df2["EPSG"].astype(int)): df2 = transform_vector_of_points(df2, datum, out_EPSG) # Convert geom to shape object to use with geopandas - df2['geom'] = [shape({'type': 'Point', 'coordinates': pnt}) - for pnt in list(df2['geom'])] - gdf = geopandas.GeoDataFrame(df2, geometry=df2['geom'], crs=out_EPSG) + df2["geom"] = [ + shape({"type": "Point", "coordinates": pnt}) for pnt in list(df2["geom"]) + ] + gdf = geopandas.GeoDataFrame(df2, geometry=df2["geom"], crs=out_EPSG) if not intermediate_columns: # Drop intermediate columns - gdf = gdf.drop(['geom', 'geom_orig', 'EPSG'], axis=1) + gdf = gdf.drop(["geom", "geom_orig", "EPSG"], axis=1) return gdf @@ -214,19 +216,18 @@ def transform_vector_of_points(df_in, datum, out_EPSG): """ # Create transform object for input datum (EPSG colum) and out_EPSG transformer = Transformer.from_crs(datum, out_EPSG) - d_mask = df_in['EPSG'] == datum # Mask for datum in subset - points = df_in.loc[d_mask, 'geom_orig'] # Points series + d_mask = df_in["EPSG"] == datum # Mask for datum in subset + points = df_in.loc[d_mask, "geom_orig"] # Points series # List of transformed point geometries new_geoms = [transformer.transform(pnt[0], pnt[1]) for pnt in points] # Assign list to df.geom using Index from mask to re-index list - df_in.loc[d_mask, 'geom'] = pandas.Series(new_geoms, - index=df_in.loc[d_mask].index) + df_in.loc[d_mask, "geom"] = pandas.Series(new_geoms, index=df_in.loc[d_mask].index) return df_in def get_harmonized_stations(query, aoi=None): """Query, harmonize and clip stations. - + Queries the `Water Quality Portal `_ for stations with data matching the query, harmonizes those stations' location information, and clips it to the area of interest (aoi) if specified. @@ -250,19 +251,19 @@ def get_harmonized_stations(query, aoi=None): Raw station results from WQP. site_md : ``dataretrieval.utils.Metadata`` Custom ``dataretrieval`` metadata object pertaining to the WQP query. - + Examples -------- - See any of the 'Simple' notebooks found in + See any of the 'Simple' notebooks found in 'demos'_ for examples of how this function is used to query and harmonize stations. 
- + """ # TODO: **kwargs instead of query dict? # Query stations (can be slow) - if 'dataProfile' in query.keys(): - query.pop('dataProfile') # TODO: this changes query arg (mutable) + if "dataProfile" in query.keys(): + query.pop("dataProfile") # TODO: this changes query arg (mutable) stations, site_md = wqp.what_sites(**query) # Harmonize stations diff --git a/harmonize_wq/tests/test_harmonize_WQP.py b/harmonize_wq/tests/test_harmonize_WQP.py index a1cb2d8..f5e1a9c 100644 --- a/harmonize_wq/tests/test_harmonize_WQP.py +++ b/harmonize_wq/tests/test_harmonize_WQP.py @@ -1,8 +1,5 @@ # -*- coding: utf-8 -*- """ -This will import when run from CI because sys.path[0] == cur_dir -DIRPATH = r'L:\Public\jbousqui\Code\GitHub\harmonize-wq\harmonize_wq\tests' - This script doesn't test query/download of the data using dataretrieval, instead the script is focused on processing, tidying and harmonizing data results from a query read from a ?csv. The exception is the bounding box query @@ -10,46 +7,45 @@ @author: jbousqui """ + import os -import pytest + import geopandas import pandas -from harmonize_wq import location -from harmonize_wq import harmonize -from harmonize_wq import convert -from harmonize_wq import wrangle -from harmonize_wq import clean -from harmonize_wq import visualize as viz +import pytest +from harmonize_wq import clean, convert, harmonize, location, wrangle +from harmonize_wq import visualize as viz # CI DIRPATH = os.path.dirname(os.path.realpath(__file__)) +# DIRPATH = r"L:\Public\jbousqui\Code\GitHub\harmonize-wq\harmonize_wq\tests" # Test datasets -test_dir = os.path.join(DIRPATH, 'data') +test_dir = os.path.join(DIRPATH, "data") -AOI_URL = r'https://github.com/USEPA/Coastal_Ecological_Indicators/raw/master/DGGS_Coastal/temperature_data/TampaBay.geojson' +AOI_URL = "https://github.com/USEPA/Coastal_Ecological_Indicators/raw/master/DGGS_Coastal/temperature_data/TampaBay.geojson" # results for dataretrieval.wqp.what_sites(**query) -STATIONS = pandas.read_csv(os.path.join(test_dir, 'wqp_sites.txt')) +STATIONS = pandas.read_csv(os.path.join(test_dir, "wqp_sites.txt")) # These are split by parameter sets of 2 to keep them small but not mono-param # 'Phosphorus' & 'Temperature, water' -NARROW_RESULTS = pandas.read_csv(os.path.join(test_dir, 'wqp_results.txt')) -ACTIVITIES = pandas.read_csv(os.path.join(test_dir, 'wqp_activities.txt')) +NARROW_RESULTS = pandas.read_csv(os.path.join(test_dir, "wqp_results.txt")) +ACTIVITIES = pandas.read_csv(os.path.join(test_dir, "wqp_activities.txt")) # 'Depth, Secchi disk depth' & Dissolved Oxygen -NARROW_RESULTS1 = pandas.read_csv(os.path.join(test_dir, 'wqp_results1.txt')) +NARROW_RESULTS1 = pandas.read_csv(os.path.join(test_dir, "wqp_results1.txt")) # pH & Salinity -NARROW_RESULTS2 = pandas.read_csv(os.path.join(test_dir, 'wqp_results2.txt')) +NARROW_RESULTS2 = pandas.read_csv(os.path.join(test_dir, "wqp_results2.txt")) # Nitrogen & Conductivity -NARROW_RESULTS3 = pandas.read_csv(os.path.join(test_dir, 'wqp_results3.txt')) +NARROW_RESULTS3 = pandas.read_csv(os.path.join(test_dir, "wqp_results3.txt")) # Chlorophyll_a & Organic_carbon -NARROW_RESULTS4 = pandas.read_csv(os.path.join(test_dir, 'wqp_results4.txt')) +NARROW_RESULTS4 = pandas.read_csv(os.path.join(test_dir, "wqp_results4.txt")) # Turbidity & Sediment -NARROW_RESULTS5 = pandas.read_csv(os.path.join(test_dir, 'wqp_results5.txt')) +NARROW_RESULTS5 = pandas.read_csv(os.path.join(test_dir, "wqp_results5.txt")) # Nutrients and sediment additional characteristics # NARROW_RESULTS6 = 
pandas.read_csv(os.path.join(test_dir, 'wqp_results6.txt')) # Fecal Coliform and Ecoli -NARROW_RESULTS7 = pandas.read_csv(os.path.join(test_dir, 'wqp_results7.txt')) +NARROW_RESULTS7 = pandas.read_csv(os.path.join(test_dir, "wqp_results7.txt")) # fixture to eventually test output writing (.shp) # @pytest.fixture(scope="session") @@ -69,11 +65,13 @@ def test_get_bounding_box(): AOI : geopandas.GeoDataFrame Geodataframe for Tampa Bay read from github """ - expected = ['-82.76095952246396', - '27.47487752677648', - '-82.37480995151799', - '28.12535740372124'] - actual = wrangle.get_bounding_box(AOI_URL).split(',') + expected = [ + "-82.76095952246396", + "27.47487752677648", + "-82.37480995151799", + "28.12535740372124", + ] + actual = wrangle.get_bounding_box(AOI_URL).split(",") assert actual == expected @@ -93,7 +91,7 @@ def test_get_bounding_box(): # Test it appends when QA_flag exists (not in test_harmonize_sites) # """ # actual = test_add_QA_flag(df_in, cond, flag) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def merged_tables(): """ Merge narrow_results and activities tables. This fixture is used in some @@ -107,48 +105,52 @@ def merged_tables(): df1 = NARROW_RESULTS df2 = ACTIVITIES # Fields to get (all for test instead?) - df2_cols = ['ActivityTypeCode', - 'ActivityMediaName', - 'ActivityMediaSubdivisionName', - 'ActivityEndDate', - 'ActivityEndTime/Time', - 'ActivityEndTime/TimeZoneCode'] + df2_cols = [ + "ActivityTypeCode", + "ActivityMediaName", + "ActivityMediaSubdivisionName", + "ActivityEndDate", + "ActivityEndTime/Time", + "ActivityEndTime/TimeZoneCode", + ] return wrangle.merge_tables(df1, df2, df2_cols=df2_cols) -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_add_activities(merged_tables): # Run using first 100 orginal results df1 = NARROW_RESULTS actual = wrangle.add_activities_to_df(df1[:100]) # Compare against activities retrieved before for expected_col in list(merged_tables.columns): - assert expected_col in actual.columns, f'{expected_col} missing' + assert expected_col in actual.columns, f"{expected_col} missing" # Check the value for one ('Quality Control Field Replicate Msr/Obs') # NOTE: this will fail if result changes but index should be consistent - actual_val = actual.iloc[46]['ActivityTypeCode'] - expected_val = merged_tables.iloc[46]['ActivityTypeCode'] - assert actual_val == expected_val, 'Not expected ActivityMediaName value' + actual_val = actual.iloc[46]["ActivityTypeCode"] + expected_val = merged_tables.iloc[46]["ActivityTypeCode"] + assert actual_val == expected_val, "Not expected ActivityMediaName value" -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_add_detection(merged_tables): merged_cols = list(merged_tables.columns) # only retrieve for first 100 phosphorous results - phos_df = merged_tables[merged_tables['CharacteristicName']=='Phosphorus'] - actual = wrangle.add_detection(phos_df[:100], 'Phosphorus') + phos_df = merged_tables[merged_tables["CharacteristicName"] == "Phosphorus"] + actual = wrangle.add_detection(phos_df[:100], "Phosphorus") actual_cols = [x for x in list(actual.columns) if x not in merged_cols] - expected_cols = ['DetectionQuantitationLimitTypeName', - 'DetectionQuantitationLimitMeasure/MeasureValue', - 'DetectionQuantitationLimitMeasure/MeasureUnitCode'] - assert actual_cols == expected_cols, 'Detection columns not added' + expected_cols = [ + "DetectionQuantitationLimitTypeName", + 
"DetectionQuantitationLimitMeasure/MeasureValue", + "DetectionQuantitationLimitMeasure/MeasureUnitCode", + ] + assert actual_cols == expected_cols, "Detection columns not added" # Check the value for one # NOTE: this will fail if result changes but index should be consistent actual_val = actual.iloc[97][expected_cols[1]] - assert actual_val == 0.02, 'Not expected DetectionQuantitation value' + assert actual_val == 0.02, "Not expected DetectionQuantitation value" -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def harmonized_tables(): """ Harmonize Nitrogen and Conductivity results in NARROW_RESULTS3. This @@ -160,8 +162,8 @@ def harmonized_tables(): Harmonized results for Nitrogen and Conductivity. """ - harmonized_table = harmonize.harmonize(NARROW_RESULTS3, 'Nitrogen') - harmonized_table = harmonize.harmonize(harmonized_table, 'Conductivity') + harmonized_table = harmonize.harmonize(NARROW_RESULTS3, "Nitrogen") + harmonized_table = harmonize.harmonize(harmonized_table, "Conductivity") return harmonized_table @@ -188,11 +190,12 @@ def test_harmonize_depth(): Read from data/wqp_results1.txt. """ actual = clean.harmonize_depth(NARROW_RESULTS1) - assert len(actual['Depth'].dropna()) == 13 - expected_unit = 'meter' - assert str(actual.iloc[135227]['Depth'].units) == expected_unit + assert len(actual["Depth"].dropna()) == 13 + expected_unit = "meter" + assert str(actual.iloc[135227]["Depth"].units) == expected_unit + -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def test_harmonize_locations(): """ Test functions standardizes the sites correctly @@ -204,35 +207,35 @@ def test_harmonize_locations(): """ actual = location.harmonize_locations(STATIONS) - crs_col = 'HorizontalCoordinateReferenceSystemDatumName' - expected_flag = crs_col + ': Bad datum OTHER, EPSG:4326 assumed' + crs_col = "HorizontalCoordinateReferenceSystemDatumName" + expected_flag = crs_col + ": Bad datum OTHER, EPSG:4326 assumed" assert isinstance(actual, geopandas.geodataframe.GeoDataFrame) # Test type - assert actual.crs.name == 'WGS 84' # Test for expected CRS + assert actual.crs.name == "WGS 84" # Test for expected CRS assert actual.size == 1063506 # TODO: confirm original fields un-altered # Test for expected columns - for col in ['QA_flag', 'geometry']: + for col in ["QA_flag", "geometry"]: assert col in actual.columns # Test new fields have expected dtype - assert actual['geometry'].dtype == 'geometry' + assert actual["geometry"].dtype == "geometry" # assert actual['EPSG'].dtype == 'float64' # Converted to int() later # Test flag & fix when un-recognized CRS (test on row[CRS]=='OTHER') # assert actual.iloc[3522]['EPSG'] == 4326.0 # Test fixed in new col - assert actual.iloc[3522]['QA_flag'] == expected_flag # Test flag + assert actual.iloc[3522]["QA_flag"] == expected_flag # Test flag # No changes not changes # Converted converted # Missing unit infered # Check QA_flag # Check for precision flag - actual_imprecise = actual.iloc[302]['QA_flag'] - expected_imprecise = 'LatitudeMeasure: Imprecise: lessthan3decimaldigits' + actual_imprecise = actual.iloc[302]["QA_flag"] + expected_imprecise = "LatitudeMeasure: Imprecise: lessthan3decimaldigits" assert actual_imprecise == expected_imprecise - + return actual -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_harmonize_phosphorus(merged_tables): """ Test function standardizes Phosphorus results correctly @@ -243,84 +246,94 @@ def test_harmonize_phosphorus(merged_tables): Read from 
data/wqp_results.txt. """ # TODO: Test for expected dimensionalityError with NARROW_RESULTS? - actual = harmonize.harmonize(merged_tables, 'Phosphorus') # mg/l + actual = harmonize.harmonize(merged_tables, "Phosphorus") # mg/l # TODO: test conversion to moles and other non-standard units # Test that the dataframe has expected type, size, cols, and rows assert isinstance(actual, pandas.core.frame.DataFrame) # Test type assert actual.size == 16896735 # 17256240 # Test size # Test for expected columns - for col in ['TP_Phosphorus', 'TDP_Phosphorus', 'Other_Phosphorus']: + for col in ["TP_Phosphorus", "TDP_Phosphorus", "Other_Phosphorus"]: assert col in actual.columns # Number of results in each col - assert len(actual['TP_Phosphorus'].dropna()) == 11243 - assert len(actual['TDP_Phosphorus'].dropna()) == 601 - assert len(actual['Other_Phosphorus'].dropna()) == 1124 # 1075 NAN + assert len(actual["TP_Phosphorus"].dropna()) == 11243 + assert len(actual["TDP_Phosphorus"].dropna()) == 601 + assert len(actual["Other_Phosphorus"].dropna()) == 1124 # 1075 NAN # Confirm orginal data was not altered - orig_val_col = 'ResultMeasureValue' # Values + orig_val_col = "ResultMeasureValue" # Values assert actual[orig_val_col].equals(merged_tables[orig_val_col]) - orig_unit_col = 'ResultMeasure/MeasureUnitCode' # Units + orig_unit_col = "ResultMeasure/MeasureUnitCode" # Units assert actual[orig_unit_col].equals(merged_tables[orig_unit_col]) # Inspect specific results - expected_unit = 'milligram / liter' # Desired units + expected_unit = "milligram / liter" # Desired units # TP - out_col = 'TP_Phosphorus' - actual.loc[((actual['CharacteristicName'] == 'Phosphorus') & - (actual['ResultSampleFractionText'] == 'Total') & - (actual[out_col].notna())), out_col] + out_col = "TP_Phosphorus" + actual.loc[ + ( + (actual["CharacteristicName"] == "Phosphorus") + & (actual["ResultSampleFractionText"] == "Total") + & (actual[out_col].notna()) + ), + out_col, + ] # Inspect specific result - where units are not converted - assert actual.iloc[2866][orig_unit_col] == 'mg/l' # Confirm orig unit + assert actual.iloc[2866][orig_unit_col] == "mg/l" # Confirm orig unit assert str(actual.iloc[2866][out_col].units) == expected_unit expected_val = actual.iloc[2866][orig_val_col] # Original value assert actual.iloc[2866][out_col].magnitude == expected_val # Unchanged # Inspect specific result - where units converted # Basis in units 'mg/l as P' # Confirm original unit - assert actual.iloc[134674][orig_unit_col] == 'mg/l as P' + assert actual.iloc[134674][orig_unit_col] == "mg/l as P" assert str(actual.iloc[134674][out_col].units) == expected_unit # Confirm original measure assert actual.iloc[134674][orig_val_col] == 0.29 assert actual.iloc[134674][out_col].magnitude == 0.29 # Basis in units 'mg/l PO4' - assert actual.iloc[142482][orig_unit_col] == 'mg/l PO4' # Confirm orig unit + assert actual.iloc[142482][orig_unit_col] == "mg/l PO4" # Confirm orig unit assert str(actual.iloc[142482][out_col].units) == expected_unit # TODO: None with different units that get converted # Inspect specific result - where units missing - assert str(actual.iloc[9738][orig_unit_col]) == 'nan' # Confirm missing + assert str(actual.iloc[9738][orig_unit_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered units - expected_flag = 'ResultMeasure/MeasureUnitCode: MISSING UNITS, mg/l assumed' - assert actual.iloc[9738]['QA_flag'] == expected_flag + expected_flag = "ResultMeasure/MeasureUnitCode: MISSING UNITS, mg/l assumed" + assert 
actual.iloc[9738]["QA_flag"] == expected_flag # Check value unchanged for missing units expected_val = float(actual.iloc[9738][orig_val_col]) # Original value assert actual.iloc[9738][out_col].magnitude == expected_val # Unchanged # Inspect specific result - where value missing - assert str(actual.iloc[134943][orig_val_col]) == 'nan' # Confirm missing + assert str(actual.iloc[134943][orig_val_col]) == "nan" # Confirm missing # Confirm expected flag - for missing values - expected_flag = 'ResultMeasureValue: missing (NaN) result' - actual_flags = actual.iloc[134943]['QA_flag'].split('; ') + expected_flag = "ResultMeasureValue: missing (NaN) result" + actual_flags = actual.iloc[134943]["QA_flag"].split("; ") assert actual_flags[0] == expected_flag # Inspect specific result - un-usable non-numeric values - assert actual.iloc[19902][orig_val_col] == '*Not Reported' + assert actual.iloc[19902][orig_val_col] == "*Not Reported" # Confirm expected flag - for un-usable value expected_flag = 'ResultMeasureValue: "*Not Reported" result cannot be used' - actual_flags = actual.iloc[19902]['QA_flag'].split('; ') + actual_flags = actual.iloc[19902]["QA_flag"].split("; ") assert actual_flags[0] == expected_flag # TDP - out_col = 'TDP_Phosphorus' - actual.loc[((actual['CharacteristicName'] == 'Phosphorus') & - (actual['ResultSampleFractionText'] == 'Dissolved') & - (actual[out_col].notna())), out_col] + out_col = "TDP_Phosphorus" + actual.loc[ + ( + (actual["CharacteristicName"] == "Phosphorus") + & (actual["ResultSampleFractionText"] == "Dissolved") + & (actual[out_col].notna()) + ), + out_col, + ] # Inspect specific result - where units are not converted - assert actual.iloc[673][orig_unit_col] == 'mg/l' # Confirm orig unit + assert actual.iloc[673][orig_unit_col] == "mg/l" # Confirm orig unit assert str(actual.iloc[673][out_col].units) == expected_unit expected_val = actual.iloc[673][orig_val_col] # Original value assert actual.iloc[673][out_col].magnitude == expected_val # Unchanged # Inspect specific result - where units converted # Basis in units 'mg/l as P' idx = 134696 - assert actual.iloc[idx][orig_unit_col] == 'mg/l as P' # Confirm orig unit + assert actual.iloc[idx][orig_unit_col] == "mg/l as P" # Confirm orig unit assert str(actual.iloc[idx][out_col].units) == expected_unit assert actual.iloc[idx][orig_val_col] == 0.38 # Confirm original measure assert actual.iloc[idx][out_col].magnitude == 0.38 @@ -328,22 +341,27 @@ def test_harmonize_phosphorus(merged_tables): # Inspect specific result - where units missing # TODO: None missing units w/ value # Inspect specific result - where value missing - assert str(actual.iloc[138475][orig_val_col]) == 'nan' # Confirm missing + assert str(actual.iloc[138475][orig_val_col]) == "nan" # Confirm missing # Confirm expected flag - for missing values - expected_flag = 'ResultMeasureValue: missing (NaN) result' - actual_flags = actual.iloc[138475]['QA_flag'].split('; ') + expected_flag = "ResultMeasureValue: missing (NaN) result" + actual_flags = actual.iloc[138475]["QA_flag"].split("; ") assert actual_flags[0] == expected_flag # Inspect specific result - un-usable non-numeric values # TODO: no bad value # Other - out_col = 'Other_Phosphorus' + out_col = "Other_Phosphorus" # NOTE: these are neither labled 'Total' nor 'Dissolved' - actual.loc[((actual['CharacteristicName'] == 'Phosphorus') & - (actual['ResultSampleFractionText'].isna()) & - (actual[out_col].notna())), out_col] + actual.loc[ + ( + (actual["CharacteristicName"] == "Phosphorus") + & 
(actual["ResultSampleFractionText"].isna()) + & (actual[out_col].notna()) + ), + out_col, + ] # Inspect specific result - where units are not converted - assert actual.iloc[19665][orig_unit_col] == 'mg/l' # Confirm orig unit + assert actual.iloc[19665][orig_unit_col] == "mg/l" # Confirm orig unit assert str(actual.iloc[19665][out_col].units) == expected_unit expected_val = float(actual.iloc[19665][orig_val_col]) # Original value assert actual.iloc[19665][out_col].magnitude == expected_val # Unchanged @@ -352,17 +370,17 @@ def test_harmonize_phosphorus(merged_tables): # Inspect specific result - where units missing # TODO: None missing units w/ value # Inspect specific result - where value missing - assert str(actual.iloc[177611][orig_val_col]) == 'nan' # Confirm missing + assert str(actual.iloc[177611][orig_val_col]) == "nan" # Confirm missing # Confirm expected flag - for missing values - expected_flag = 'ResultMeasureValue: missing (NaN) result' - actual_flags = actual.iloc[177611]['QA_flag'].split('; ') + expected_flag = "ResultMeasureValue: missing (NaN) result" + actual_flags = actual.iloc[177611]["QA_flag"].split("; ") assert actual_flags[0] == expected_flag # Inspect specific result - un-usable non-numeric values # TODO: no bad value -@pytest.fixture(scope='session') -#@pytest.mark.skip(reason="no change") +@pytest.fixture(scope="session") +# @pytest.mark.skip(reason="no change") def test_harmonize_temperature(): """ Test function standardizes Temperature results correctly @@ -372,54 +390,55 @@ def test_harmonize_temperature(): NARROW_RESULTS : pandas.DataFrame Read from data/wqp_results.txt. """ - actual = harmonize.harmonize(NARROW_RESULTS, 'Temperature, water') - actual2 = harmonize.harmonize(NARROW_RESULTS.iloc[0:10], - 'Temperature, water', - units_out='deg F') + actual = harmonize.harmonize(NARROW_RESULTS, "Temperature, water") + actual2 = harmonize.harmonize( + NARROW_RESULTS.iloc[0:10], "Temperature, water", units_out="deg F" + ) + assert isinstance(actual2, pandas.core.frame.DataFrame) # Test type # Test that the dataframe has expected type, size, cols, and rows assert isinstance(actual, pandas.core.frame.DataFrame) # Test type assert actual.size == 13301685 # Test size #14784040 - assert 'Temperature' in actual.columns # Check for column - assert len(actual['Temperature'].dropna()) == 346210 # Number of results + assert "Temperature" in actual.columns # Check for column + assert len(actual["Temperature"].dropna()) == 346210 # Number of results # Confirm orginal data was not altered - orig_val_col = 'ResultMeasureValue' # Values + orig_val_col = "ResultMeasureValue" # Values assert actual[orig_val_col].equals(NARROW_RESULTS[orig_val_col]) - orig_unit_col = 'ResultMeasure/MeasureUnitCode' # Units + orig_unit_col = "ResultMeasure/MeasureUnitCode" # Units assert actual[orig_unit_col].equals(NARROW_RESULTS[orig_unit_col]) # Inspect specific result - where units are not converted - assert actual.iloc[0][orig_unit_col] == 'deg C' # Confirm orig unit - expected_unit = 'degree_Celsius' # Desired units - assert str(actual.iloc[0]['Temperature'].units) == expected_unit + assert actual.iloc[0][orig_unit_col] == "deg C" # Confirm orig unit + expected_unit = "degree_Celsius" # Desired units + assert str(actual.iloc[0]["Temperature"].units) == expected_unit expected_val = actual.iloc[0][orig_val_col] # Original value - assert actual.iloc[0]['Temperature'].magnitude == expected_val # Unchanged + assert actual.iloc[0]["Temperature"].magnitude == expected_val # Unchanged # Inspect specific 
result - where units converted - assert actual.iloc[55013][orig_unit_col] == 'deg F' # Confirm orig unit - assert str(actual.iloc[55013]['Temperature'].units) == expected_unit - assert actual.iloc[55013][orig_val_col] == '87' # Confirm original measure - assert actual.iloc[55013]['Temperature'].magnitude == 30.5555555555556 + assert actual.iloc[55013][orig_unit_col] == "deg F" # Confirm orig unit + assert str(actual.iloc[55013]["Temperature"].units) == expected_unit + assert actual.iloc[55013][orig_val_col] == "87" # Confirm original measure + assert actual.iloc[55013]["Temperature"].magnitude == 30.5555555555556 # Inspect specific result - where units missing - assert str(actual.iloc[143765][orig_unit_col]) == 'nan' # Confirm missing + assert str(actual.iloc[143765][orig_unit_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered units - expected_flag = 'ResultMeasure/MeasureUnitCode: MISSING UNITS, degC assumed' - actual_flags = actual.iloc[143765]['QA_flag'].split('; ') + expected_flag = "ResultMeasure/MeasureUnitCode: MISSING UNITS, degC assumed" + actual_flags = actual.iloc[143765]["QA_flag"].split("; ") assert actual_flags[1] == expected_flag # Should be assessed 1st (flag 0) # Check value unchagned for missing units # TODO: values would stay the same (no conversion), but this example is nan # Inspect specific result - where value missing - assert str(actual.iloc[143765][orig_val_col]) == 'nan' # Confirm missing + assert str(actual.iloc[143765][orig_val_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered values - expected_flag = 'ResultMeasureValue: missing (NaN) result' + expected_flag = "ResultMeasureValue: missing (NaN) result" assert actual_flags[0] == expected_flag # Inspect specific result - un-usable non-numeric values - assert actual.iloc[359504][orig_val_col] == 'Not Reported' + assert actual.iloc[359504][orig_val_col] == "Not Reported" # Confirm expected flag - for un-usable value expected_flag = 'ResultMeasureValue: "Not Reported" result cannot be used' - assert actual.iloc[359504]['QA_flag'] == expected_flag + assert actual.iloc[359504]["QA_flag"] == expected_flag return actual -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_harmonize_secchi(): """ Test function standardizes Seccchi results correctly @@ -429,50 +448,50 @@ def test_harmonize_secchi(): NARROW_RESULTS1 : pandas.DataFrame Read from data/wqp_results1.txt. 
""" - actual = harmonize.harmonize(NARROW_RESULTS1, 'Depth, Secchi disk depth') + actual = harmonize.harmonize(NARROW_RESULTS1, "Depth, Secchi disk depth") # Test that the dataframe has expected type, size, cols, and rows assert isinstance(actual, pandas.core.frame.DataFrame) # Test type assert actual.size == 11818094 # Test size - assert 'Secchi' in actual.columns # Check for column - assert len(actual['Secchi'].dropna()) == 69144 # Number of results + assert "Secchi" in actual.columns # Check for column + assert len(actual["Secchi"].dropna()) == 69144 # Number of results # Confirm orginal data was not altered - orig_val_col = 'ResultMeasureValue' # Values + orig_val_col = "ResultMeasureValue" # Values assert actual[orig_val_col].equals(NARROW_RESULTS1[orig_val_col]) - orig_unit_col = 'ResultMeasure/MeasureUnitCode' # Units + orig_unit_col = "ResultMeasure/MeasureUnitCode" # Units assert actual[orig_unit_col].equals(NARROW_RESULTS1[orig_unit_col]) # Inspect specific result - where units are not converted - assert actual.iloc[1][orig_unit_col] == 'm' # Confirm orig unit - expected_unit = 'meter' # Desired units - assert str(actual.iloc[1]['Secchi'].units) == expected_unit + assert actual.iloc[1][orig_unit_col] == "m" # Confirm orig unit + expected_unit = "meter" # Desired units + assert str(actual.iloc[1]["Secchi"].units) == expected_unit expected_val = float(actual.iloc[1][orig_val_col]) # Original value - assert actual.iloc[1]['Secchi'].magnitude == expected_val # Unchanged + assert actual.iloc[1]["Secchi"].magnitude == expected_val # Unchanged # Inspect specific result - where units converted - assert actual.iloc[369][orig_unit_col] == 'ft' # Confirm orig unit - assert str(actual.iloc[369]['Secchi'].units) == expected_unit - assert actual.iloc[369][orig_val_col] == '1.5' # Confirm original measure - assert actual.iloc[369]['Secchi'].magnitude == 0.45719999999999994 + assert actual.iloc[369][orig_unit_col] == "ft" # Confirm orig unit + assert str(actual.iloc[369]["Secchi"].units) == expected_unit + assert actual.iloc[369][orig_val_col] == "1.5" # Confirm original measure + assert actual.iloc[369]["Secchi"].magnitude == 0.45719999999999994 # Inspect specific result - where units missing - assert str(actual.iloc[347590][orig_unit_col]) == 'nan' # Confirm missing + assert str(actual.iloc[347590][orig_unit_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered units - expected_flag = 'ResultMeasure/MeasureUnitCode: MISSING UNITS, m assumed' - actual_flags = actual.iloc[347590]['QA_flag'].split('; ') + expected_flag = "ResultMeasure/MeasureUnitCode: MISSING UNITS, m assumed" + actual_flags = actual.iloc[347590]["QA_flag"].split("; ") assert actual_flags[1] == expected_flag # Should be assessed 1st (flag 0) # Check value unchanged for missing units # TODO: values would stay the same (no conversion), but this example is nan # Inspect specific result - where value missing - assert str(actual.iloc[347590][orig_val_col]) == 'nan' # Confirm missing + assert str(actual.iloc[347590][orig_val_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered values - expected_flag = 'ResultMeasureValue: missing (NaN) result' + expected_flag = "ResultMeasureValue: missing (NaN) result" assert actual_flags[0] == expected_flag # Inspect specific result - un-usable non-numeric values - assert actual.iloc[347589][orig_val_col] == 'Not Reported' + assert actual.iloc[347589][orig_val_col] == "Not Reported" # Confirm expected flag - for un-usable value expected_flag = 
'ResultMeasureValue: "Not Reported" result cannot be used' - assert actual.iloc[347589]['QA_flag'] == expected_flag + assert actual.iloc[347589]["QA_flag"] == expected_flag -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_harmonize_DO(): """ Test function standardizes Dissolved oxygen (DO) results correctly @@ -482,63 +501,69 @@ def test_harmonize_DO(): NARROW_RESULTS1 : pandas.DataFrame Read from data/wqp_results1.txt. """ - actual = harmonize.harmonize(NARROW_RESULTS1, 'Dissolved oxygen (DO)') + actual = harmonize.harmonize(NARROW_RESULTS1, "Dissolved oxygen (DO)") # Test that the dataframe has expected type, size, cols, and rows assert isinstance(actual, pandas.core.frame.DataFrame) # Test type assert actual.size == 11818094 # Test size - assert 'DO' in actual.columns # Check for column - assert len(actual['DO'].dropna()) == 278395 # Number of results + assert "DO" in actual.columns # Check for column + assert len(actual["DO"].dropna()) == 278395 # Number of results # Confirm orginal data was not altered - orig_val_col = 'ResultMeasureValue' # Values + orig_val_col = "ResultMeasureValue" # Values assert actual[orig_val_col].equals(NARROW_RESULTS1[orig_val_col]) - orig_unit_col = 'ResultMeasure/MeasureUnitCode' # Units + orig_unit_col = "ResultMeasure/MeasureUnitCode" # Units assert actual[orig_unit_col].equals(NARROW_RESULTS1[orig_unit_col]) # Inspect specific result - where units are not converted - assert actual.iloc[0][orig_unit_col] == 'mg/l' # Confirm orig unit - expected_unit = 'milligram / liter' # Desired units - assert str(actual.iloc[0]['DO'].units) == expected_unit + assert actual.iloc[0][orig_unit_col] == "mg/l" # Confirm orig unit + expected_unit = "milligram / liter" # Desired units + assert str(actual.iloc[0]["DO"].units) == expected_unit expected_val = float(actual.iloc[0][orig_val_col]) # Original value - assert actual.iloc[0]['DO'].magnitude == expected_val # Unchanged + assert actual.iloc[0]["DO"].magnitude == expected_val # Unchanged # Inspect specific result - where units converted - assert actual.iloc[4][orig_unit_col] == '%' # Confirm orig unit - assert str(actual.iloc[4]['DO'].units) == expected_unit - assert actual.iloc[4][orig_val_col] == '68.7' # Confirm original measure - assert actual.iloc[4]['DO'].magnitude == 0.05676222371166 + assert actual.iloc[4][orig_unit_col] == "%" # Confirm orig unit + assert str(actual.iloc[4]["DO"].units) == expected_unit + assert actual.iloc[4][orig_val_col] == "68.7" # Confirm original measure + assert actual.iloc[4]["DO"].magnitude == 0.05676222371166 # TODO: add tests for 99637 in ppm? 
Currently ppm == mg/l - + # TODO: add tests at different pressure and temperature - actual_p2 = str(convert.DO_saturation(70, '0.5 standard_atmosphere')) - expected_p2 = '2.7994178481769043 milligram / liter' + actual_p2 = str(convert.DO_saturation(70, "0.5 standard_atmosphere")) + expected_p2 = "2.7994178481769043 milligram / liter" assert actual_p2 == expected_p2 from harmonize_wq.convert import u_reg - actual_p2 = str(convert.DO_concentration('0.7 milligram / liter', - '2 standard_atmosphere', - u_reg.Quantity(32, u_reg("degC")))) - expected_p2 = '4.681314214558987' + + actual_p2 = str( + convert.DO_concentration( + "0.7 milligram / liter", + "2 standard_atmosphere", + u_reg.Quantity(32, u_reg("degC")), + ) + ) + expected_p2 = "4.681314214558987" assert actual_p2 == expected_p2 - + # Inspect specific result - where units missing - assert str(actual.iloc[6816][orig_unit_col]) == 'nan' # Confirm missing + assert str(actual.iloc[6816][orig_unit_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered units - expected_flag = 'ResultMeasure/MeasureUnitCode: MISSING UNITS, mg/l assumed' - actual_flags = actual.iloc[6816]['QA_flag'].split('; ') + expected_flag = "ResultMeasure/MeasureUnitCode: MISSING UNITS, mg/l assumed" + actual_flags = actual.iloc[6816]["QA_flag"].split("; ") assert actual_flags[1] == expected_flag # Check value unchanged for missing units - # TODO: values would stay the same (no conversion), but this example is '*Not Reported' + # TODO: values would stay the same (no conversion), but this example is + # '*Not Reported' # Inspect specific result - where value missing - assert str(actual.iloc[130784][orig_val_col]) == 'nan' # Confirm missing + assert str(actual.iloc[130784][orig_val_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered values - expected_flag = 'ResultMeasureValue: missing (NaN) result' - assert actual.iloc[130784]['QA_flag'].split('; ')[0] == expected_flag + expected_flag = "ResultMeasureValue: missing (NaN) result" + assert actual.iloc[130784]["QA_flag"].split("; ")[0] == expected_flag # Inspect specific result - un-usable non-numeric values - assert actual.iloc[6816][orig_val_col] == '*Not Reported' + assert actual.iloc[6816][orig_val_col] == "*Not Reported" # Confirm expected flag - for un-usable value expected_flag = 'ResultMeasureValue: "*Not Reported" result cannot be used' - assert actual.iloc[6816]['QA_flag'].split('; ')[0] == expected_flag + assert actual.iloc[6816]["QA_flag"].split("; ")[0] == expected_flag -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_harmonize_salinity(): """ Test function standardizes Salinity results correctly @@ -550,61 +575,63 @@ def test_harmonize_salinity(): NARROW_RESULTS2 : pandas.DataFrame Read from data/wqp_results2.txt. 
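    A hedged sketch of the backward check at the end of this test, assuming
    psu_example holds the Practical_Salinity_Units quantity string built in
    the test body below (skipped doctest):

    >>> from harmonize_wq import convert
    >>> str(convert.PSU_to_density(psu_example))  # doctest: +SKIP
    '997.1428971400308 milligram / milliliter'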
""" - actual = harmonize.harmonize(NARROW_RESULTS2, 'Salinity', units_out='PSS') + actual = harmonize.harmonize(NARROW_RESULTS2, "Salinity", units_out="PSS") # Test that the dataframe has expected type, size, cols, and rows assert isinstance(actual, pandas.core.frame.DataFrame) # Test type assert actual.size == 12181392 # Test size - assert 'Salinity' in actual.columns # Check for column - assert len(actual['Salinity'].dropna()) == 185562 # Number of results + assert "Salinity" in actual.columns # Check for column + assert len(actual["Salinity"].dropna()) == 185562 # Number of results # Confirm orginal data was not altered - orig_val_col = 'ResultMeasureValue' # Values + orig_val_col = "ResultMeasureValue" # Values assert actual[orig_val_col].equals(NARROW_RESULTS2[orig_val_col]) - orig_unit_col = 'ResultMeasure/MeasureUnitCode' # Units + orig_unit_col = "ResultMeasure/MeasureUnitCode" # Units assert actual[orig_unit_col].equals(NARROW_RESULTS2[orig_unit_col]) # Inspect specific result - where units are not converted - assert actual.iloc[3][orig_unit_col] == 'PSS' # Confirm orig unit - expected_unit = 'Practical_Salinity_Units' # Desired units - assert str(actual.iloc[3]['Salinity'].units) == expected_unit + assert actual.iloc[3][orig_unit_col] == "PSS" # Confirm orig unit + expected_unit = "Practical_Salinity_Units" # Desired units + assert str(actual.iloc[3]["Salinity"].units) == expected_unit expected_val = float(actual.iloc[3][orig_val_col]) # Original value - assert actual.iloc[3]['Salinity'].magnitude == expected_val # Unchanged + assert actual.iloc[3]["Salinity"].magnitude == expected_val # Unchanged # Inspect specific result - where units converted (ptth) - assert actual.iloc[0][orig_unit_col] == 'ppth' # Confirm orig unit - assert str(actual.iloc[0]['Salinity'].units) == expected_unit - assert actual.iloc[0][orig_val_col] == '40' # Confirm original measure - assert actual.iloc[0]['Salinity'].magnitude == 40 + assert actual.iloc[0][orig_unit_col] == "ppth" # Confirm orig unit + assert str(actual.iloc[0]["Salinity"].units) == expected_unit + assert actual.iloc[0][orig_val_col] == "40" # Confirm original measure + assert actual.iloc[0]["Salinity"].magnitude == 40 # Inspect specific result - where units converted (mg/ml) # TODO: need a different test value (something weird here) - assert actual.iloc[335435][orig_unit_col] == 'mg/mL @25C' # Confirm unit - assert str(actual.iloc[335435]['Salinity'].units) + assert actual.iloc[335435][orig_unit_col] == "mg/mL @25C" # Confirm unit + assert str(actual.iloc[335435]["Salinity"].units) assert actual.iloc[335435][orig_val_col] == 120.0 # Confirm measure - assert actual.iloc[335435]['Salinity'].magnitude == 125.28127999999992 - psu_example = str(actual.iloc[335435]['Salinity']) + assert actual.iloc[335435]["Salinity"].magnitude == 125.28127999999992 + psu_example = str(actual.iloc[335435]["Salinity"]) # Inspect specific result - where units missing - assert str(actual.iloc[21277][orig_unit_col]) == 'nan' # Confirm missing + assert str(actual.iloc[21277][orig_unit_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered units - expected_flag = 'ResultMeasure/MeasureUnitCode: MISSING UNITS, PSS assumed' - actual_flags = actual.iloc[21277]['QA_flag'].split('; ') + expected_flag = "ResultMeasure/MeasureUnitCode: MISSING UNITS, PSS assumed" + actual_flags = actual.iloc[21277]["QA_flag"].split("; ") assert actual_flags[1] == expected_flag # Check value unchagned for missing units - # TODO: values would stay the same (no 
conversion), but this example is '*Not Reported' + # TODO: values would stay the same (no conversion), but this example is + # '*Not Reported' # Inspect specific result - where value missing - assert str(actual.iloc[69781][orig_val_col]) == 'nan' # Confirm missing + assert str(actual.iloc[69781][orig_val_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered values - expected_flag = 'ResultMeasureValue: missing (NaN) result' - assert actual.iloc[69781]['QA_flag'].split('; ')[0] == expected_flag + expected_flag = "ResultMeasureValue: missing (NaN) result" + assert actual.iloc[69781]["QA_flag"].split("; ")[0] == expected_flag # Inspect specific result - un-usable non-numeric values - assert actual.iloc[21277][orig_val_col] == '*Not Reported' + assert actual.iloc[21277][orig_val_col] == "*Not Reported" # Confirm expected flag - for un-usable value expected_flag = 'ResultMeasureValue: "*Not Reported" result cannot be used' - assert actual.iloc[21277]['QA_flag'].split('; ')[0] == expected_flag + assert actual.iloc[21277]["QA_flag"].split("; ")[0] == expected_flag # Backward test PSU to density density = convert.PSU_to_density(psu_example) - assert str(density) == '997.1428971400308 milligram / milliliter' + assert str(density) == "997.1428971400308 milligram / milliliter" + -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_harmonize_pH(): """ Test function standardizes pH results correctly @@ -615,51 +642,53 @@ def test_harmonize_pH(): Read from data/wqp_results2.txt. """ # actual1 = harmonize.harmonize_pH(NARROW_RESULTS2, units='dimensionless') - actual = harmonize.harmonize(NARROW_RESULTS2, 'pH') + actual = harmonize.harmonize(NARROW_RESULTS2, "pH") # Test that the dataframe has expected type, size, cols, and rows assert isinstance(actual, pandas.core.frame.DataFrame) # Test type assert actual.size == 12181392 # Test size - assert 'pH' in actual.columns # Check for column - assert len(actual['pH'].dropna()) == 152314 # Number of results + assert "pH" in actual.columns # Check for column + assert len(actual["pH"].dropna()) == 152314 # Number of results # Confirm orginal data was not altered - orig_val_col = 'ResultMeasureValue' # Values + orig_val_col = "ResultMeasureValue" # Values assert actual[orig_val_col].equals(NARROW_RESULTS2[orig_val_col]) - orig_unit_col = 'ResultMeasure/MeasureUnitCode' # Units + orig_unit_col = "ResultMeasure/MeasureUnitCode" # Units assert actual[orig_unit_col].equals(NARROW_RESULTS2[orig_unit_col]) # Inspect specific result - where units are not converted - assert actual.iloc[1][orig_unit_col] == 'None' # Confirm orig unit - expected_unit = 'dimensionless' # Desired units - assert str(actual.iloc[1]['pH'].units) == expected_unit + assert actual.iloc[1][orig_unit_col] == "None" # Confirm orig unit + expected_unit = "dimensionless" # Desired units + assert str(actual.iloc[1]["pH"].units) == expected_unit expected_val = float(actual.iloc[1][orig_val_col]) # Original value - assert actual.iloc[1]['pH'].magnitude == expected_val # Unchanged + assert actual.iloc[1]["pH"].magnitude == expected_val # Unchanged # Inspect specific result - where units converted - assert actual.iloc[1][orig_unit_col] == 'None' # Confirm orig unit - assert str(actual.iloc[1]['pH'].units) == expected_unit - assert actual.iloc[1][orig_val_col] == '8.18' # Confirm original measure - assert actual.iloc[1]['pH'].magnitude == 8.18 + assert actual.iloc[1][orig_unit_col] == "None" # Confirm orig unit + assert 
str(actual.iloc[1]["pH"].units) == expected_unit + assert actual.iloc[1][orig_val_col] == "8.18" # Confirm original measure + assert actual.iloc[1]["pH"].magnitude == 8.18 # Inspect specific result - where units missing - assert str(actual.iloc[195644][orig_unit_col]) == 'nan' # Confirm missing + assert str(actual.iloc[195644][orig_unit_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered units - expected_flag = 'ResultMeasure/MeasureUnitCode: MISSING UNITS, dimensionless assumed' - actual_flags = actual.iloc[195644]['QA_flag'].split('; ') + expected_flag = ( + "ResultMeasure/MeasureUnitCode: MISSING UNITS, dimensionless assumed" + ) + actual_flags = actual.iloc[195644]["QA_flag"].split("; ") assert actual_flags[0] == expected_flag # Check value unchanged for missing units expected_val = float(actual.iloc[195644][orig_val_col]) # Original value - assert actual.iloc[195644]['pH'].magnitude == expected_val # Unchanged + assert actual.iloc[195644]["pH"].magnitude == expected_val # Unchanged # Inspect specific result - where value missing - assert str(actual.iloc[77966][orig_val_col]) == 'nan' # Confirm missing + assert str(actual.iloc[77966][orig_val_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered values - expected_flag = 'ResultMeasureValue: missing (NaN) result' - assert actual.iloc[77966]['QA_flag'].split('; ')[0] == expected_flag + expected_flag = "ResultMeasureValue: missing (NaN) result" + assert actual.iloc[77966]["QA_flag"].split("; ")[0] == expected_flag # Inspect specific result - un-usable non-numeric values - assert actual.iloc[2641][orig_val_col] == '*Not Reported' + assert actual.iloc[2641][orig_val_col] == "*Not Reported" # Confirm expected flag - for un-usable value expected_flag = 'ResultMeasureValue: "*Not Reported" result cannot be used' - assert actual.iloc[2641]['QA_flag'].split('; ')[0] == expected_flag + assert actual.iloc[2641]["QA_flag"].split("; ")[0] == expected_flag -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_harmonize_nitrogen(): """ Test function standardizes Nitrogen results correctly @@ -670,58 +699,59 @@ def test_harmonize_nitrogen(): Read from data/wqp_results3.txt. 
""" # actual1 = harmonize.harmonize_Nitrogen(NARROW_RESULTS3, units='mg/l') - actual = harmonize.harmonize(NARROW_RESULTS3, 'Nitrogen') + actual = harmonize.harmonize(NARROW_RESULTS3, "Nitrogen") # Test that the dataframe has expected type, size, cols, and rows assert isinstance(actual, pandas.core.frame.DataFrame) # Test type - assert actual.size == 16728 # Test size - assert 'Nitrogen' in actual.columns # Check for column - assert len(actual['Nitrogen'].dropna()) == 182 # Number of results + assert actual.size == 16728 # Test size + assert "Nitrogen" in actual.columns # Check for column + assert len(actual["Nitrogen"].dropna()) == 182 # Number of results # Confirm orginal data was not altered - orig_val_col = 'ResultMeasureValue' # Values + orig_val_col = "ResultMeasureValue" # Values assert actual[orig_val_col].equals(NARROW_RESULTS3[orig_val_col]) - orig_unit_col = 'ResultMeasure/MeasureUnitCode' # Units + orig_unit_col = "ResultMeasure/MeasureUnitCode" # Units assert actual[orig_unit_col].equals(NARROW_RESULTS3[orig_unit_col]) # Inspect specific result - where units are not converted - assert actual.iloc[55][orig_unit_col] == 'mg/l' # Confirm orig unit - expected_unit = 'milligram / liter' # Desired units - assert str(actual.iloc[55]['Nitrogen'].units) == expected_unit + assert actual.iloc[55][orig_unit_col] == "mg/l" # Confirm orig unit + expected_unit = "milligram / liter" # Desired units + assert str(actual.iloc[55]["Nitrogen"].units) == expected_unit expected_val = float(actual.iloc[55][orig_val_col]) # Original value - assert actual.iloc[55]['Nitrogen'].magnitude == expected_val # Unchanged + assert actual.iloc[55]["Nitrogen"].magnitude == expected_val # Unchanged # Inspect specific result - where units converted - assert actual.iloc[245][orig_unit_col] == 'g/m**3' # Confirm orig unit - assert str(actual.iloc[245]['Nitrogen'].units) == expected_unit - assert actual.iloc[245][orig_val_col] == '1' # Confirm original measure - assert actual.iloc[245]['Nitrogen'].magnitude == 1.0000000000000002 + assert actual.iloc[245][orig_unit_col] == "g/m**3" # Confirm orig unit + assert str(actual.iloc[245]["Nitrogen"].units) == expected_unit + assert actual.iloc[245][orig_val_col] == "1" # Confirm original measure + assert actual.iloc[245]["Nitrogen"].magnitude == 1.0000000000000002 # Inspect specific result - where units missing - assert str(actual.iloc[211][orig_unit_col]) == 'nan' # Confirm missing + assert str(actual.iloc[211][orig_unit_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered units - expected_flag = 'ResultMeasure/MeasureUnitCode: MISSING UNITS, mg/l assumed' - actual_flags = actual.iloc[211]['QA_flag'].split('; ') + expected_flag = "ResultMeasure/MeasureUnitCode: MISSING UNITS, mg/l assumed" + actual_flags = actual.iloc[211]["QA_flag"].split("; ") assert actual_flags[1] == expected_flag # Check value unchagned for missing units # TODO: values would stay the same (no conversion), but this example is nan # Inspect specific result - where value missing - assert str(actual.iloc[211][orig_val_col]) == 'nan' # Confirm missing + assert str(actual.iloc[211][orig_val_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered values - expected_flag = 'ResultMeasureValue: missing (NaN) result' - assert actual.iloc[211]['QA_flag'].split('; ')[0] == expected_flag + expected_flag = "ResultMeasureValue: missing (NaN) result" + assert actual.iloc[211]["QA_flag"].split("; ")[0] == expected_flag # Inspect specific result - un-usable non-numeric 
values - assert actual.iloc[240][orig_val_col] == 'Not reported' + assert actual.iloc[240][orig_val_col] == "Not reported" # Confirm expected flag - for un-usable value expected_flag = 'ResultMeasureValue: "Not reported" result cannot be used' - assert actual.iloc[240]['QA_flag'].split('; ')[0] == expected_flag + assert actual.iloc[240]["QA_flag"].split("; ")[0] == expected_flag # TODO: add test case where 'g/kg' # TODO: add test case where 'cm3/g @STP' # TODO: add test case where 'cm3/g STP' - + # check sample fraction, everything went to total mixed forms - assert len(actual['Nitrogen'].dropna()) == 182, "Fraction issue" - fract_col = 'TOTAL NITROGEN_ MIXED FORMS' + assert len(actual["Nitrogen"].dropna()) == 182, "Fraction issue" + fract_col = "TOTAL NITROGEN_ MIXED FORMS" assert len(actual[fract_col].dropna()) == 182, "Fraction issue" -#@pytest.mark.skip(reason="no change") + +# @pytest.mark.skip(reason="no change") def test_harmonize_conductivity(): """ Test function standardizes Conductivity results correctly @@ -731,52 +761,52 @@ def test_harmonize_conductivity(): NARROW_RESULTS3 : pandas.DataFrame Read from data/wqp_results3.txt. """ - #actual1 = harmonize.harmonize_Conductivity(NARROW_RESULTS3, units='uS/cm') - actual = harmonize.harmonize(NARROW_RESULTS3, 'Conductivity') + # actual1 = harmonize.harmonize_Conductivity(NARROW_RESULTS3, units='uS/cm') + actual = harmonize.harmonize(NARROW_RESULTS3, "Conductivity") # Test that the dataframe has expected type, size, cols, and rows assert isinstance(actual, pandas.core.frame.DataFrame) # Test type assert actual.size == 16236 # Test size - assert 'Conductivity' in actual.columns # Check for column - assert len(actual['Conductivity'].dropna()) == 59 # Number of results + assert "Conductivity" in actual.columns # Check for column + assert len(actual["Conductivity"].dropna()) == 59 # Number of results # Confirm orginal data was not altered - orig_val_col = 'ResultMeasureValue' # Values + orig_val_col = "ResultMeasureValue" # Values assert actual[orig_val_col].equals(NARROW_RESULTS3[orig_val_col]) - orig_unit_col = 'ResultMeasure/MeasureUnitCode' # Units + orig_unit_col = "ResultMeasure/MeasureUnitCode" # Units assert actual[orig_unit_col].equals(NARROW_RESULTS3[orig_unit_col]) # Inspect specific result - where units are not converted - assert actual.iloc[79][orig_unit_col] == 'uS/cm' # Confirm orig unit - expected_unit = 'microsiemens / centimeter' # Desired units - assert str(actual.iloc[79]['Conductivity'].units) == expected_unit + assert actual.iloc[79][orig_unit_col] == "uS/cm" # Confirm orig unit + expected_unit = "microsiemens / centimeter" # Desired units + assert str(actual.iloc[79]["Conductivity"].units) == expected_unit expected_val = float(actual.iloc[79][orig_val_col]) # Original value - assert actual.iloc[79]['Conductivity'].magnitude == expected_val # Unchanged + assert actual.iloc[79]["Conductivity"].magnitude == expected_val # Unchanged # Inspect specific result - where units converted - assert actual.iloc[244][orig_unit_col] == 'mS/cm' # Confirm orig unit - assert str(actual.iloc[244]['Conductivity'].units) == expected_unit - assert actual.iloc[244][orig_val_col] == '1' # Confirm original measure - assert actual.iloc[244]['Conductivity'].magnitude == 1000.0 + assert actual.iloc[244][orig_unit_col] == "mS/cm" # Confirm orig unit + assert str(actual.iloc[244]["Conductivity"].units) == expected_unit + assert actual.iloc[244][orig_val_col] == "1" # Confirm original measure + assert actual.iloc[244]["Conductivity"].magnitude == 
1000.0 # Inspect specific result - where units missing - assert str(actual.iloc[241][orig_unit_col]) == 'nan' # Confirm missing + assert str(actual.iloc[241][orig_unit_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered units - expected_flag = 'ResultMeasure/MeasureUnitCode: MISSING UNITS, uS/cm assumed' - actual_flags = actual.iloc[241]['QA_flag'] + expected_flag = "ResultMeasure/MeasureUnitCode: MISSING UNITS, uS/cm assumed" + actual_flags = actual.iloc[241]["QA_flag"] assert actual_flags == expected_flag # Check value unchagned for missing units expected_val = float(actual.iloc[241][orig_val_col]) # Original value - assert actual.iloc[241]['Conductivity'].magnitude == expected_val # Unchanged + assert actual.iloc[241]["Conductivity"].magnitude == expected_val # Unchanged # Inspect specific result - where value missing - assert str(actual.iloc[242][orig_val_col]) == 'nan' # Confirm missing + assert str(actual.iloc[242][orig_val_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered values - expected_flag = 'ResultMeasureValue: missing (NaN) result' - assert actual.iloc[242]['QA_flag'].split('; ')[0] == expected_flag + expected_flag = "ResultMeasureValue: missing (NaN) result" + assert actual.iloc[242]["QA_flag"].split("; ")[0] == expected_flag # Inspect specific result - un-usable non-numeric values - assert actual.iloc[243][orig_val_col] == 'Not Reported' + assert actual.iloc[243][orig_val_col] == "Not Reported" # Confirm expected flag - for un-usable value expected_flag = 'ResultMeasureValue: "Not Reported" result cannot be used' - assert actual.iloc[243]['QA_flag'].split('; ')[0] == expected_flag + assert actual.iloc[243]["QA_flag"].split("; ")[0] == expected_flag -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_harmonize_carbon_organic(): """ Test function standardizes Organic carbon results correctly @@ -786,56 +816,56 @@ def test_harmonize_carbon_organic(): NARROW_RESULTS4 : pandas.DataFrame Read from data/wqp_results4.txt. 
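    A minimal sketch of the call under test (hedged, skipped doctest; fixture
    loaded as in this module). Results land in a shared 'Carbon' column and
    molar entries are converted to the mass-based default unit:

    >>> from harmonize_wq import harmonize
    >>> df = harmonize.harmonize(NARROW_RESULTS4, "Organic carbon")  # doctest: +SKIP
    >>> str(df.iloc[103084]["Carbon"].units)  # umol row checked below  # doctest: +SKIP
    'milligram / liter'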
""" - #actual1 = harmonize.harmonize_Carbon_organic(NARROW_RESULTS4, units='mg/l') - #actual2 = harmonize.harmonize_Carbon_organic(NARROW_RESULTS4, units='g/kg') - actual = harmonize.harmonize(NARROW_RESULTS4, 'Organic carbon') + # actual1 = harmonize.harmonize_Carbon_organic(NARROW_RESULTS4, units='mg/l') + # actual2 = harmonize.harmonize_Carbon_organic(NARROW_RESULTS4, units='g/kg') + actual = harmonize.harmonize(NARROW_RESULTS4, "Organic carbon") # Test that the dataframe has expected type, size, cols, and rows assert isinstance(actual, pandas.core.frame.DataFrame) # Test type assert actual.size == 6906695 # Test size - assert 'Carbon' in actual.columns # Check for column - assert len(actual['Carbon'].dropna()) == 30631 # Number of results + assert "Carbon" in actual.columns # Check for column + assert len(actual["Carbon"].dropna()) == 30631 # Number of results # Confirm orginal data was not altered - orig_val_col = 'ResultMeasureValue' # Values + orig_val_col = "ResultMeasureValue" # Values assert actual[orig_val_col].equals(NARROW_RESULTS4[orig_val_col]) - orig_unit_col = 'ResultMeasure/MeasureUnitCode' # Units + orig_unit_col = "ResultMeasure/MeasureUnitCode" # Units assert actual[orig_unit_col].equals(NARROW_RESULTS4[orig_unit_col]) # Inspect specific result - where units are not converted - assert actual.iloc[1][orig_unit_col] == 'mg/l' # Confirm orig unit - expected_unit = 'milligram / liter' # Desired units - assert str(actual.iloc[1]['Carbon'].units) == expected_unit + assert actual.iloc[1][orig_unit_col] == "mg/l" # Confirm orig unit + expected_unit = "milligram / liter" # Desired units + assert str(actual.iloc[1]["Carbon"].units) == expected_unit expected_val = float(actual.iloc[1][orig_val_col]) # Original value - assert actual.iloc[1]['Carbon'].magnitude == expected_val # Unchanged + assert actual.iloc[1]["Carbon"].magnitude == expected_val # Unchanged # Inspect specific result - where units converted - assert actual.iloc[355][orig_unit_col] == '%' # Confirm orig unit - assert str(actual.iloc[355]['Carbon'].units) == expected_unit - assert actual.iloc[355][orig_val_col] == '0.1' # Confirm original measure - assert actual.iloc[355]['Carbon'].magnitude == 1000.0 + assert actual.iloc[355][orig_unit_col] == "%" # Confirm orig unit + assert str(actual.iloc[355]["Carbon"].units) == expected_unit + assert actual.iloc[355][orig_val_col] == "0.1" # Confirm original measure + assert actual.iloc[355]["Carbon"].magnitude == 1000.0 # Confirm expected flag - for missing/infered units - expected_flag = 'ResultMeasure/MeasureUnitCode: MISSING UNITS, mg/l assumed' - actual_flags = actual.iloc[103082]['QA_flag'] + expected_flag = "ResultMeasure/MeasureUnitCode: MISSING UNITS, mg/l assumed" + actual_flags = actual.iloc[103082]["QA_flag"] assert actual_flags == expected_flag # Check value unchagned for missing units expected_val = float(actual.iloc[103082][orig_val_col]) # Original value - assert actual.iloc[103082]['Carbon'].magnitude == expected_val # Unchanged + assert actual.iloc[103082]["Carbon"].magnitude == expected_val # Unchanged # Inspect specific result - where value missing - assert str(actual.iloc[22044][orig_val_col]) == 'nan' # Confirm missing + assert str(actual.iloc[22044][orig_val_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered values - expected_flag = 'ResultMeasureValue: missing (NaN) result' - assert actual.iloc[22044]['QA_flag'].split('; ')[0] == expected_flag + expected_flag = "ResultMeasureValue: missing (NaN) result" + assert 
actual.iloc[22044]["QA_flag"].split("; ")[0] == expected_flag # Inspect specific result - un-usable non-numeric values - assert actual.iloc[0][orig_val_col] == '*Non-detect' + assert actual.iloc[0][orig_val_col] == "*Non-detect" # Confirm expected flag - for un-usable value expected_flag = 'ResultMeasureValue: "*Non-detect" result cannot be used' - assert actual.iloc[0]['QA_flag'].split('; ')[0] == expected_flag + assert actual.iloc[0]["QA_flag"].split("; ")[0] == expected_flag # Moles test - assert actual.iloc[103084][orig_unit_col] == 'umol' # Confirm orig unit + assert actual.iloc[103084][orig_unit_col] == "umol" # Confirm orig unit float(actual.iloc[103084][orig_val_col]) # Confirm original value - assert str(actual.iloc[103084]['Carbon'].units) == expected_unit - assert actual.iloc[103084]['Carbon'].magnitude == 0.0477424 + assert str(actual.iloc[103084]["Carbon"].units) == expected_unit + assert actual.iloc[103084]["Carbon"].magnitude == 0.0477424 -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_harmonize_chlorophyll_a(): """ Test function standardizes Chlorophyll a results correctly @@ -845,50 +875,50 @@ def test_harmonize_chlorophyll_a(): NARROW_RESULTS4 : pandas.DataFrame Read from data/wqp_results4.txt. """ - actual = harmonize.harmonize(NARROW_RESULTS4, 'Chlorophyll a') + actual = harmonize.harmonize(NARROW_RESULTS4, "Chlorophyll a") # Test that the dataframe has expected type, size, cols, and rows assert isinstance(actual, pandas.core.frame.DataFrame) # Test type assert actual.size == 6803610 # Test size - assert 'Chlorophyll' in actual.columns # Check for column - assert len(actual['Chlorophyll'].dropna()) == 68201 # Number of results + assert "Chlorophyll" in actual.columns # Check for column + assert len(actual["Chlorophyll"].dropna()) == 68201 # Number of results # Confirm orginal data was not altered - orig_val_col = 'ResultMeasureValue' # Values + orig_val_col = "ResultMeasureValue" # Values assert actual[orig_val_col].equals(NARROW_RESULTS4[orig_val_col]) - orig_unit_col = 'ResultMeasure/MeasureUnitCode' # Units + orig_unit_col = "ResultMeasure/MeasureUnitCode" # Units assert actual[orig_unit_col].equals(NARROW_RESULTS4[orig_unit_col]) # Inspect specific result - where units are not converted - assert actual.iloc[47190][orig_unit_col] == 'mg/l' # Confirm orig unit - expected_unit = 'milligram / liter' # Desired units - assert str(actual.iloc[47190]['Chlorophyll'].units) == expected_unit + assert actual.iloc[47190][orig_unit_col] == "mg/l" # Confirm orig unit + expected_unit = "milligram / liter" # Desired units + assert str(actual.iloc[47190]["Chlorophyll"].units) == expected_unit expected_val = float(actual.iloc[47190][orig_val_col]) # Original value - assert actual.iloc[47190]['Chlorophyll'].magnitude == expected_val # Unchanged + assert actual.iloc[47190]["Chlorophyll"].magnitude == expected_val # Unchanged # Inspect specific result - where units converted - assert actual.iloc[345][orig_unit_col] == 'ug/l' # Confirm orig unit - assert str(actual.iloc[345]['Chlorophyll'].units) == expected_unit - assert actual.iloc[345][orig_val_col] == '2.28' # Confirm original measure - assert actual.iloc[345]['Chlorophyll'].magnitude == 0.00228 + assert actual.iloc[345][orig_unit_col] == "ug/l" # Confirm orig unit + assert str(actual.iloc[345]["Chlorophyll"].units) == expected_unit + assert actual.iloc[345][orig_val_col] == "2.28" # Confirm original measure + assert actual.iloc[345]["Chlorophyll"].magnitude == 0.00228 # Inspect specific 
result - where units missing - assert str(actual.iloc[12618][orig_unit_col]) == 'nan' # Confirm missing + assert str(actual.iloc[12618][orig_unit_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered units - expected_flag = 'ResultMeasure/MeasureUnitCode: MISSING UNITS, mg/l assumed' - actual_flags = actual.iloc[12618]['QA_flag'] + expected_flag = "ResultMeasure/MeasureUnitCode: MISSING UNITS, mg/l assumed" + actual_flags = actual.iloc[12618]["QA_flag"] assert actual_flags == expected_flag # Check value unchagned for missing units expected_val = float(actual.iloc[12618][orig_val_col]) # Original value - assert actual.iloc[12618]['Chlorophyll'].magnitude == expected_val # Unchanged + assert actual.iloc[12618]["Chlorophyll"].magnitude == expected_val # Unchanged # Inspect specific result - where value missing - assert str(actual.iloc[947][orig_val_col]) == 'nan' # Confirm missing + assert str(actual.iloc[947][orig_val_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered values - expected_flag = 'ResultMeasureValue: missing (NaN) result' - assert actual.iloc[947]['QA_flag'].split('; ')[0] == expected_flag + expected_flag = "ResultMeasureValue: missing (NaN) result" + assert actual.iloc[947]["QA_flag"].split("; ")[0] == expected_flag # Inspect specific result - un-usable non-numeric values assert actual.iloc[103081][orig_val_col] == "Not Reported" # Confirm expected flag - for un-usable value expected_flag = 'ResultMeasureValue: "Not Reported" result cannot be used' - assert actual.iloc[103081]['QA_flag'].split('; ')[0] == expected_flag + assert actual.iloc[103081]["QA_flag"].split("; ")[0] == expected_flag -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_harmonize_turbidity(): """ Test function standardizes Turbidity results correctly @@ -900,67 +930,69 @@ def test_harmonize_turbidity(): NARROW_RESULTS5 : pandas.DataFrame Read from data/wqp_results5.txt. 
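    Turbidity arrives in several unit systems (NTU, JTU, NTRU, cm of
    transparency, mg/l SiO2) that all harmonize to NTU; note the cm case is a
    nonlinear conversion. A hedged, skipped-doctest sketch:

    >>> from harmonize_wq import harmonize
    >>> df = harmonize.harmonize(NARROW_RESULTS5, "Turbidity")  # doctest: +SKIP
    >>> df.iloc[58433]["Turbidity"].magnitude  # 60 cm transparency  # doctest: +SKIP
    8.17455929421168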
""" - actual = harmonize.harmonize(NARROW_RESULTS5, 'Turbidity') + actual = harmonize.harmonize(NARROW_RESULTS5, "Turbidity") # Test that the dataframe has expected type, size, cols, and rows assert isinstance(actual, pandas.core.frame.DataFrame) # Test type assert actual.size == 8628100 # Test size - assert 'Turbidity' in actual.columns # Check for column - assert len(actual['Turbidity'].dropna()) == 131013 # Number of results + assert "Turbidity" in actual.columns # Check for column + assert len(actual["Turbidity"].dropna()) == 131013 # Number of results # Confirm orginal data was not altered - orig_val_col = 'ResultMeasureValue' # Values + orig_val_col = "ResultMeasureValue" # Values assert actual[orig_val_col].equals(NARROW_RESULTS5[orig_val_col]) - orig_unit_col = 'ResultMeasure/MeasureUnitCode' # Units + orig_unit_col = "ResultMeasure/MeasureUnitCode" # Units assert actual[orig_unit_col].equals(NARROW_RESULTS5[orig_unit_col]) # Inspect specific result - where units are not converted - assert actual.iloc[1][orig_unit_col] == 'NTU' # Confirm orig unit - expected_unit = 'Nephelometric_Turbidity_Units' # Desired units - assert str(actual.iloc[1]['Turbidity'].units) == expected_unit + assert actual.iloc[1][orig_unit_col] == "NTU" # Confirm orig unit + expected_unit = "Nephelometric_Turbidity_Units" # Desired units + assert str(actual.iloc[1]["Turbidity"].units) == expected_unit expected_val = float(actual.iloc[1][orig_val_col]) # Original value - assert actual.iloc[1]['Turbidity'].magnitude == expected_val # Unchanged + assert actual.iloc[1]["Turbidity"].magnitude == expected_val # Unchanged # Inspect specific result - where units converted - assert actual.iloc[58433][orig_unit_col] == 'cm' # Confirm orig unit - assert str(actual.iloc[58433]['Turbidity'].units) == expected_unit - assert actual.iloc[58433][orig_val_col] == '60' # Confirm original measure - assert actual.iloc[58433]['Turbidity'].magnitude == 8.17455929421168 #16.046015096322353 + assert actual.iloc[58433][orig_unit_col] == "cm" # Confirm orig unit + assert str(actual.iloc[58433]["Turbidity"].units) == expected_unit + assert actual.iloc[58433][orig_val_col] == "60" # Confirm original measure + assert ( + actual.iloc[58433]["Turbidity"].magnitude == 8.17455929421168 + ) # 16.046015096322353 # JTU -> NTU - assert actual.iloc[100158][orig_unit_col] == 'JTU' # Confirm orig unit - assert str(actual.iloc[100158]['Turbidity'].units) == expected_unit + assert actual.iloc[100158][orig_unit_col] == "JTU" # Confirm orig unit + assert str(actual.iloc[100158]["Turbidity"].units) == expected_unit assert actual.iloc[100158][orig_val_col] == 5.0 # Confirm original measure - assert actual.iloc[100158]['Turbidity'].magnitude == 95.0773 + assert actual.iloc[100158]["Turbidity"].magnitude == 95.0773 # mg/l SiO2 -> NTU - assert actual.iloc[126494][orig_unit_col] == 'mg/l SiO2' # Original unit - assert str(actual.iloc[126494]['Turbidity'].units) == expected_unit - assert actual.iloc[126494][orig_val_col] == '4.0' # Confirm original measure - assert actual.iloc[126494]['Turbidity'].magnitude == 30.378500000000003 + assert actual.iloc[126494][orig_unit_col] == "mg/l SiO2" # Original unit + assert str(actual.iloc[126494]["Turbidity"].units) == expected_unit + assert actual.iloc[126494][orig_val_col] == "4.0" # Confirm original measure + assert actual.iloc[126494]["Turbidity"].magnitude == 30.378500000000003 # NTRU == NTU - assert actual.iloc[124849][orig_unit_col] == 'NTRU' # Confirm orig unit - assert str(actual.iloc[124849]['Turbidity'].units) == 
expected_unit - assert actual.iloc[124849][orig_val_col] == '0.7' # Confirm original measure - assert actual.iloc[124849]['Turbidity'].magnitude == 0.7 + assert actual.iloc[124849][orig_unit_col] == "NTRU" # Confirm orig unit + assert str(actual.iloc[124849]["Turbidity"].units) == expected_unit + assert actual.iloc[124849][orig_val_col] == "0.7" # Confirm original measure + assert actual.iloc[124849]["Turbidity"].magnitude == 0.7 # Inspect specific result - where units missing - assert str(actual.iloc[132736][orig_unit_col]) == 'nan' # Confirm missing + assert str(actual.iloc[132736][orig_unit_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered units - expected_flag = 'ResultMeasure/MeasureUnitCode: MISSING UNITS, NTU assumed' - actual_flags = actual.iloc[132736]['QA_flag'].split('; ') + expected_flag = "ResultMeasure/MeasureUnitCode: MISSING UNITS, NTU assumed" + actual_flags = actual.iloc[132736]["QA_flag"].split("; ") assert actual_flags[0] == expected_flag # Check value unchagned for missing units expected_val = float(actual.iloc[132736][orig_val_col]) # Original value - assert actual.iloc[132736]['Turbidity'].magnitude == expected_val # Unchanged + assert actual.iloc[132736]["Turbidity"].magnitude == expected_val # Unchanged # Inspect specific result - where value missing - assert str(actual.iloc[19988][orig_val_col]) == 'nan' # Confirm missing + assert str(actual.iloc[19988][orig_val_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered values - expected_flag = 'ResultMeasureValue: missing (NaN) result' - assert actual.iloc[19988]['QA_flag'].split('; ')[0] == expected_flag + expected_flag = "ResultMeasureValue: missing (NaN) result" + assert actual.iloc[19988]["QA_flag"].split("; ")[0] == expected_flag # Inspect specific result - un-usable non-numeric values - assert actual.iloc[42][orig_val_col] == '*Not Reported' + assert actual.iloc[42][orig_val_col] == "*Not Reported" # Confirm expected flag - for un-usable value expected_flag = 'ResultMeasureValue: "*Not Reported" result cannot be used' - assert actual.iloc[42]['QA_flag'].split('; ')[0] == expected_flag + assert actual.iloc[42]["QA_flag"].split("; ")[0] == expected_flag -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_harmonize_sediment(): """ Test function standardizes Sediment results correctly @@ -977,53 +1009,51 @@ def test_harmonize_sediment(): NARROW_RESULTS5 : pandas.DataFrame Read from data/wqp_results5.txt. 
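    A hedged sketch of the units_out override exercised here (skipped
    doctest; fixture as in this module). With 'g/kg' requested, percent
    values scale by a factor of 10:

    >>> from harmonize_wq import harmonize
    >>> df = harmonize.harmonize(NARROW_RESULTS5, char_val="Sediment", units_out="g/kg")  # doctest: +SKIP
    >>> df.iloc[128909]["Sediment"].magnitude  # 17 % -> g/kg  # doctest: +SKIP
    170.0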
""" - actual = harmonize.harmonize(NARROW_RESULTS5, - char_val='Sediment', - units_out='g/kg') + actual = harmonize.harmonize(NARROW_RESULTS5, char_val="Sediment", units_out="g/kg") # Test that the dataframe has expected type, size, cols, and rows assert isinstance(actual, pandas.core.frame.DataFrame) # Test type assert actual.size == 8628100 # Test size - assert 'Sediment' in actual.columns # Check for column - assert len(actual['Sediment'].dropna()) == 37 # Number of results + assert "Sediment" in actual.columns # Check for column + assert len(actual["Sediment"].dropna()) == 37 # Number of results # Confirm orginal data was not altered - orig_val_col = 'ResultMeasureValue' # Values + orig_val_col = "ResultMeasureValue" # Values assert actual[orig_val_col].equals(NARROW_RESULTS5[orig_val_col]) - orig_unit_col = 'ResultMeasure/MeasureUnitCode' # Units + orig_unit_col = "ResultMeasure/MeasureUnitCode" # Units assert actual[orig_unit_col].equals(NARROW_RESULTS5[orig_unit_col]) # Inspect specific result - where units are not converted - assert actual.iloc[132737][orig_unit_col] == 'g/kg' # Confirm orig unit - expected_unit = 'gram / kilogram' # Desired units - assert str(actual.iloc[132737]['Sediment'].units) == expected_unit + assert actual.iloc[132737][orig_unit_col] == "g/kg" # Confirm orig unit + expected_unit = "gram / kilogram" # Desired units + assert str(actual.iloc[132737]["Sediment"].units) == expected_unit expected_val = float(actual.iloc[132737][orig_val_col]) # Original value - assert actual.iloc[132737]['Sediment'].magnitude == expected_val # Unchanged + assert actual.iloc[132737]["Sediment"].magnitude == expected_val # Unchanged # Inspect specific result - where units converted - assert actual.iloc[128909][orig_unit_col] == '%' # Confirm orig unit - assert str(actual.iloc[128909]['Sediment'].units) == expected_unit - assert actual.iloc[128909][orig_val_col] == '17' # Confirm original measure - assert actual.iloc[128909]['Sediment'].magnitude == 170.0 + assert actual.iloc[128909][orig_unit_col] == "%" # Confirm orig unit + assert str(actual.iloc[128909]["Sediment"].units) == expected_unit + assert actual.iloc[128909][orig_val_col] == "17" # Confirm original measure + assert actual.iloc[128909]["Sediment"].magnitude == 170.0 # Inspect specific result - where units missing - assert str(actual.iloc[132738][orig_unit_col]) == 'nan' # Confirm missing + assert str(actual.iloc[132738][orig_unit_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered units - expected_flag = 'ResultMeasure/MeasureUnitCode: MISSING UNITS, g/kg assumed' - actual_flags = actual.iloc[132738]['QA_flag'].split('; ') + expected_flag = "ResultMeasure/MeasureUnitCode: MISSING UNITS, g/kg assumed" + actual_flags = actual.iloc[132738]["QA_flag"].split("; ") assert actual_flags[0] == expected_flag # Check value unchagned for missing units expected_val = float(actual.iloc[132738][orig_val_col]) # Original value - assert actual.iloc[132738]['Sediment'].magnitude == expected_val # Unchanged + assert actual.iloc[132738]["Sediment"].magnitude == expected_val # Unchanged # Inspect specific result - where value missing - assert str(actual.iloc[126342][orig_val_col]) == 'nan' # Confirm missing + assert str(actual.iloc[126342][orig_val_col]) == "nan" # Confirm missing # Confirm expected flag - for missing value - expected_flag = 'ResultMeasureValue: missing (NaN) result' - assert actual.iloc[126342]['QA_flag'].split('; ')[0] == expected_flag + expected_flag = "ResultMeasureValue: missing (NaN) result" 
+ assert actual.iloc[126342]["QA_flag"].split("; ")[0] == expected_flag # Inspect specific result - un-usable non-numeric values - assert actual.iloc[132739][orig_val_col] == 'Not Reported' + assert actual.iloc[132739][orig_val_col] == "Not Reported" # Confirm expected flag - for un-usable value expected_flag = 'ResultMeasureValue: "Not Reported" result cannot be used' - assert actual.iloc[132739]['QA_flag'].split('; ')[0] == expected_flag + assert actual.iloc[132739]["QA_flag"].split("; ")[0] == expected_flag # TODO: add units mg/l -#@pytest.mark.skip(reason="not implemented") +# @pytest.mark.skip(reason="not implemented") def test_harmonize_phosphorus_plus(): """ Test function standardizes varied Phosphorus results correctly @@ -1034,7 +1064,8 @@ def test_harmonize_phosphorus_plus(): Read from data/wqp_results6.txt. """ -#@pytest.mark.skip(reason="not implemented") + +# @pytest.mark.skip(reason="not implemented") def test_harmonize_nitrogen_plus(): """ Test function standardizes varied Nitrogen results correctly @@ -1045,7 +1076,8 @@ def test_harmonize_nitrogen_plus(): Read from data/wqp_results6.txt. """ -#@pytest.mark.skip(reason="not implemented") + +# @pytest.mark.skip(reason="not implemented") def test_harmonize_sediment_plus(): """ Test function standardizes varied Sediment results correctly @@ -1057,7 +1089,7 @@ def test_harmonize_sediment_plus(): """ -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_harmonize_fecal_coliform(): """ Test function standardizes Fecal Coliform results correctly @@ -1067,50 +1099,51 @@ def test_harmonize_fecal_coliform(): NARROW_RESULTS7 : pandas.DataFrame Read from data/wqp_results7.txt. """ - actual = harmonize.harmonize(NARROW_RESULTS7, 'Fecal Coliform') + actual = harmonize.harmonize(NARROW_RESULTS7, "Fecal Coliform") # Test that the dataframe has expected type, size, cols, and rows assert isinstance(actual, pandas.core.frame.DataFrame) # Test type assert actual.size == 8778720 # Test size - assert 'Fecal_Coliform' in actual.columns # Check for column - assert len(actual['Fecal_Coliform'].dropna()) == 68264 # Number of results + assert "Fecal_Coliform" in actual.columns # Check for column + assert len(actual["Fecal_Coliform"].dropna()) == 68264 # Number of results # Confirm orginal data was not altered - orig_val_col = 'ResultMeasureValue' # Values + orig_val_col = "ResultMeasureValue" # Values assert actual[orig_val_col].equals(NARROW_RESULTS7[orig_val_col]) - orig_unit_col = 'ResultMeasure/MeasureUnitCode' # Units + orig_unit_col = "ResultMeasure/MeasureUnitCode" # Units assert actual[orig_unit_col].equals(NARROW_RESULTS7[orig_unit_col]) # Inspect specific result - where units are not converted - assert actual.iloc[3][orig_unit_col] == 'cfu/100ml' # Confirm orig unit - expected_unit = 'Colony_Forming_Units / milliliter' # Desired units - assert str(actual.iloc[3]['Fecal_Coliform'].units) == expected_unit + assert actual.iloc[3][orig_unit_col] == "cfu/100ml" # Confirm orig unit + expected_unit = "Colony_Forming_Units / milliliter" # Desired units + assert str(actual.iloc[3]["Fecal_Coliform"].units) == expected_unit expected_val = float(actual.iloc[3][orig_val_col]) # Original value - assert actual.iloc[3]['Fecal_Coliform'].magnitude == expected_val # Unchanged + assert actual.iloc[3]["Fecal_Coliform"].magnitude == expected_val # Unchanged # Inspect specific result - where units converted - assert actual.iloc[0][orig_unit_col] == '#/100ml' # Confirm orig unit - assert 
-    assert actual.iloc[0][orig_val_col] == '2'  # Confirm original measure
-    assert actual.iloc[0]['Fecal_Coliform'].magnitude == 2.0
+    assert actual.iloc[0][orig_unit_col] == "#/100ml"  # Confirm orig unit
+    assert str(actual.iloc[0]["Fecal_Coliform"].units) == expected_unit
+    assert actual.iloc[0][orig_val_col] == "2"  # Confirm original measure
+    assert actual.iloc[0]["Fecal_Coliform"].magnitude == 2.0
     # Inspect specific result - where units missing
-    assert str(actual.iloc[1][orig_unit_col]) == 'nan'  # Confirm missing
+    assert str(actual.iloc[1][orig_unit_col]) == "nan"  # Confirm missing
     # Confirm expected flag - for missing/inferred units
-    expected_flag = 'ResultMeasure/MeasureUnitCode: MISSING UNITS, CFU/(100ml) assumed'
-    actual_flags = actual.iloc[1]['QA_flag'].split('; ')
+    expected_flag = "ResultMeasure/MeasureUnitCode: MISSING UNITS, CFU/(100ml) assumed"
+    actual_flags = actual.iloc[1]["QA_flag"].split("; ")
     assert actual_flags[1] == expected_flag
     # Check value unchanged for missing units
     expected_val = float(actual.iloc[3][orig_val_col])  # Original value
-    assert actual.iloc[3]['Fecal_Coliform'].magnitude == expected_val  # Unchanged
+    assert actual.iloc[3]["Fecal_Coliform"].magnitude == expected_val  # Unchanged
     # Inspect specific result - where value missing
-    assert str(actual.iloc[1][orig_val_col]) == '*Non-detect'  # Confirm missing
+    assert str(actual.iloc[1][orig_val_col]) == "*Non-detect"  # Confirm missing
     # Confirm expected flag - for missing value
-    expected_flag = 'ResultMeasureValue: "*Non-detect" result cannot be used; ResultMeasure/MeasureUnitCode: MISSING UNITS, CFU/(100ml) assumed'
-    assert actual.iloc[1]['QA_flag'] == expected_flag
+    expected_flag = 'ResultMeasureValue: "*Non-detect" result cannot be used; '
+    expected_flag += "ResultMeasure/MeasureUnitCode: MISSING UNITS, CFU/(100ml) assumed"
+    assert actual.iloc[1]["QA_flag"] == expected_flag
     # Inspect specific result - un-usable non-numeric values
-    assert actual.iloc[75305][orig_val_col] == 'Not Reported'
+    assert actual.iloc[75305][orig_val_col] == "Not Reported"
     # Confirm expected flag - for un-usable value
     expected_flag = 'ResultMeasureValue: "Not Reported" result cannot be used'
-    assert actual.iloc[75305]['QA_flag'].split('; ')[0] == expected_flag
+    assert actual.iloc[75305]["QA_flag"].split("; ")[0] == expected_flag


-#@pytest.mark.skip(reason="no change")
+# @pytest.mark.skip(reason="no change")
 def test_harmonize_E_Coli():
     """
     Test function standardizes Escherichia Coliform (E. Coli) results correctly
@@ -1120,59 +1153,59 @@ def test_harmonize_E_Coli():
     NARROW_RESULTS7 : pandas.DataFrame
         Read from data/wqp_results7.txt.
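    [Editor's note] Illustrative only, not part of this patch: QA flags
    accumulate in a single '; '-delimited string, which is why the assertions
    below recover individual flags with str.split:

    >>> flags = ('ResultMeasureValue: missing (NaN) result; '
    ...          'ResultMeasure/MeasureUnitCode: MISSING UNITS, CFU/(100ml) assumed')
    >>> flags.split('; ')[0]
    'ResultMeasureValue: missing (NaN) result'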
""" - actual = harmonize.harmonize(NARROW_RESULTS7, 'Escherichia coli') + actual = harmonize.harmonize(NARROW_RESULTS7, "Escherichia coli") # Test that the dataframe has expected type, size, cols, and rows assert isinstance(actual, pandas.core.frame.DataFrame) # Test type assert actual.size == 8778720 # Test size - assert 'E_coli' in actual.columns # Check for column - assert len(actual['E_coli'].dropna()) == 7205 # Number of results + assert "E_coli" in actual.columns # Check for column + assert len(actual["E_coli"].dropna()) == 7205 # Number of results # Confirm orginal data was not altered - orig_val_col = 'ResultMeasureValue' # Values + orig_val_col = "ResultMeasureValue" # Values assert actual[orig_val_col].equals(NARROW_RESULTS7[orig_val_col]) - orig_unit_col = 'ResultMeasure/MeasureUnitCode' # Units + orig_unit_col = "ResultMeasure/MeasureUnitCode" # Units assert actual[orig_unit_col].equals(NARROW_RESULTS7[orig_unit_col]) # Inspect specific result - where units are not converted - assert actual.iloc[59267][orig_unit_col] == 'cfu/100ml' # Confirm orig unit - expected_unit = 'Colony_Forming_Units / milliliter' # Desired units - assert str(actual.iloc[59267]['E_coli'].units) == expected_unit + assert actual.iloc[59267][orig_unit_col] == "cfu/100ml" # Confirm orig unit + expected_unit = "Colony_Forming_Units / milliliter" # Desired units + assert str(actual.iloc[59267]["E_coli"].units) == expected_unit expected_val = float(actual.iloc[59267][orig_val_col]) # Original value - assert actual.iloc[59267]['E_coli'].magnitude == expected_val # Unchanged + assert actual.iloc[59267]["E_coli"].magnitude == expected_val # Unchanged # Inspect specific result - where units converted - assert actual.iloc[28804][orig_unit_col] == 'MPN/100ml' # Confirm orig unit - assert str(actual.iloc[28804]['E_coli'].units) == expected_unit - assert actual.iloc[28804][orig_val_col] == '7.3' # Confirm original measure - assert actual.iloc[28804]['E_coli'].magnitude == 7.3 + assert actual.iloc[28804][orig_unit_col] == "MPN/100ml" # Confirm orig unit + assert str(actual.iloc[28804]["E_coli"].units) == expected_unit + assert actual.iloc[28804][orig_val_col] == "7.3" # Confirm original measure + assert actual.iloc[28804]["E_coli"].magnitude == 7.3 # Inspect specific result - where units missing - assert str(actual.iloc[108916][orig_unit_col]) == 'nan' # Confirm missing + assert str(actual.iloc[108916][orig_unit_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered units - expected_flag = 'ResultMeasureValue: missing (NaN) result' - actual_flags = actual.iloc[108916]['QA_flag'].split('; ') + expected_flag = "ResultMeasureValue: missing (NaN) result" + actual_flags = actual.iloc[108916]["QA_flag"].split("; ") assert actual_flags[0] == expected_flag # Check value unchagned for missing units expected_val = float(actual.iloc[59267][orig_val_col]) # Original value - assert actual.iloc[59267]['E_coli'].magnitude == expected_val # Unchanged + assert actual.iloc[59267]["E_coli"].magnitude == expected_val # Unchanged # Inspect specific result - where value missing - assert str(actual.iloc[28805][orig_val_col]) == 'nan' # Confirm missing + assert str(actual.iloc[28805][orig_val_col]) == "nan" # Confirm missing # Confirm expected flag - for missing/infered values - expected_flag = 'ResultMeasureValue: missing (NaN) result' - assert actual.iloc[28805]['QA_flag'].split('; ')[0] == expected_flag + expected_flag = "ResultMeasureValue: missing (NaN) result" + assert actual.iloc[28805]["QA_flag"].split("; ")[0] 
== expected_flag # Inspect specific result - un-usable non-numeric values - assert actual.iloc[69168 ][orig_val_col] == '*Not Reported' + assert actual.iloc[69168][orig_val_col] == "*Not Reported" # Confirm expected flag - for un-usable value expected_flag = 'ResultMeasureValue: "*Not Reported" result cannot be used' - assert actual.iloc[69168 ]['QA_flag'].split('; ')[0] == expected_flag + assert actual.iloc[69168]["QA_flag"].split("; ")[0] == expected_flag -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_conductivity_to_PSU(harmonized_tables): - conductivity_series = harmonized_tables['Conductivity'].dropna() + conductivity_series = harmonized_tables["Conductivity"].dropna() # With wrapper it should have to be converted to string first conductivity_series_str = conductivity_series.apply(str) actual = conductivity_series_str.apply(convert.conductivity_to_PSU) # No loss of rows assert len(actual) == len(conductivity_series) # Check it is dimensionless - assert str(actual[0].units) == 'dimensionless' + assert str(actual[0].units) == "dimensionless" # Check conversion was accurate assert conductivity_series[0].magnitude == 111.0 assert actual[0].magnitude == 0.057 @@ -1180,43 +1213,44 @@ def test_conductivity_to_PSU(harmonized_tables): assert actual[244].magnitude == 0.493 -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_accept_methods(merged_tables): - actual = clean.methods_check(merged_tables, 'Phosphorus') + actual = clean.methods_check(merged_tables, "Phosphorus") actual.sort() # Order is inconsistent so it's sorted - expected = ['365.1', '365.3', '365.4', '4500-P-E', '4500-P-F'] + expected = ["365.1", "365.3", "365.4", "4500-P-E", "4500-P-F"] assert actual == expected -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_datetime(harmonized_tables): # Testit actual = clean.datetime(harmonized_tables) # Type for time field - assert isinstance(actual['Activity_datetime'][0], - pandas._libs.tslibs.timestamps.Timestamp) + assert isinstance( + actual["Activity_datetime"][0], pandas._libs.tslibs.timestamps.Timestamp + ) -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_split_col(harmonized_tables): # Testit with default QA actual_QA = wrangle.split_col(harmonized_tables) # Check for expected columns - assert 'QA_Nitrogen' in actual_QA.columns - assert 'QA_Conductivity' in actual_QA.columns - assert 'QA_flag' not in actual_QA.columns + assert "QA_Nitrogen" in actual_QA.columns + assert "QA_Conductivity" in actual_QA.columns + assert "QA_flag" not in actual_QA.columns # Testit with non-default column - col = 'ResultAnalyticalMethod/MethodIdentifier' - actual_methods = wrangle.split_col(harmonized_tables, col, 'MethodID') - assert 'MethodID_Nitrogen' in actual_methods.columns - assert 'MethodID_Conductivity' in actual_methods.columns + col = "ResultAnalyticalMethod/MethodIdentifier" + actual_methods = wrangle.split_col(harmonized_tables, col, "MethodID") + assert "MethodID_Nitrogen" in actual_methods.columns + assert "MethodID_Conductivity" in actual_methods.columns assert col not in actual_methods.columns # TODO: test when out_col is list (i.e., Phosphorus) -#@pytest.mark.skip(reason="no change") +# @pytest.mark.skip(reason="no change") def test_split_table(harmonized_tables): # Note: it will do datetime() as well actual_main, actual_chars = wrangle.split_table(harmonized_tables) @@ -1224,82 +1258,115 @@ def 
test_split_table(harmonized_tables): assert len(actual_main) == len(harmonized_tables) assert len(actual_chars) == len(harmonized_tables) # Check columns expected - expected = ['OrganizationIdentifier', 'OrganizationFormalName', - 'ActivityIdentifier', 'ProjectIdentifier', - 'MonitoringLocationIdentifier', - 'DetectionQuantitationLimitTypeName', - 'DetectionQuantitationLimitMeasure/MeasureValue', - 'DetectionQuantitationLimitMeasure/MeasureUnitCode', - 'ProviderName', 'QA_flag', 'Nitrogen', 'Speciation', - 'TOTAL NITROGEN_ MIXED FORMS', 'Conductivity', - 'Activity_datetime', 'Depth'] + expected = [ + "OrganizationIdentifier", + "OrganizationFormalName", + "ActivityIdentifier", + "ProjectIdentifier", + "MonitoringLocationIdentifier", + "DetectionQuantitationLimitTypeName", + "DetectionQuantitationLimitMeasure/MeasureValue", + "DetectionQuantitationLimitMeasure/MeasureUnitCode", + "ProviderName", + "QA_flag", + "Nitrogen", + "Speciation", + "TOTAL NITROGEN_ MIXED FORMS", + "Conductivity", + "Activity_datetime", + "Depth", + ] assert list(actual_main.columns) == expected - expected = ['ActivityStartDate', 'ActivityStartTime/Time', - 'ActivityStartTime/TimeZoneCode', - 'ResultDetectionConditionText', - 'MethodSpecificationName', 'CharacteristicName', - 'ResultSampleFractionText', 'ResultMeasureValue', - 'ResultMeasure/MeasureUnitCode', 'MeasureQualifierCode', - 'ResultStatusIdentifier', 'StatisticalBaseCode', - 'ResultValueTypeName', 'ResultWeightBasisText', - 'ResultTimeBasisText', 'ResultTemperatureBasisText', - 'ResultParticleSizeBasisText', 'PrecisionValue', - 'ResultCommentText', 'USGSPCode', - 'ResultDepthHeightMeasure/MeasureValue', - 'ResultDepthHeightMeasure/MeasureUnitCode', - 'ResultDepthAltitudeReferencePointText', - 'SubjectTaxonomicName', 'SampleTissueAnatomyName', - 'ResultAnalyticalMethod/MethodIdentifier', - 'ResultAnalyticalMethod/MethodIdentifierContext', - 'ResultAnalyticalMethod/MethodName', - 'MethodDescriptionText', 'LaboratoryName', - 'AnalysisStartDate', 'ResultLaboratoryCommentText', - 'ActivityTypeCode', 'ActivityMediaName', - 'ActivityMediaSubdivisionName', 'ActivityEndDate', - 'ActivityEndTime/Time', 'ActivityEndTime/TimeZoneCode', - 'ActivityDepthHeightMeasure/MeasureValue', - 'ActivityDepthHeightMeasure/MeasureUnitCode', - 'ActivityDepthAltitudeReferencePointText', - 'ActivityTopDepthHeightMeasure/MeasureValue', - 'ActivityTopDepthHeightMeasure/MeasureUnitCode', - 'ActivityBottomDepthHeightMeasure/MeasureValue', - 'ActivityBottomDepthHeightMeasure/MeasureUnitCode', - 'ActivityConductingOrganizationText', - 'ActivityCommentText', 'SampleAquifer', - 'HydrologicCondition', 'HydrologicEvent', - 'SampleCollectionMethod/MethodIdentifier', - 'SampleCollectionMethod/MethodIdentifierContext', - 'SampleCollectionMethod/MethodName', - 'SampleCollectionEquipmentName', 'PreparationStartDate', ] + expected = [ + "ActivityStartDate", + "ActivityStartTime/Time", + "ActivityStartTime/TimeZoneCode", + "ResultDetectionConditionText", + "MethodSpecificationName", + "CharacteristicName", + "ResultSampleFractionText", + "ResultMeasureValue", + "ResultMeasure/MeasureUnitCode", + "MeasureQualifierCode", + "ResultStatusIdentifier", + "StatisticalBaseCode", + "ResultValueTypeName", + "ResultWeightBasisText", + "ResultTimeBasisText", + "ResultTemperatureBasisText", + "ResultParticleSizeBasisText", + "PrecisionValue", + "ResultCommentText", + "USGSPCode", + "ResultDepthHeightMeasure/MeasureValue", + "ResultDepthHeightMeasure/MeasureUnitCode", + "ResultDepthAltitudeReferencePointText", + 
"SubjectTaxonomicName", + "SampleTissueAnatomyName", + "ResultAnalyticalMethod/MethodIdentifier", + "ResultAnalyticalMethod/MethodIdentifierContext", + "ResultAnalyticalMethod/MethodName", + "MethodDescriptionText", + "LaboratoryName", + "AnalysisStartDate", + "ResultLaboratoryCommentText", + "ActivityTypeCode", + "ActivityMediaName", + "ActivityMediaSubdivisionName", + "ActivityEndDate", + "ActivityEndTime/Time", + "ActivityEndTime/TimeZoneCode", + "ActivityDepthHeightMeasure/MeasureValue", + "ActivityDepthHeightMeasure/MeasureUnitCode", + "ActivityDepthAltitudeReferencePointText", + "ActivityTopDepthHeightMeasure/MeasureValue", + "ActivityTopDepthHeightMeasure/MeasureUnitCode", + "ActivityBottomDepthHeightMeasure/MeasureValue", + "ActivityBottomDepthHeightMeasure/MeasureUnitCode", + "ActivityConductingOrganizationText", + "ActivityCommentText", + "SampleAquifer", + "HydrologicCondition", + "HydrologicEvent", + "SampleCollectionMethod/MethodIdentifier", + "SampleCollectionMethod/MethodIdentifierContext", + "SampleCollectionMethod/MethodName", + "SampleCollectionEquipmentName", + "PreparationStartDate", + ] assert list(actual_chars.columns) == expected -#test viz + +# test viz def test_map_counts(test_harmonize_locations, test_harmonize_temperature): - actual = viz.map_counts(test_harmonize_temperature, - test_harmonize_locations, - 'Temperature') - assert len(actual['cnt']) == 21075 - assert sum(actual['cnt']) == 346210 + actual = viz.map_counts( + test_harmonize_temperature, test_harmonize_locations, "Temperature" + ) + assert len(actual["cnt"]) == 21075 + assert sum(actual["cnt"]) == 346210 + def test_map_measure(test_harmonize_locations, test_harmonize_temperature): - actual = viz.map_measure(test_harmonize_temperature, - test_harmonize_locations, - 'Temperature') - assert len(actual['mean']) == 21075 - assert sum(actual['mean']) == 523776.35504297394 + actual = viz.map_measure( + test_harmonize_temperature, test_harmonize_locations, "Temperature" + ) + assert len(actual["mean"]) == 21075 + assert sum(actual["mean"]) == 523776.35504297394 + def test_station_summary(test_harmonize_temperature): - actual = viz.station_summary(test_harmonize_temperature, 'Temperature') - assert list(actual.columns) == ['MonitoringLocationIdentifier', 'cnt', 'mean'] - assert len(actual['cnt']) == 21075 - assert sum(actual['cnt']) == 346210 - assert len(actual['mean']) == 21075 - assert sum(actual['mean']) == 523776.35504297394 + actual = viz.station_summary(test_harmonize_temperature, "Temperature") + assert list(actual.columns) == ["MonitoringLocationIdentifier", "cnt", "mean"] + assert len(actual["cnt"]) == 21075 + assert sum(actual["cnt"]) == 346210 + assert len(actual["mean"]) == 21075 + assert sum(actual["mean"]) == 523776.35504297394 + def test_print_report(test_harmonize_temperature, capsys): - viz.print_report(test_harmonize_temperature, - 'Temperature', - 'ResultMeasure/MeasureUnitCode') + viz.print_report( + test_harmonize_temperature, "Temperature", "ResultMeasure/MeasureUnitCode" + ) captured, err = capsys.readouterr() expected = "-Usable results-\ncount 346210.000000\n" expected += "mean 25.175700\nstd 143.175647\n" @@ -1309,4 +1376,4 @@ def test_print_report(test_harmonize_temperature, capsys): expected += "Unusable results: 13295\n" expected += "Usable results with inferred units: 0\n" expected += "Results outside threshold (0.0 to 884.2295835882991): 4\n" - assert captured == expected \ No newline at end of file + assert captured == expected diff --git a/harmonize_wq/visualize.py 
b/harmonize_wq/visualize.py index 5943baf..edec83f 100644 --- a/harmonize_wq/visualize.py +++ b/harmonize_wq/visualize.py @@ -1,8 +1,11 @@ # -*- coding: utf-8 -*- """Functions to help visualize data.""" + from math import sqrt -import pandas + import geopandas +import pandas + from harmonize_wq.wrangle import merge_tables @@ -23,10 +26,10 @@ def print_report(results_in, out_col, unit_col_in, threshold=None): Returns ------- None. - + See Also -------- - See any of the 'Detailed' notebooks found in + See any of the 'Detailed' notebooks found in `demos `_ for examples of how this function is leveraged by the :func:`harmonize.harmonize_generic` report argument. @@ -35,26 +38,27 @@ def print_report(results_in, out_col, unit_col_in, threshold=None): # Series with just usable results. results = results_in[out_col].dropna() # Series with infered units - inferred = results_in.loc[((results_in[out_col].notna()) & - (results_in[unit_col_in].isna()))] + inferred = results_in.loc[ + ((results_in[out_col].notna()) & (results_in[unit_col_in].isna())) + ] # Series with just magnitude results_s = pandas.Series([x.magnitude for x in results]) # Number of usable results - print(f'-Usable results-\n{results_s.describe()}') + print(f"-Usable results-\n{results_s.describe()}") # Number measures unused - print(f'Unusable results: {len(results_in)-len(results)}') + print(f"Unusable results: {len(results_in)-len(results)}") # Number of infered result units - print(f'Usable results with inferred units: {len(inferred)}') + print(f"Usable results with inferred units: {len(inferred)}") # Results outside thresholds if not threshold: # TODO: Default mean +/-1 standard deviation works here but generally 6 - threshold = {'min': 0.0, - 'max': results_s.mean() + (6 * results_s.std())} - inside = results_s[(results_s <= threshold['max']) & - (results_s >= threshold['min'])] + threshold = {"min": 0.0, "max": results_s.mean() + (6 * results_s.std())} + inside = results_s[ + (results_s <= threshold["max"]) & (results_s >= threshold["min"]) + ] diff = len(results) - len(inside) threshold_range = f"{threshold['min']} to {threshold['max']}" - print(f'Results outside threshold ({threshold_range}): {diff}') + print(f"Results outside threshold ({threshold_range}): {diff}") # Graphic representation of stats inside.hist(bins=int(sqrt(inside.count()))) @@ -84,7 +88,7 @@ def map_counts(df_in, gdf, col=None): Examples -------- Build example DataFrame of results: - + >>> from pandas import DataFrame >>> df_in = DataFrame({'ResultMeasureValue': [5.1, 1.2, 8.7], ... 'MonitoringLocationIdentifier': ['ID1', 'ID2', 'ID1'] @@ -94,9 +98,9 @@ def map_counts(df_in, gdf, col=None): 0 5.1 ID1 1 1.2 ID2 2 8.7 ID1 - + Build example GeoDataFrame of monitoring locations: - + >>> import geopandas >>> from shapely.geometry import Point >>> from numpy import nan @@ -108,9 +112,9 @@ def map_counts(df_in, gdf, col=None): MonitoringLocationIdentifier QA_flag geometry 0 ID1 NaN POINT (1.00000 2.00000) 1 ID2 NaN POINT (2.00000 1.00000) - + Combine these to get an aggregation of results per station: - + >>> import harmonize_wq >>> cnt_gdf = harmonize_wq.visualize.map_counts(df_in, gdf) >>> cnt_gdf @@ -124,26 +128,26 @@ def map_counts(df_in, gdf, col=None): """ # Column for station - loc_id = 'MonitoringLocationIdentifier' + loc_id = "MonitoringLocationIdentifier" # TODO: col is going to be used to restrict results, if none use all if col is not None: cols = [loc_id, col] df_in = df_in.loc[df_in[col].notna(), cols].copy() # TODO: cols needed? 
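    # [Editor's note] Minimal self-contained sketch, not part of this patch, of
    # the groupby pattern used just below: size() counts rows per station and
    # to_frame() names the resulting count column. 'demo' names are hypothetical.
    import pandas

    demo = pandas.DataFrame({"MonitoringLocationIdentifier": ["ID1", "ID2", "ID1"]})
    demo_cnt = demo.groupby("MonitoringLocationIdentifier").size().to_frame("cnt")
    assert list(demo_cnt["cnt"]) == [2, 1]  # two results at ID1, one at ID2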
# Map counts of all results - df_cnt = df_in.groupby(loc_id).size().to_frame('cnt') + df_cnt = df_in.groupby(loc_id).size().to_frame("cnt") df_cnt.reset_index(inplace=True) # Join it to geometry - merge_cols = ['MonitoringLocationIdentifier'] - gdf_cols = ['geometry', 'QA_flag'] + merge_cols = ["MonitoringLocationIdentifier"] + gdf_cols = ["geometry", "QA_flag"] results_df = merge_tables(df_cnt, gdf, gdf_cols, merge_cols) - return geopandas.GeoDataFrame(results_df, geometry='geometry') + return geopandas.GeoDataFrame(results_df, geometry="geometry") def map_measure(df_in, gdf, col): """Get GeoDataFrame summarized by average of results for each station. - + :class:`geopandas.GeoDataFrame` will have new column 'mean' with the average of col values for that location. @@ -160,17 +164,17 @@ def map_measure(df_in, gdf, col): ------- geopandas.GeoDataFrame GeoDataFrame with average value of results for each station. - + Examples -------- Build array of pint Quantity for Temperature: - + >>> from pint import Quantity >>> u = 'degree_Celsius' >>> temperatures = [Quantity(5.1, u), Quantity(1.2, u), Quantity(8.7, u)] - + Build example pandas DataFrame of results: - + >>> from pandas import DataFrame >>> df_in = DataFrame({'Temperature': temperatures, ... 'MonitoringLocationIdentifier': ['ID1', 'ID2', 'ID1'] @@ -180,9 +184,9 @@ def map_measure(df_in, gdf, col): 0 5.1 degree_Celsius ID1 1 1.2 degree_Celsius ID2 2 8.7 degree_Celsius ID1 - + Build example geopandas GeoDataFrame of monitoring locations: - + >>> import geopandas >>> from shapely.geometry import Point >>> from numpy import nan @@ -194,9 +198,9 @@ def map_measure(df_in, gdf, col): MonitoringLocationIdentifier QA_flag geometry 0 ID1 NaN POINT (1.00000 2.00000) 1 ID2 NaN POINT (2.00000 1.00000) - + Combine these to get an aggregation of results per station: - + >>> from harmonize_wq import visualize >>> avg_temp = visualize.map_measure(df_in, gdf, 'Temperature') >>> avg_temp @@ -209,7 +213,7 @@ def map_measure(df_in, gdf, col): >>> avg_temp.plot(column='mean', cmap='Blues', legend=True) """ - merge_cols = ['MonitoringLocationIdentifier'] + merge_cols = ["MonitoringLocationIdentifier"] if merge_cols[0] not in df_in.columns: df_temp = df_in.reset_index() # May be part of index already @@ -219,16 +223,16 @@ def map_measure(df_in, gdf, col): df_agg = station_summary(df_temp, col) # Join it to geometry - gdf_cols = ['geometry', 'QA_flag'] + gdf_cols = ["geometry", "QA_flag"] results_df = merge_tables(df_agg, gdf, gdf_cols, merge_cols) - return geopandas.GeoDataFrame(results_df, geometry='geometry') + return geopandas.GeoDataFrame(results_df, geometry="geometry") def station_summary(df_in, col): """Get summary table for stations. - - Summary table as :class:`~pandas.DataFrame` with rows for each + + Summary table as :class:`~pandas.DataFrame` with rows for each station, count, and column average. Parameters @@ -244,16 +248,16 @@ def station_summary(df_in, col): Table with result count and average summarized by station. 
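    [Editor's note] Illustrative only, not part of this patch. For the pint
    results used in the map_measure example above (5.1 and 8.7 degree_Celsius
    at 'ID1', 1.2 at 'ID2'), the summary would have cnt of [2, 1] and
    mean of [6.9, 1.2]:

    >>> station_summary(df_in, 'Temperature')  # doctest: +SKIP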
""" # Column for station - loc_id = 'MonitoringLocationIdentifier' + loc_id = "MonitoringLocationIdentifier" # Aggregate data by station to look at results spatially cols = [loc_id, col] df = df_in.loc[df_in[col].notna(), cols].copy() # Col w/ magnitude seperate from unit avg = [x.magnitude for x in df[col]] - df['magnitude'] = pandas.Series(avg, index=df[col].index) - df_agg = df.groupby(loc_id).size().to_frame('cnt') - cols = [loc_id, 'magnitude'] - df_agg['mean'] = df[cols].groupby(loc_id).mean() + df["magnitude"] = pandas.Series(avg, index=df[col].index) + df_agg = df.groupby(loc_id).size().to_frame("cnt") + cols = [loc_id, "magnitude"] + df_agg["mean"] = df[cols].groupby(loc_id).mean() df_agg.reset_index(inplace=True) return df_agg diff --git a/harmonize_wq/wq_data.py b/harmonize_wq/wq_data.py index 4d1cd25..f9d4dd5 100644 --- a/harmonize_wq/wq_data.py +++ b/harmonize_wq/wq_data.py @@ -1,13 +1,15 @@ # -*- coding: utf-8 -*- """Class for harmonizing data retrieved from EPA's Water Quality Portal.""" + from types import SimpleNamespace from warnings import warn + import pandas import pint from numpy import nan -from harmonize_wq import domains -from harmonize_wq import basis -from harmonize_wq.clean import df_checks, add_qa_flag + +from harmonize_wq import basis, domains +from harmonize_wq.clean import add_qa_flag, df_checks from harmonize_wq.convert import convert_unit_series, moles_to_mass @@ -32,7 +34,7 @@ def units_dimension(series_in, units, ureg=None): Examples -------- Build series to use as input: - + >>> from pandas import Series >>> unit_series = Series(['mg/l', 'mg/ml', 'g/kg']) >>> unit_series @@ -42,12 +44,12 @@ def units_dimension(series_in, units, ureg=None): dtype: object Get list of unique units not in desired units dimension 'mg/l': - + >>> from harmonize_wq import wq_data >>> wq_data.units_dimension(unit_series, units='mg/l') ['g/kg'] """ - #TODO: this should be a method + # TODO: this should be a method if ureg is None: ureg = pint.UnitRegistry() dim_list = [] # List for units with mismatched dimensions @@ -60,7 +62,7 @@ def units_dimension(series_in, units, ureg=None): return dim_list -class WQCharData(): +class WQCharData: """Class for specific characteristic in Water Quality Portal results. Parameters @@ -86,11 +88,11 @@ class WQCharData(): units : str Units all results in out_col column will be converted into. Default units are returned from :func:`domains.OUT_UNITS` [out_col]. - + Examples -------- Build pandas DataFrame to use as input: - + >>> from pandas import DataFrame >>> from numpy import nan >>> df = DataFrame({'CharacteristicName': ['Phosphorus', 'Temperature, water',], @@ -101,7 +103,7 @@ class WQCharData(): CharacteristicName ResultMeasure/MeasureUnitCode ResultMeasureValue 0 Phosphorus NaN 1.0 1 Temperature, water NaN 10.0 - + >>> from harmonize_wq import wq_data >>> wq = wq_data.WQCharData(df, 'Phosphorus') >>> wq.df @@ -110,7 +112,7 @@ class WQCharData(): 1 Temperature, water NaN ... 
NaN NaN [2 rows x 5 columns] - + >>> wq.df.columns Index(['CharacteristicName', 'ResultMeasure/MeasureUnitCode', 'ResultMeasureValue', 'Units', 'Phosphorus'], @@ -122,16 +124,17 @@ def __init__(self, df_in, char_val): df_out = df_in.copy() # self.check_df(df) df_checks(df_out) - c_mask = df_out['CharacteristicName'] == char_val + c_mask = df_out["CharacteristicName"] == char_val self.c_mask = c_mask # Deal with units: set out = in - cols = {'unit_in': 'ResultMeasure/MeasureUnitCode', - 'unit_out': 'Units', - 'measure': 'ResultMeasureValue', - 'basis': 'Speciation', } + cols = { + "unit_in": "ResultMeasure/MeasureUnitCode", + "unit_out": "Units", + "measure": "ResultMeasureValue", + "basis": "Speciation", + } self.col = SimpleNamespace(**cols) - df_out.loc[c_mask, self.col.unit_out] = df_out.loc[c_mask, - self.col.unit_in] + df_out.loc[c_mask, self.col.unit_out] = df_out.loc[c_mask, self.col.unit_in] self.df = df_out # Deal with values: set out_col = in self.out_col = domains.out_col_lookup[char_val] @@ -149,14 +152,13 @@ def _coerce_measure(self): meas_col = self.col.measure # Coerce bad measures in series to NaN - meas_s = pandas.to_numeric(df_out.loc[c_mask, meas_col], - errors='coerce') + meas_s = pandas.to_numeric(df_out.loc[c_mask, meas_col], errors="coerce") # Create a list of the bad measures in the series bad_measures = [df_out.iloc[i][meas_col] for i in meas_s[meas_s.isna()].index] for bad_meas in pandas.unique(bad_measures): # Flag each unique bad measure one measure (not row) at a time if pandas.isna(bad_meas): - flag = f'{meas_col}: missing (NaN) result' + flag = f"{meas_col}: missing (NaN) result" cond = c_mask & (df_out[meas_col].isna()) else: flag = f'{meas_col}: "{bad_meas}" result cannot be used' @@ -185,7 +187,7 @@ def _infer_units(self, flag_col=None): The default None uses WQCharData.col.unit_out instead. """ # QA flag for missing units - flag = self._unit_qa_flag('MISSING', flag_col) + flag = self._unit_qa_flag("MISSING", flag_col) # Update mask for missing units units_mask = self.c_mask & self.df[self.col.unit_out].isna() self.df = add_qa_flag(self.df, units_mask, flag) # Assign flag @@ -193,13 +195,12 @@ def _infer_units(self, flag_col=None): self.df.loc[units_mask, self.col.unit_out] = self.units # Note: .fillna(self.units) is slightly faster but hits datatype issues - def _unit_qa_flag(self, trouble, flag_col=None): """Generate a QA_flag flag string for the units column. - + If unit_col is a copy flag_col can specify the original column name for the flag. The default units, self.units replaces the problem unit. - + Parameters ---------- trouble : str @@ -207,20 +208,20 @@ def _unit_qa_flag(self, trouble, flag_col=None): flag_col : str, optional String to use when referring to the unit_col. The default None uses WQCharData.col.unit_out instead. - + Returns ------- string Flag to use in QA_flag column. """ if flag_col: - return f'{flag_col}: {trouble} UNITS, {self.units} assumed' + return f"{flag_col}: {trouble} UNITS, {self.units} assumed" # Else: Used when flag_col is None, typically the column being checked - return f'{self.col.unit_out}: {trouble} UNITS, {self.units} assumed' + return f"{self.col.unit_out}: {trouble} UNITS, {self.units} assumed" def _replace_in_col(self, col, old_val, new_val, mask=None): """Replace string throughout column, filter rows to skip by mask. 
- + Parameters ---------- df_in : pandas.DataFrame @@ -234,28 +235,28 @@ def _replace_in_col(self, col, old_val, new_val, mask=None): mask : pandas.Series Row conditional mask to only update a sub-set of rows. The default None uses 'CharacteristicName' mask instead. - + Returns ------- df_in : pandas.DataFrame Updated DataFrame. - + """ if mask is None: mask = self.c_mask df_in = self.df # Note: Timing is just as fast as long as df isn't copied # Timing for replace vs set unkown - mask_old = mask & (df_in[col]==old_val) - #str.replace did not work for short str to long str (over-replaces) - #df.loc[mask, col] = df.loc[mask, col].str.replace(old_val, new_val) + mask_old = mask & (df_in[col] == old_val) + # str.replace did not work for short str to long str (over-replaces) + # df.loc[mask, col] = df.loc[mask, col].str.replace(old_val, new_val) df_in.loc[mask_old, col] = new_val # This should be more explicit return df_in def _dimension_handling(self, unit, quant=None, ureg=None): """Handle and routes common dimension conversions/contexts. - + Parameters ---------- unit : str @@ -265,44 +266,44 @@ def _dimension_handling(self, unit, quant=None, ureg=None): ureg : pint.UnitRegistry, optional Unit Registry Object with any custom units defined. The default is None - + Returns ------- dict Dictionary with old_unit:new_unit. list List of Mole (substance) units. - + """ units = self.units if ureg is None: ureg = pint.UnitRegistry() # Conversion to moles performed a level up from here (class method) - if ureg(units).check({'[length]': -3, '[mass]': 1}): + if ureg(units).check({"[length]": -3, "[mass]": 1}): # Convert to density, e.g., '%' -> 'mg/l' - if ureg(unit).check({'[substance]': 1}): + if ureg(unit).check({"[substance]": 1}): if quant: # Moles -> mg/l; dim = ' / l' - return {unit: quant + ' / l'}, [quant + ' / l'] + return {unit: quant + " / l"}, [quant + " / l"] raise ValueError("Pint Quantity required for moles conversions") # Else assume it is dimensionless (e.g. unit = 'g/kg') - return {unit: unit + ' * H2O'}, [] + return {unit: unit + " * H2O"}, [] if ureg(units).dimensionless: # Convert to dimensionless, e.g., 'mg/l' -> '%' - if ureg(unit).check({'[substance]': 1}): + if ureg(unit).check({"[substance]": 1}): if quant: # Moles -> g/kg; dim = ' / l / H2O' - return {unit: quant + ' / l / H2O'}, [quant + ' / l / H2O'] + return {unit: quant + " / l / H2O"}, [quant + " / l / H2O"] raise ValueError("Pint Quantity required for moles conversions") # Else assume it is density (e.g. unit = 'mg/l') - return {unit: unit + ' / H2O'}, [] - warn('WARNING: Unexpected dimensionality') + return {unit: unit + " / H2O"}, [] + warn("WARNING: Unexpected dimensionality") return {}, [] def check_units(self, flag_col=None): """Check units. - + Checks for bad units that are missing (assumes default_unit) or unrecognized as valid by unit registry (ureg). Does not check for units in the correct dimensions, or a mistaken identity (e.g. 'deg F' @@ -313,29 +314,44 @@ def check_units(self, flag_col=None): flag_col : str, optional Column to reference in string for 'QA_flags'. The default None uses WQCharData.col.unit_out attribute. - + Returns ------- None. - + Examples -------- Build DataFrame to use as input: - + >>> from pandas import DataFrame >>> from numpy import nan - >>> df = DataFrame({'CharacteristicName': ['Phosphorus', 'Temperature, water', 'Phosphorus',], - ... 'ResultMeasure/MeasureUnitCode': [nan, nan, 'Unknown',], - ... 'ResultMeasureValue': ['1.0', '67.0', '10',], - ... }) + >>> df = DataFrame( + ... 
{ + ... "CharacteristicName": [ + ... "Phosphorus", + ... "Temperature, water", + ... "Phosphorus", + ... ], + ... "ResultMeasure/MeasureUnitCode": [ + ... nan, + ... nan, + ... "Unknown", + ... ], + ... "ResultMeasureValue": [ + ... "1.0", + ... "67.0", + ... "10", + ... ], + ... } + ... ) >>> df CharacteristicName ResultMeasure/MeasureUnitCode ResultMeasureValue 0 Phosphorus NaN 1.0 1 Temperature, water NaN 67.0 2 Phosphorus Unknown 10 - + Build WQ Characteristic Data class from pandas DataFrame: - + >>> from harmonize_wq import wq_data >>> wq = wq_data.WQCharData(df, 'Phosphorus') >>> wq.df.Units @@ -343,18 +359,18 @@ def check_units(self, flag_col=None): 1 NaN 2 Unknown Name: Units, dtype: object - + Run check_units method to replace bad or missing units for phosphorus: - + >>> wq.check_units() # doctest: +IGNORE_RESULT UserWarning: WARNING: 'Unknown' UNDEFINED UNIT for Phosphorus - + >>> wq.df[['CharacteristicName', 'Units', 'QA_flag']] CharacteristicName Units QA_flag 0 Phosphorus mg/l ResultMeasure/MeasureUnitCode: MISSING UNITS, ... 1 Temperature, water NaN NaN 2 Phosphorus mg/l ResultMeasure/MeasureUnitCode: 'Unknown' UNDEF... - + Note: it didn't infer units for 'Temperature, water' because wq is Phosphorus specific. """ @@ -385,7 +401,7 @@ def check_units(self, flag_col=None): df_out.loc[u_mask, self.col.unit_out] = self.units # Replace w/ default self.df = df_out - def check_basis(self, basis_col='MethodSpecificationName'): + def check_basis(self, basis_col="MethodSpecificationName"): """Determine speciation (basis) for measure. Parameters @@ -393,46 +409,53 @@ def check_basis(self, basis_col='MethodSpecificationName'): basis_col : str, optional Basis column name. Default is 'MethodSpecificationName' which is replaced by 'Speciation'. Other columns are updated in place. - + Returns ------- None. - + Examples -------- Build DataFrame to use as input: - + >>> from pandas import DataFrame >>> from numpy import nan - >>> df = DataFrame({'CharacteristicName': ['Phosphorus', 'Temperature, water', 'Phosphorus',], - ... 'ResultMeasure/MeasureUnitCode': ['mg/l as P', nan, 'mg/l',], - ... 'ResultMeasureValue': ['1.0', '67.0', '10',], - ... 'MethodSpecificationName': [nan, nan, 'as PO4',], - ... }) + >>> df = DataFrame( + ... { + ... "CharacteristicName": [ + ... "Phosphorus", + ... "Temperature, water", + ... "Phosphorus", + ... ], + ... "ResultMeasure/MeasureUnitCode": ["mg/l as P", nan, "mg/l",], + ... "ResultMeasureValue": ["1.0", "67.0", "10",], + ... "MethodSpecificationName": [nan, nan, "as PO4",], + ... } + ... 
) >>> df[['ResultMeasure/MeasureUnitCode', 'MethodSpecificationName']] ResultMeasure/MeasureUnitCode MethodSpecificationName 0 mg/l as P NaN 1 NaN NaN 2 mg/l as PO4 - + Build WQ Characteristic Data class from pandas DataFrame: - + >>> from harmonize_wq import wq_data >>> wq = wq_data.WQCharData(df, 'Phosphorus') >>> wq.df.columns # doctest: +NORMALIZE_WHITESPACE Index(['CharacteristicName', 'ResultMeasure/MeasureUnitCode', 'ResultMeasureValue', 'MethodSpecificationName', 'Units', 'Phosphorus'], dtype='object') - + Run check_basis method to speciation for phosphorus: - + >>> wq.check_basis() >>> wq.df[['MethodSpecificationName', 'Speciation']] MethodSpecificationName Speciation 0 NaN P 1 NaN NaN 2 as PO4 PO4 - + Note where basis was part of 'ResultMeasure/MeasureUnitCode' it has been removed in 'Units': @@ -452,8 +475,7 @@ def check_basis(self, basis_col='MethodSpecificationName'): df_checks(self.df, [basis_col]) # Basis from MethodSpecificationName - if basis_col == 'MethodSpecificationName': - + if basis_col == "MethodSpecificationName": # Add basis out column (i.e., 'Speciation') if it doesn't exist if self.col.basis not in self.df.columns: self.df[self.col.basis] = nan @@ -464,9 +486,9 @@ def check_basis(self, basis_col='MethodSpecificationName'): # Basis from unit try: basis_dict = basis.unit_basis_dict[self.out_col] - self.df[c_mask] = basis.basis_from_unit(self.df[c_mask], - basis_dict, - self.col.unit_out) + self.df[c_mask] = basis.basis_from_unit( + self.df[c_mask], basis_dict, self.col.unit_out + ) except KeyError: pass # Finish by filling any NAs with char_val based default @@ -479,14 +501,15 @@ def check_basis(self, basis_col='MethodSpecificationName'): self.df.loc[c_mask, col] = self.df.loc[c_mask, col].fillna(char_val) # Drop instances of 'as ' - self.df.loc[c_mask, col] = [bas[3:] - if bas.startswith('as ') else bas - for bas in self.df.loc[c_mask, col]] + self.df.loc[c_mask, col] = [ + bas[3:] if bas.startswith("as ") else bas + for bas in self.df.loc[c_mask, col] + ] else: - self.df[c_mask] = basis.update_result_basis(self.df[c_mask], - basis_col, - self.col.unit_out) + self.df[c_mask] = basis.update_result_basis( + self.df[c_mask], basis_col, self.col.unit_out + ) def update_ureg(self): """Update class unit registry to define units based on out_col.""" @@ -495,27 +518,27 @@ def update_ureg(self): def update_units(self, units_out): """Update class units attribute to convert everything into. - + This just updates the attribute, it does not perform the conversion. - + Parameters ---------- units_out : str Units to convert results into. - + Returns ------- None. - + Examples -------- Build WQ Characteristic Data class: - + >>> from harmonize_wq import wq_data >>> wq = wq_data.WQCharData(df, 'Phosphorus') >>> wq.units 'mg/l' - + >>> wq.update_units('mg/kg') >>> wq.units 'mg/kg' @@ -524,29 +547,36 @@ def update_units(self, units_out): def measure_mask(self, column=None): """Get mask for characteristic and valid measure. - + Mask is characteristic specific (c_mask) and only has valid col measures (Non-NA). - + Parameters ---------- column : str, optional DataFrame column name to use. Default None uses WQCharData.out_col attribute. - + Returns ------- None. - + Examples -------- Build DataFrame to use as input: - + >>> from pandas import DataFrame >>> from numpy import nan - >>> df = DataFrame({'CharacteristicName': ['Phosphorus', 'Temperature, water', 'Phosphorus', 'Phosphorus',], - ... 'ResultMeasure/MeasureUnitCode': ['mg/l as P', nan, 'mg/l', 'mg/l',], - ... 
'ResultMeasureValue': ['1.0', '67.0', '10', 'None'], + >>> df = DataFrame( + ... { + ... 'CharacteristicName': [ + ... 'Phosphorus', + ... 'Temperature, water', + ... 'Phosphorus', + ... 'Phosphorus', + ... ], + ... 'ResultMeasure/MeasureUnitCode': ['mg/l as P', nan, 'mg/l', 'mg/l',], + ... 'ResultMeasureValue': ['1.0', '67.0', '10', 'None'], ... }) >>> df CharacteristicName ResultMeasure/MeasureUnitCode ResultMeasureValue @@ -554,14 +584,14 @@ def measure_mask(self, column=None): 1 Temperature, water NaN 67.0 2 Phosphorus mg/l 10 3 Phosphorus mg/l None - + Build WQ Characteristic Data class from pandas DataFrame: - + >>> from harmonize_wq import wq_data >>> wq = wq_data.WQCharData(df, 'Phosphorus') - + Check measure mask: - + >>> wq.measure_mask() 0 True 1 False @@ -573,9 +603,9 @@ def measure_mask(self, column=None): return self.c_mask & self.df[column].notna() return self.c_mask & self.df[self.out_col].notna() - def convert_units(self, default_unit=None, errors='raise'): + def convert_units(self, default_unit=None, errors="raise"): """Update out-col to convert units. - + Update class out-col used to convert :class:`pandas.DataFrame`. from old units to default_unit. @@ -588,15 +618,15 @@ def convert_units(self, default_unit=None, errors='raise'): If ‘raise’, invalid dimension conversions will raise an exception. If ‘skip’, invalid dimension conversions will not be converted. If ‘ignore’, invalid dimension conversions will be NaN. - + Returns ------- None. - + Examples -------- Build pandas DataFrame to use as input: - + >>> from pandas import DataFrame >>> df = DataFrame({'CharacteristicName': ['Phosphorus', 'Temperature, water',], ... 'ResultMeasure/MeasureUnitCode': ['mg/ml', 'deg C'], @@ -606,12 +636,12 @@ def convert_units(self, default_unit=None, errors='raise'): CharacteristicName ResultMeasure/MeasureUnitCode ResultMeasureValue 0 Phosphorus mg/ml 1.0 1 Temperature, water deg C 10.0 - + Build WQ Characteristic Data class from pandas DataFrame: - + >>> from harmonize_wq import wq_data >>> wq = wq_data.WQCharData(df, 'Phosphorus') - + >>> wq.convert_units() >>> wq.df[['ResultMeasureValue', 'Units', 'Phosphorus']] ResultMeasureValue Units Phosphorus @@ -623,17 +653,19 @@ def convert_units(self, default_unit=None, errors='raise'): df_out = self.df m_mask = self.measure_mask() - params = {'quantity_series': df_out.loc[m_mask, self.out_col], - 'unit_series': df_out.loc[m_mask, self.col.unit_out], - 'units': self.units, - 'ureg': self.ureg, - 'errors': errors} + params = { + "quantity_series": df_out.loc[m_mask, self.out_col], + "unit_series": df_out.loc[m_mask, self.col.unit_out], + "units": self.units, + "ureg": self.ureg, + "errors": errors, + } df_out.loc[m_mask, self.out_col] = convert_unit_series(**params) self.df = df_out def apply_conversion(self, convert_fun, unit, u_mask=None): """Apply special dimension changing conversions. - + This uses functions in convert module and apply them across all cases of current unit. @@ -654,21 +686,27 @@ def apply_conversion(self, convert_fun, unit, u_mask=None): Examples -------- Build pandas DataFrame to use as input: - + >>> from pandas import DataFrame - >>> df = DataFrame({'CharacteristicName': ['Dissolved oxygen (DO)', 'Dissolved oxygen (DO)',], - ... 'ResultMeasure/MeasureUnitCode': ['mg/l', '%'], - ... 'ResultMeasureValue': ['1.0', '10.0',], - ... }) + >>> df = DataFrame( + ... { + ... 'CharacteristicName': [ + ... 'Dissolved oxygen (DO)', + ... 'Dissolved oxygen (DO)', + ... ], + ... 'ResultMeasure/MeasureUnitCode': ['mg/l', '%'], + ... 
'ResultMeasureValue': ['1.0', '10.0',], + ... } + ... ) >>> df CharacteristicName ResultMeasure/MeasureUnitCode ResultMeasureValue 0 Dissolved oxygen (DO) mg/l 1.0 1 Dissolved oxygen (DO) % 10.0 - + Build WQ Characteristic Data class from pandas DataFrame: - + >>> from harmonize_wq import wq_data - >>> wq = wq_data.WQCharData(df, 'Dissolved oxygen (DO)') + >>> wq = wq_data.WQCharData(df, 'Dissolved oxygen (DO)') >>> wq.apply_conversion(convert.DO_saturation, '%') >>> wq.df[['Units', 'DO']] Units DO @@ -682,13 +720,13 @@ def apply_conversion(self, convert_fun, unit, u_mask=None): unit = self.ureg.Quantity(unit) # Pint quantity object from unit old_vals = df_out.loc[u_mask, self.out_col] try: - new_quants = [convert_fun(x*unit) for x in old_vals] + new_quants = [convert_fun(x * unit) for x in old_vals] except ValueError: - #print(old_vals.iloc[0]*unit) + # print(old_vals.iloc[0]*unit) # string to avoid altered ureg issues - new_quants = [convert_fun(str(x*unit)) for x in old_vals] + new_quants = [convert_fun(str(x * unit)) for x in old_vals] # 1run=6505.62ms (may be slower) vs apply (5888.43ms) - #new_vals = old_vals.apply(lambda x: convert_fun(x*unit).magnitude) + # new_vals = old_vals.apply(lambda x: convert_fun(x*unit).magnitude) new_vals = [quant.magnitude for quant in new_quants] df_out.loc[u_mask, self.out_col] = new_vals df_out.loc[u_mask, self.col.unit_out] = str(new_quants[0].units) @@ -713,31 +751,30 @@ def dimensions_list(self, m_mask=None): Examples -------- Build pandas DataFrame to use as input: - + >>> from pandas import DataFrame >>> df = DataFrame({'CharacteristicName': ['Phosphorus', 'Phosphorus',], ... 'ResultMeasure/MeasureUnitCode': ['mg/l', 'mg/kg',], - ... 'ResultMeasureValue': ['1.0', '10',], + ... 'ResultMeasureValue': ['1.0', '10',], ... }) >>> df CharacteristicName ResultMeasure/MeasureUnitCode ResultMeasureValue 0 Phosphorus mg/l 1.0 1 Phosphorus mg/kg 10 - + Build WQ Characteristic Data class from pandas DataFrame: - + >>> from harmonize_wq import wq_data >>> wq = wq_data.WQCharData(df, 'Phosphorus') - + >>> wq.dimensions_list() ['mg/kg'] """ if m_mask is None: m_mask = self.measure_mask() - return units_dimension(self.df.loc[m_mask, - self.col.unit_out], - self.units, - self.ureg) + return units_dimension( + self.df.loc[m_mask, self.col.unit_out], self.units, self.ureg + ) def replace_unit_str(self, old, new, mask=None): """Replace ALL instances of old with in WQCharData.col.unit_out column. @@ -751,30 +788,33 @@ def replace_unit_str(self, old, new, mask=None): mask : pandas.Series, optional Conditional mask to limit rows. The default None, uses the c_mask attribute. - + Examples -------- Build pandas DataFrame to use as input: - + >>> from pandas import DataFrame - >>> df = DataFrame({'CharacteristicName': ['Temperature, water', 'Temperature, water',], - ... 'ResultMeasure/MeasureUnitCode': ['deg C', 'deg F',], - ... 'ResultMeasureValue': ['31', '87',], - ... }) + >>> df = DataFrame( + ... { + ... "CharacteristicName": ["Temperature, water", "Temperature, water",], + ... "ResultMeasure/MeasureUnitCode": ["deg C", "deg F",], + ... "ResultMeasureValue": ["31", "87",], + ... } + ... 
) >>> df CharacteristicName ResultMeasure/MeasureUnitCode ResultMeasureValue 0 Temperature, water deg C 31 1 Temperature, water deg F 87 - + Build WQ Characteristic Data class from pandas DataFrame: - + >>> from harmonize_wq import wq_data >>> wq = wq_data.WQCharData(df, 'Temperature, water') >>> wq.df[['ResultMeasure/MeasureUnitCode', 'Units', 'Temperature']] ResultMeasure/MeasureUnitCode Units Temperature 0 deg C deg C 31 1 deg F deg F 87 - + >>> wq.replace_unit_str(' ', '') >>> wq.df[['ResultMeasure/MeasureUnitCode', 'Units', 'Temperature']] ResultMeasure/MeasureUnitCode Units Temperature @@ -790,7 +830,7 @@ def replace_unit_str(self, old, new, mask=None): def replace_unit_by_dict(self, val_dict, mask=None): """Do multiple replace_in_col() replacements using val_dict. - + Replaces instances of val_dict key with val_dict value. Parameters @@ -800,27 +840,27 @@ def replace_unit_by_dict(self, val_dict, mask=None): mask : pandas.Series, optional Conditional mask to limit rows. The default None, uses the c_mask attribute. - + Returns ------- None. - + Examples -------- Build pandas DataFrame to use as input: - + >>> from pandas import DataFrame >>> df = DataFrame({'CharacteristicName': ['Fecal Coliform', 'Fecal Coliform',], ... 'ResultMeasure/MeasureUnitCode': ['#/100ml', 'MPN',], - ... 'ResultMeasureValue': ['1.0', '10',], + ... 'ResultMeasureValue': ['1.0', '10',], ... }) >>> df CharacteristicName ResultMeasure/MeasureUnitCode ResultMeasureValue 0 Fecal Coliform #/100ml 1.0 1 Fecal Coliform MPN 10 - + Build WQ Characteristic Data class from pandas DataFrame: - + >>> from harmonize_wq import wq_data >>> wq = wq_data.WQCharData(df, 'Fecal Coliform') >>> wq.df @@ -829,7 +869,7 @@ def replace_unit_by_dict(self, val_dict, mask=None): 1 Fecal Coliform MPN ... MPN 10.0 [2 rows x 5 columns] - + >>> wq.replace_unit_by_dict(domains.UNITS_REPLACE['Fecal_Coliform']) >>> wq.df CharacteristicName ResultMeasure/MeasureUnitCode ... Units Fecal_Coliform @@ -837,13 +877,18 @@ def replace_unit_by_dict(self, val_dict, mask=None): 1 Fecal Coliform MPN ... MPN/(100ml) 10.0 [2 rows x 5 columns] - """ + """ # noqa: E501 col = self.col.unit_out for item in val_dict.items(): self._replace_in_col(col, item[0], item[1], mask) - def fraction(self, frac_dict=None, catch_all=None, suffix=None, - fract_col='ResultSampleFractionText'): + def fraction( + self, + frac_dict=None, + catch_all=None, + suffix=None, + fract_col="ResultSampleFractionText", + ): """Create columns for sample fractions using frac_dict to set names. Parameters @@ -864,11 +909,11 @@ def fraction(self, frac_dict=None, catch_all=None, suffix=None, ------- frac_dict : dict frac_dict updated to include any fract_col not already defined. - + Examples -------- Build pandas DataFrame to use as input: - + >>> from pandas import DataFrame >>> df = DataFrame({'CharacteristicName': ['Phosphorus', 'Phosphorus',], ... 'ResultMeasure/MeasureUnitCode': ['mg/l', 'mg/kg',], @@ -878,17 +923,17 @@ def fraction(self, frac_dict=None, catch_all=None, suffix=None, >>> df CharacteristicName ... ResultSampleFractionText 0 Phosphorus ... Dissolved - 1 Phosphorus ... + 1 Phosphorus ... 
    [2 rows x 4 columns]
-
+
    Build WQ Characteristic Data class from pandas DataFrame:
-
+
    >>> from harmonize_wq import wq_data
    >>> wq = wq_data.WQCharData(df, 'Phosphorus')
-
+
    Go through required checks and conversions:
-
+
    >>> wq.check_units()
    >>> dimension_dict, mol_list = wq.dimension_fixes()
    >>> wq.replace_unit_by_dict(dimension_dict, wq.measure_mask())
@@ -903,7 +948,7 @@ def fraction(self, frac_dict=None, catch_all=None, suffix=None,
    0              1.0 milligram / liter
    1    10.000000000000002 milligram / liter
    Name: Phosphorus, dtype: object
-
+
    These results may have different, non-comparable sample fractions.
    First, split results using a provided frac_dict (as used in harmonize()):
@@ -921,9 +966,10 @@ def fraction(self, frac_dict=None, catch_all=None, suffix=None,
              TDP_Phosphorus                       Other_Phosphorus
    0  1.0 milligram / liter                                   NaN
    1                    NaN  10.000000000000002 milligram / liter
-
-    Alternatively, the sample fraction lists from tada can be used, in this case they are added:
-
+
+    Alternatively, the sample fraction lists from TADA can be used, in this case
+    they are added:
+
    >>> wq.fraction('TADA')
    >>> wq.df.columns
    Index(['CharacteristicName', 'ResultMeasure/MeasureUnitCode',
@@ -943,25 +989,25 @@ def fraction(self, frac_dict=None, catch_all=None, suffix=None,
        fracs = list(set(self.df[c_mask][fract_col]))  # List of fracs in data
-        if ' ' in fracs:
-            #TODO: new col instead of overwrite
+        if " " in fracs:
+            # TODO: new col instead of overwrite
            # Replace bad sample fraction w/ nan
-            self.df = self._replace_in_col(fract_col, ' ', nan, c_mask)
-            fracs.remove(' ')
+            self.df = self._replace_in_col(fract_col, " ", nan, c_mask)
+            fracs.remove(" ")
        df_out = self.df  # Set var for easier referencing
-        char = list(set(df_out[self.c_mask]['CharacteristicName']))[0]
+        char = list(set(df_out[self.c_mask]["CharacteristicName"]))[0]
        # Deal with lack of args
        if suffix is None:
            suffix = self.out_col
        if catch_all is None:
-            catch_all = f'Other_{suffix}'
+            catch_all = f"Other_{suffix}"
        # Set up dict for what sample fraction to what col
        if frac_dict is None:
            frac_dict = {}
-        elif frac_dict=='TADA':
+        elif frac_dict == "TADA":
            # Get dictionary for updates from TADA (note keys are all caps)
            tada = domains.harmonize_TADA_dict()[char.upper()]
            frac_dict = {}
@@ -970,40 +1016,40 @@ def fraction(self, frac_dict=None, catch_all=None, suffix=None,
                frac_dict[key] = list(tada[key])
                # Add their values
                frac_dict[key] += [x for v in tada[key].values() for x in v]
-        #else: dict was already provided
+        # else: dict was already provided
        if catch_all not in frac_dict.keys():
-            frac_dict[catch_all] = ['', nan]
+            frac_dict[catch_all] = ["", nan]
        # Make sure catch_all exists
        if not isinstance(frac_dict[catch_all], list):
            frac_dict[catch_all] = [frac_dict[catch_all]]
        # First cut to make the keys work as column names
        for key in frac_dict:
-            frac_dict[key.replace(',', '_')] = frac_dict.pop(key)
+            frac_dict[key.replace(",", "_")] = frac_dict.pop(key)
        for key in frac_dict:
            if key == self.out_col:
-                #TODO: prevent it from over-writing any col
+                # TODO: prevent it from over-writing any col
                # If it is the same col name as the out_col add '_1'
-                frac_dict[key+'_1'] = frac_dict.pop(key)
+                frac_dict[key + "_1"] = frac_dict.pop(key)
        # Compare sample fractions against expected
        init_fracs = [x for v in frac_dict.values() for x in v]
        not_init = [frac for frac in fracs if frac not in init_fracs]
-        if len(not_init)>0:
+        if len(not_init) > 0:
            # TODO: when to add QA_flag?
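            # [Editor's note] Illustrative sketch, not part of this patch, of
            # the frac_dict shape this block expects -- new column name mapped
            # to the ResultSampleFractionText values routed to it, e.g.:
            #     {"TDP_Phosphorus": ["Dissolved"], "Other_Phosphorus": ["", nan]}
            # Fractions found in the data but missing from frac_dict are handled
            # below: recognized domain values are mapped to catch_all, anything
            # else is warned about.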
- smp = f'{char} sample fractions not in frac_dict' + smp = f"{char} sample fractions not in frac_dict" solution = f'expected domains, mapped to "{catch_all}"' - print(f'{len(not_init)} {smp}') + print(f"{len(not_init)} {smp}") # Compare against domains - all_fracs = list(domains.get_domain_dict('ResultSampleFraction')) + all_fracs = list(domains.get_domain_dict("ResultSampleFraction")) add_fracs = [frac for frac in not_init if frac in all_fracs] # Add new fractions to frac_dict mapped to catch_all - if len(add_fracs)>0: - print(f'{len(add_fracs)} {smp} found in {solution}') + if len(add_fracs) > 0: + print(f"{len(add_fracs)} {smp} found in {solution}") frac_dict[catch_all] += add_fracs bad_fracs = [frac for frac in not_init if frac not in all_fracs] - if len(bad_fracs)>0: - warn(f'{len(bad_fracs)} {smp} or {solution}') + if len(bad_fracs) > 0: + warn(f"{len(bad_fracs)} {smp} or {solution}") frac_dict[catch_all] += bad_fracs # Loop through dictionary making updates based on sample fraction @@ -1019,7 +1065,7 @@ def fraction(self, frac_dict=None, catch_all=None, suffix=None, def dimension_fixes(self): """ Input/output for dimension handling. - + Result dictionary key is old_unit and value is equation to get it into the desired dimension. Result list has substance to include as part of unit. @@ -1039,11 +1085,11 @@ def dimension_fixes(self): Examples -------- Build pandas DataFrame to use as input: - + >>> from pandas import DataFrame >>> df = DataFrame({'CharacteristicName': ['Phosphorus', 'Phosphorus',], ... 'ResultMeasure/MeasureUnitCode': ['mg/l', 'mg/kg',], - ... 'ResultMeasureValue': ['1.0', '10',], + ... 'ResultMeasureValue': ['1.0', '10',], ... }) >>> df CharacteristicName ResultMeasure/MeasureUnitCode ResultMeasureValue @@ -1051,10 +1097,10 @@ def dimension_fixes(self): 1 Phosphorus mg/kg 10 Build WQ Characteristic Data class from pandas DataFrame: - + >>> from harmonize_wq import wq_data >>> wq = wq_data.WQCharData(df, 'Phosphorus') - + >>> wq.dimension_fixes() ({'mg/kg': 'mg/kg * H2O'}, []) """ @@ -1062,29 +1108,28 @@ def dimension_fixes(self): mol_list = [] # Empty list to append to # If converting to/from moles has extra steps - if self.ureg(self.units).check({'[substance]': 1}): + if self.ureg(self.units).check({"[substance]": 1}): # Convert everything to MOLES!!! 
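            # [Editor's note] Illustrative arithmetic, not part of this patch:
            # a moles-to-mass conversion multiplies by a speciation-specific
            # molar mass. In the moles_convert doctest below, 1 umol maps to
            # ~0.00018016 gram, i.e. a molar mass of ~180.16 g/mol, so
            # 0.265 umol -> 0.265 * 0.00018016 g ~= 0.000048 g.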
# Must consider the different speciation for each - #TODO: This could be problematic given umol/l - warn('This feature is not available yet') + # TODO: This could be problematic given umol/l + warn("This feature is not available yet") return {}, [] for unit in self.dimensions_list(): - if self.ureg(unit).check({'[substance]': 1}): - mol_params = {'ureg': self.ureg, - 'Q_': self.ureg.Quantity(1, unit),} + if self.ureg(unit).check({"[substance]": 1}): + mol_params = { + "ureg": self.ureg, + "Q_": self.ureg.Quantity(1, unit), + } # Moles need to be further split by basis basis_lst = list(set(self.df.loc[self.c_mask, self.col.basis])) for speciation in basis_lst: - mol_params['basis'] = speciation + mol_params["basis"] = speciation quant = str(moles_to_mass(**mol_params)) - dim_tup = self._dimension_handling(unit, - quant, - self.ureg) + dim_tup = self._dimension_handling(unit, quant, self.ureg) dimension_dict.update(dim_tup[0]) - mol_list+= dim_tup[1] + mol_list += dim_tup[1] else: - dim_tup = self._dimension_handling(unit, - ureg = self.ureg) + dim_tup = self._dimension_handling(unit, ureg=self.ureg) dimension_dict.update(dim_tup[0]) return dimension_dict, mol_list @@ -1095,15 +1140,15 @@ def moles_convert(self, mol_list): ---------- mol_list : list List of Mole (substance) units. - + Returns ------- None. - + Examples -------- Build pandas DataFrame to use as input: - + >>> from pandas import DataFrame >>> from numpy import nan >>> df = DataFrame({'CharacteristicName': ['Organic carbon', 'Organic carbon',], @@ -1115,9 +1160,9 @@ def moles_convert(self, mol_list): ResultMeasure/MeasureUnitCode ResultMeasureValue 0 mg/l 1.0 1 umol 0.265 - + Build WQ Characteristic Data class from pandas DataFrame: - + >>> from harmonize_wq import wq_data >>> wq = wq_data.WQCharData(df, 'Organic carbon') >>> wq.df @@ -1126,12 +1171,12 @@ def moles_convert(self, mol_list): 1 Organic carbon umol ... umol 0.265 [2 rows x 6 columns] - + Run required checks: - + >>> wq.check_basis() >>> wq.check_units() - + Assemble dimensions dict and moles list: >>> dimension_dict, mol_list = wq.dimension_fixes() @@ -1139,25 +1184,25 @@ def moles_convert(self, mol_list): {'umol': '0.00018015999999999998 gram / l'} >>> mol_list ['0.00018015999999999998 gram / l'] - + Replace units by dimension_dict: - + >>> wq.replace_unit_by_dict(dimension_dict, wq.measure_mask()) >>> wq.df[['Units', 'Carbon']] Units Carbon 0 mg/l 1.000 - 1 0.00018015999999999998 gram / l 0.265 - + 1 0.00018015999999999998 gram / l 0.265 + Convert Carbon measure into whole units: - + >>> wq.moles_convert(mol_list) >>> wq.df[['Units', 'Carbon']] Units Carbon 0 mg/l 1.000000 1 gram / liter 0.000048 - + This allows final conversion without dimensionality issues: - + >>> wq.convert_units() >>> wq.df['Carbon'] 0 1.0 milligram / liter diff --git a/harmonize_wq/wrangle.py b/harmonize_wq/wrangle.py index 799308f..7ef534f 100644 --- a/harmonize_wq/wrangle.py +++ b/harmonize_wq/wrangle.py @@ -1,15 +1,17 @@ # -*- coding: utf-8 -*- """Functions to help re-shape the WQP pandas DataFrame.""" -import pandas + import geopandas -from harmonize_wq import domains -from harmonize_wq.clean import datetime, harmonize_depth, df_checks +import pandas from dataretrieval import wqp +from harmonize_wq import domains +from harmonize_wq.clean import datetime, df_checks, harmonize_depth + def split_table(df_in): """Split DataFrame columns axis into main and characteristic based. 
- + Splits :class:`pandas.DataFrame` in two, one with main results columns and one with Characteristic based metadata. @@ -32,19 +34,19 @@ def split_table(df_in): Examples -------- - See any of the 'Simple' notebooks found in + See any of the 'Simple' notebooks found in `demos `_ for examples of how this function is used to divide the table into columns of interest (main_df) and characteristic specific metadata (chars_df). - + """ # Run datetime on activity fields if not already done - if 'Activity_datetime' not in list(df_in.columns): + if "Activity_datetime" not in list(df_in.columns): df_out = datetime(df_in) else: df_out = df_in.copy() # Run depth if not already done - if 'Depth' not in list(df_in.columns): + if "Depth" not in list(df_in.columns): df_out = harmonize_depth(df_out) chars_cols = domains.characteristic_cols() # Characteristic columns list @@ -54,12 +56,12 @@ def split_table(df_in): return main_df, chars_df -def split_col(df_in, result_col='QA_flag', col_prefix='QA'): +def split_col(df_in, result_col="QA_flag", col_prefix="QA"): """Move each row value from a column to a characteristic specific column. Values are moved from the result_col in df_in to a new column where the column name is col_prefix + characteristic. - + Parameters ---------- df_in : pandas.DataFrame @@ -76,29 +78,29 @@ def split_col(df_in, result_col='QA_flag', col_prefix='QA'): Examples -------- - See any of the 'Simple' notebooks found in + See any of the 'Simple' notebooks found in `demos `_ for examples of how this function is used to split the QA column into multiple characteristic specific QA columns. - + """ # TODO: is this function doing too much? df_out = df_in.copy() - char_list = list(set(df_out['CharacteristicName'])) + char_list = list(set(df_out["CharacteristicName"])) # TODO: try/catch on key error col_list = [domains.out_col_lookup[char_name] for char_name in char_list] # TODO: generalize to multi-characteristics other than phosphorus - char = 'Phosphorus' + char = "Phosphorus" if char in char_list: i = char_list.index(char) - suffix = '_' + domains.out_col_lookup[char] + suffix = "_" + domains.out_col_lookup[char] col_list[i] = [col for col in df_out.columns if col.endswith(suffix)] # Drop rows where result na for i, char in enumerate(char_list): - mask = (df_out['CharacteristicName'] == char) + mask = df_out["CharacteristicName"] == char if isinstance(col_list[i], list): # All columns with that suffix must be nan for col in col_list[i]: @@ -113,12 +115,12 @@ def split_col(df_in, result_col='QA_flag', col_prefix='QA'): # Currently written to drop NaN qa flags, to keep them filter on char if isinstance(out_col, list): for col_out in out_col: - new_col = col_prefix + '_' + col_out + new_col = col_prefix + "_" + col_out mask = df_out[col_out].notna() df_out.loc[mask, new_col] = df_out.loc[mask, result_col] else: mask = df_out[out_col].notna() - new_col = col_prefix + '_' + out_col + new_col = col_prefix + "_" + out_col # Create characteristic specific QA field df_out.loc[mask, new_col] = df_out.loc[mask, result_col] @@ -129,17 +131,17 @@ def split_col(df_in, result_col='QA_flag', col_prefix='QA'): # def split_unit(series): - # If results are being written to another format that does not support - # pint objects the units must be recorded. If in the standard ureg it seems - # to write them as string, otherwise it errors. Ideally we'd either - # transfer the units to within the column name or in a seperate column (not - # preffered, only is multiple units). 
+# If results are being written to another format that does not support
+# pint objects, the units must be recorded. If the units are in the standard
+# ureg they seem to be written as strings; otherwise it errors. Ideally we'd
+# either transfer the units into the column name or into a separate column
+# (not preferred, only if multiple units).
 # return series


 def collapse_results(df_in, cols=None):
     """Group rows/results that seem like the same sample.
-    
+
     Default columns are organization, activity, location, and datetime.

     Parameters
@@ -156,11 +158,11 @@ def collapse_results(df_in, cols=None):

     Examples
     --------
-    See any of the 'Simple' notebooks found in 
+    See any of the 'Simple' notebooks found in
     `demos `_ for examples of how
     this function is used to combine rows with the same sample organization,
     activity, location, and datetime.
-    
+
     """
     df = df_in.copy()

@@ -169,16 +171,19 @@ def collapse_results(df_in, cols=None):
     # TODO: use date instead of datetime if na? (date_idx)

     if not cols:
-        cols = ['MonitoringLocationIdentifier',
-                'Activity_datetime',
-                'ActivityIdentifier',
-                'OrganizationIdentifier']
+        cols = [
+            "MonitoringLocationIdentifier",
+            "Activity_datetime",
+            "ActivityIdentifier",
+            "OrganizationIdentifier",
+        ]

     df_indexed = df.groupby(by=cols, dropna=False).first()
     # TODO: warn about multi-lines with values (only returns first)
     problems = df.groupby(by=cols, dropna=False).first(min_count=2)
-    problems = problems.dropna(axis=1, how='all')
+    problems = problems.dropna(axis=1, how="all")
     return df_indexed

+
 # def combine_results(df_in):
 #     """
 #     NOT IN USE
@@ -241,8 +246,8 @@ def collapse_results(df_in, cols=None):

 def get_activities_by_loc(characteristic_names, locations):
     """Segment batch what_activities.
-    
-    Warning this is not fully implemented and may not stay. Retrieves in batch
+
+    Warning: this is not fully implemented and may not stay. Retrieves in batch
     using :func:`dataretrieval.what_activities`.

     Parameters
@@ -256,7 +261,7 @@ def get_activities_by_loc(characteristic_names, locations):
     -------
     activities : pandas.DataFrame
         Combined activities for locations.
-    
+
     Examples
     --------
     See :func:`wrangle.add_activities_to_df`
     """
     # Split loc_list as query by list may cause the query url to be too long
     seg = 200  # Max length of each segment
     activities_list, md_list = [], []
-    for loc_que in [locations[x:x+seg] for x in range(0, len(locations), seg)]:
-        query = {'characteristicName': characteristic_names,
-                 'siteid': loc_que}
+    for loc_que in [locations[x : x + seg] for x in range(0, len(locations), seg)]:
+        query = {"characteristicName": characteristic_names, "siteid": loc_que}
         res = wqp.what_activities(**query)
         activities_list.append(res[0])  # Query response DataFrame
         md_list.append(res[1])  # Query response metadata
@@ -294,26 +298,26 @@ def add_activities_to_df(df_in, mask=None):

     Examples
     --------
     Build example df_in table from harmonize_wq tests to use in place of Water
-    Quality Portal query response, this table has 'Temperature, water' and 
+    Quality Portal query response; this table has 'Temperature, water' and
     'Phosphorus' results:
-    
+
     >>> import pandas
     >>> tests_url = 'https://raw.githubusercontent.com/USEPA/harmonize-wq/main/harmonize_wq/tests'
     >>> df1 = pandas.read_csv(tests_url + '/data/wqp_results.txt')
     >>> df1.shape
     (359505, 35)
-    
+
     Run on the first 1000 results:

     >>> df2 = df1[:1000]
-    
+
     >>> from harmonize_wq import wrangle
     >>> df_activities = wrangle.add_activities_to_df(df2)
     >>> df_activities.shape
     (1000, 100)
-    
+
     Look at the columns added:
-    
+
     >>> df_activities.columns[-65:]
     Index(['ActivityTypeCode', 'ActivityMediaName',
            'ActivityMediaSubdivisionName', 'ActivityEndDate', 'ActivityEndTime/Time',
@@ -363,16 +367,16 @@ def add_activities_to_df(df_in, mask=None):
     """
     df_out = df_in.copy()
     # Check df for loc_field
-    loc_col = 'MonitoringLocationIdentifier'
+    loc_col = "MonitoringLocationIdentifier"
     df_checks(df_out, [loc_col])
     # List of unique sites and characteristicNames
     if mask:
         loc_list = list(set(df_out.loc[mask, loc_col].dropna()))
-        char_vals = list(set(df_out.loc[mask, 'CharacteristicName'].dropna()))
+        char_vals = list(set(df_out.loc[mask, "CharacteristicName"].dropna()))
     else:
         # Get all
         loc_list = list(set(df_out[loc_col].dropna()))
-        char_vals = list(set(df_out['CharacteristicName'].dropna()))
+        char_vals = list(set(df_out["CharacteristicName"].dropna()))
     # Get results
     act_df = get_activities_by_loc(char_vals, loc_list)
     # Merge results
@@ -399,31 +403,31 @@ def add_detection(df_in, char_val):

     Examples
     --------
     Build example df_in table from harmonize_wq tests to use in place of Water
-    Quality Portal query response, this table has 'Temperature, water' and 
+    Quality Portal query response; this table has 'Temperature, water' and
     'Phosphorus' results:
-    
+
     >>> import pandas
     >>> tests_url = 'https://raw.githubusercontent.com/USEPA/harmonize-wq/main/harmonize_wq/tests'
     >>> df1 = pandas.read_csv(tests_url + '/data/wqp_results.txt')
     >>> df1.shape
     (359505, 35)
-    
+
     Run on 1000 results to speed it up:

     >>> df2 = df1[19000:20000]
     >>> df2.shape
     (1000, 35)
-    
+
     >>> from harmonize_wq import wrangle
     >>> df_detects = wrangle.add_detection(df2, 'Phosphorus')
     >>> df_detects.shape
     (1001, 38)
-    
-    Note: the additional rows are due to one result being able to be assigned
+
+    Note: the additional rows occur because one result can be assigned
     multiple detection results.
This is not the case for, e.g., df1[:1000].
-    
+
     Look at the columns added:
-    
+
     >>> df_detects.columns[-3:]
     Index(['DetectionQuantitationLimitTypeName',
            'DetectionQuantitationLimitMeasure/MeasureValue',
@@ -432,22 +436,22 @@ def add_detection(df_in, char_val):
     """
     df_out = df_in.copy()
     # Check df for loc_field
-    loc_col = 'MonitoringLocationIdentifier'
-    res_id = 'ResultIdentifier'
+    loc_col = "MonitoringLocationIdentifier"
+    res_id = "ResultIdentifier"
     df_checks(df_out, [loc_col, res_id])
-    c_mask = df_out['CharacteristicName'] == char_val  # Mask to limit rows
+    c_mask = df_out["CharacteristicName"] == char_val  # Mask to limit rows
     loc_series = df_out.loc[c_mask, loc_col]  # Location Series
     res_series = df_out.loc[c_mask, res_id]  # ResultIdentifier Series
     # Get results
     detect_df = get_detection_by_loc(loc_series, res_series, char_val)
     # Merge results to table
-    df_merged = merge_tables(df_out, detect_df, merge_cols='all')
+    df_merged = merge_tables(df_out, detect_df, merge_cols="all")
     return df_merged


 def get_detection_by_loc(loc_series, result_id_series, char_val=None):
     """Get detection quantitation by location and characteristic (optional).
-    
+
     Retrieves detection quantitation results by location and characteristic
     name (optional). ResultIdentifier cannot be used to search. Instead
     location id from loc_series is used and then results are limited by
@@ -483,23 +487,23 @@ def get_detection_by_loc(loc_series, result_id_series, char_val=None):
     # Split list - query by full list may cause the query url to be too long
     seg = 200  # Max length of each segment
     detection_list, md_list = [], []
-    for id_que in [id_list[x:x+seg] for x in range(0, len(id_list), seg)]:
-        query = {'siteid': id_que}
+    for id_que in [id_list[x : x + seg] for x in range(0, len(id_list), seg)]:
+        query = {"siteid": id_que}
         if char_val:
-            query['characteristicName'] = char_val
+            query["characteristicName"] = char_val
         res = wqp.what_detection_limits(**query)
         detection_list.append(res[0])  # Query response DataFrame
         md_list.append(res[1])  # Query response metadata
     # Combine the dataframe results in the list
     detection_df = pandas.concat(detection_list).drop_duplicates()
     # Filter on resultID
-    df_out = detection_df[detection_df['ResultIdentifier'].isin(result_idx)]
+    df_out = detection_df[detection_df["ResultIdentifier"].isin(result_idx)]
     return df_out


-def merge_tables(df1, df2, df2_cols='all', merge_cols='activity'):
+def merge_tables(df1, df2, df2_cols="all", merge_cols="activity"):
     """Merge df1 and df2.
-    
+
     Merge tables (df1 and df2), adding df2_cols to df1 where merge_cols match.

     Parameters
@@ -524,17 +528,17 @@ def merge_tables(df1, df2, df2_cols='all', merge_cols='activity'):
     --------
     Build example table from harmonize_wq tests to use in place of Water
     Quality Portal query responses:
-    
+
     >>> import pandas
     >>> tests_url = 'https://raw.githubusercontent.com/USEPA/harmonize-wq/main/harmonize_wq/tests'
     >>> df1 = pandas.read_csv(tests_url + '/data/wqp_results.txt')
     >>> df1.shape
     (359505, 35)
-    
+
     >>> df2 = pandas.read_csv(tests_url + '/data/wqp_activities.txt')
     >>> df2.shape
     (353911, 40)
-    
+
     >>> from harmonize_wq import wrangle
     >>> merged = wrangle.merge_tables(df1, df2)
     >>> merged.shape
@@ -543,17 +547,18 @@ def merge_tables(df1, df2, df2_cols='all', merge_cols='activity'):
     # TODO: change merge_cols default to all?
     col2_list = list(df2.columns)
-    test = merge_cols == 'activity'  # Special activity test = true/false
+    test = merge_cols == "activity"  # Remember special 'activity' merge (checked after merge)

-    if merge_cols == 'activity':
+    if merge_cols == "activity":
         # ActivityIdentifiers are non-unique. More cols for one-to-one match.
-        merge_cols = ['ActivityIdentifier',
-                      'ActivityStartDate',
-                      'ActivityStartTime/Time',
-                      'ActivityStartTime/TimeZoneCode',
-                      'MonitoringLocationIdentifier',
-                      ]
-    elif merge_cols == 'all':
+        merge_cols = [
+            "ActivityIdentifier",
+            "ActivityStartDate",
+            "ActivityStartTime/Time",
+            "ActivityStartTime/TimeZoneCode",
+            "MonitoringLocationIdentifier",
+        ]
+    elif merge_cols == "all":
         # Use ALL shared columns. For activity this is +=
         # 'OrganizationIdentifier', 'OrganizationFormalName', 'ProviderName'
         merge_cols = [x for x in list(df1.columns) if x in col2_list]
@@ -561,21 +566,21 @@ def merge_tables(df1, df2, df2_cols='all', merge_cols='activity'):
     # Check columns in both tables
     shared = [x for x in list(df1.columns) if x in col2_list]
     for col in merge_cols:
-        assert col in shared, f'{col} not in both DataFrames'
+        assert col in shared, f"{col} not in both DataFrames"

     # Columns to add from df2
-    if df2_cols == 'all':
+    if df2_cols == "all":
         # All columns not in df1
         df2_cols = [x for x in col2_list if x not in list(df1.columns)]
     else:
         for col in df2_cols:
-            assert col in col2_list, f'{col} not in DataFrame'
+            assert col in col2_list, f"{col} not in DataFrame"

     # Narrow df2 to the merge and designated columns
     df2 = df2[merge_cols + df2_cols]  # Limit df2 to columns we want
     df2 = df2.drop_duplicates()  # Reduces many to one joins

     # Merge df2 columns onto df1 results
-    merged_results = pandas.merge(df1, df2, how='left', on=merge_cols)
+    merged_results = pandas.merge(df1, df2, how="left", on=merge_cols)
     if test:
         # Many df2 to one df1 gets multiple rows, test for extra activities
         # TODO: Throw more descriptive error?
@@ -596,12 +601,12 @@ def as_gdf(shp):
     -------
     shp : geopandas.GeoDataFrame
         GeoDataFrame for shp if it isn't already a GeoDataFrame.
-    
+
     Examples
     --------
     Use area of interest GeoJSON for Pensacola and Perdido Bays, FL from
     harmonize_wq tests:
-    
+
     >>> from harmonize_wq import wrangle
     >>> aoi_url = r'https://raw.githubusercontent.com/USEPA/harmonize-wq/main/harmonize_wq/tests/data/PPBays_NCCA.geojson'
     >>> type(wrangle.as_gdf(aoi_url))
@@ -626,7 +631,7 @@ def get_bounding_box(shp, idx=None):

     Returns
     -------
     Coordinates for bounding box as a string, separated by ', '.
-    
+
     Examples
     --------
     Use area of interest GeoJSON for Pensacola and Perdido Bays, FL from
@@ -642,21 +647,21 @@ def get_bounding_box(shp, idx=None):
     if idx is None:
         bbox = shp.total_bounds
     else:
-        xmin = shp.bounds['minx'][idx]
-        xmax = shp.bounds['maxx'][idx]
-        ymin = shp.bounds['miny'][idx]
-        ymax = shp.bounds['maxy'][idx]
+        xmin = shp.bounds["minx"][idx]
+        xmax = shp.bounds["maxx"][idx]
+        ymin = shp.bounds["miny"][idx]
+        ymax = shp.bounds["maxy"][idx]
         bbox = [xmin, ymin, xmax, ymax]

-    return ','.join(map(str, bbox))
+    return ",".join(map(str, bbox))


 def clip_stations(stations, aoi):
     """Clip stations to area of interest (aoi).
-    
+
     Locations and results are queried by extent rather than the exact
     geometry. Clipping by the exact geometry helps reduce the size of the
     results.
-    
+
     Notes
     -----
     aoi is first transformed to CRS of stations.

     Parameters
@@ -672,11 +677,11 @@ def clip_stations(stations, aoi):
     -------
     pandas.DataFrame
         stations_gdf points clipped to the aoi_gdf.
-    
+
     Examples
     --------
     Build example geopandas GeoDataFrame of locations for stations:
-    
+
     >>> import geopandas
     >>> from shapely.geometry import Point
     >>> from numpy import nan
@@ -688,12 +693,12 @@ def clip_stations(stations, aoi):
       MonitoringLocationIdentifier                    geometry
     0                           In  POINT (-87.12500 30.50000)
     1                          Out  POINT (-87.50000 30.50000)
-    
+
     Use area of interest GeoJSON for Pensacola and Perdido Bays, FL from
     harmonize_wq tests:

     >>> aoi_url = r'https://raw.githubusercontent.com/USEPA/harmonize-wq/main/harmonize_wq/tests/data/PPBays_NCCA.geojson'
-    
+
     >>> stations_in_aoi = harmonize_wq.wrangle.clip_stations(stations_gdf, aoi_url)
     >>> stations_in_aoi
       MonitoringLocationIdentifier                    geometry
@@ -708,7 +713,7 @@ def clip_stations(stations, aoi):

 def to_simple_shape(gdf, out_shp):
     """Simplify GeoDataFrame for better export to shapefile.
-    
+
     Adopts and adapts 'Simple' from
     `NWQMC/pywqp `_.
     See :func:`domains.stations_rename` for renaming of columns.
@@ -723,7 +728,7 @@ def to_simple_shape(gdf, out_shp):
     Examples
     --------
     Build example geopandas GeoDataFrame of locations for stations:
-    
+
     >>> import geopandas
     >>> from shapely.geometry import Point
     >>> from numpy import nan
@@ -735,9 +740,9 @@ def to_simple_shape(gdf, out_shp):
       MonitoringLocationIdentifier                    geometry
     0                           In  POINT (-87.12500 30.50000)
     1                          Out  POINT (-87.50000 30.50000)
-    
+
     Add datetime column:
-    
+
     >>> gdf['ActivityStartDate'] = ['2004-09-01', '2004-02-18']
     >>> gdf['ActivityStartTime/Time'] = ['10:01:00', '15:39:00']
     >>> gdf['ActivityStartTime/TimeZoneCode'] = ['EST', 'EST']
@@ -749,7 +754,7 @@ def to_simple_shape(gdf, out_shp):
     1                          Out  ... 2004-02-18 20:39:00+00:00

     [2 rows x 6 columns]
-    
+
     >>> from harmonize_wq import wrangle
     >>> wrangle.to_simple_shape(gdf, 'dataframe.shp')
     """
@@ -766,13 +771,15 @@ def to_simple_shape(gdf, out_shp):

     # Results columns need to be str not pint (.astype(str))
     # Narrow based on out_col lookup dictionary
-    results_cols = [col for col in possible_results if col in domains.out_col_lookup.values()]
+    results_cols = [
+        col for col in possible_results if col in domains.out_col_lookup.values()
+    ]
     # TODO: check based on suffix: e.g. Phosphorus
     # Rename each column w/ units and write results as str
     for col in results_cols:
         gdf[col] = gdf[col].astype(str)
     # Drop datetime
-    gdf = gdf.drop(columns=['Activity_datetime'])
+    gdf = gdf.drop(columns=["Activity_datetime"])
     # date yyyy-mm-dd (shp)
     # schema = geopandas.io.file.infer_schema(gdf)
     # schema['properties']['StartDate'] = 'date'
diff --git a/pyproject.toml b/pyproject.toml
index 3c603ad..256faee 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,3 +33,14 @@ dependencies = { file = ["requirements.txt"] }

 [tool.setuptools.dynamic.optional-dependencies]
 dev = { file = ["requirements-dev.txt"] }
+
+[tool.ruff.lint]
+select = [
+    "E",
+    "F",
+    "W",
+    "I"
+]
+
+[tool.ruff.lint.isort]
+known-first-party = ["harmonize_wq"]
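
For reference, a minimal sketch of the import layout these settings enforce: the "I" rule applies isort grouping, and known-first-party keeps harmonize_wq in its own group after third-party packages. The lines below mirror the wrangle.py hunk above; note that within a group, plain import statements sort before from-imports:

    # -*- coding: utf-8 -*-
    """Functions to help re-shape the WQP pandas DataFrame."""

    # Third-party packages, alphabetized within the group
    import geopandas
    import pandas
    from dataretrieval import wqp

    # First-party modules last, per known-first-party = ["harmonize_wq"]
    from harmonize_wq import domains
    from harmonize_wq.clean import datetime, df_checks, harmonize_depth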