Skip to content

Commit

Permalink
Merge pull request #178 from MLMI2-CSSI/dev
Browse files Browse the repository at this point in the history
add f.run() functionality
add compatibility with Globus SDK 3
add DOI visibility to dataset summary
  • Loading branch information
ascourtas authored May 11, 2022
2 parents bc268cb + 9df5519 commit cddad8c
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 97 deletions.
44 changes: 0 additions & 44 deletions .github/workflows/python-package.yml

This file was deleted.

26 changes: 20 additions & 6 deletions foundry/foundry.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class Foundry(FoundryMetadata):
xtract_tokens: Any

def __init__(
self, no_browser=False, no_local_server=False, index="mdf-test", authorizers=None, **data
self, no_browser=False, no_local_server=False, index="mdf", authorizers=None, **data
):
"""Initialize a Foundry client
Args:
Expand All @@ -73,8 +73,7 @@ def __init__(
if authorizers:
auths = authorizers
else:
auths = mdf_toolbox.login(
services=[
services = [
"data_mdf",
"mdf_connect",
"search",
Expand All @@ -84,12 +83,24 @@ def __init__(
"funcx",
"openid",
"https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all",
],
]
auths = mdf_toolbox.login(
services=services,
app_name="Foundry",
make_clients=True,
no_browser=no_browser,
no_local_server=no_local_server,
)
# request Search as an authorizer and not a client
search_auth = mdf_toolbox.login(
services=['search'],
app_name="Foundry",
make_clients=False,
no_browser=no_browser,
no_local_server=no_local_server,
)
# add special SearchAuthorizer object
auths['search_authorizer'] = search_auth['search']

self.forge_client = Forge(
index=index,
Expand Down Expand Up @@ -203,7 +214,7 @@ def list(self):
Returns
-------
(pandas.DataFrame): DataFrame with summary list of Foundry data packages including name, title, and publication year
(pandas.DataFrame): DataFrame with summary list of Foundry data packages including name, title, publication year, and DOI
"""
res = (
self.forge_client.match_field(
Expand All @@ -212,12 +223,14 @@ def list(self):
.search()
)


return pd.DataFrame(
[
{
"source_id": r["mdf"]["source_id"],
"name": r["dc"]["titles"][0]["title"],
"year": r["dc"].get("publicationYear", None),
"DOI": r["dc"]["identifier"]["identifier"],
}
for r in res
]
Expand Down Expand Up @@ -322,8 +335,9 @@ def _repr_html_(self) -> str:
authors = [creator['creatorName']
for creator in self.dc['creators']]
authors = '; '.join(authors)
DOI = "DOI: " + self.dc['identifier']['identifier']

buf = f'<h2>{title}</h2>{authors}'
buf = f'<h2>{title}</h2>{authors}<p>{DOI}</p>'

buf = f'{buf}<h3>Dataset</h3>{convert(json.loads(self.dataset.json(exclude={"dataframe"})))}'
# buf = f'{buf}<h3>MDF</h3>{convert(self.mdf)}'
Expand Down
13 changes: 7 additions & 6 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
globus-sdk>=1.2.1,<=2.0.3
dlhub_sdk==0.10.0
globus-sdk>=3,<4
dlhub_sdk>=0.10.2
requests>=2.18.4
tqdm>=4.19.4
six>=1.11.0
h5py>=2.10.0
pandas==1.2.2
numpy>=1.15.4
pandas>=0.23.4
pydantic>=1.6.1
joblib>=0.16.0
mdf_forge>=0.7.6
mdf-connect-client>=0.3.8
joblib>=1.1.0
mdf_forge>=0.8.0
mdf-connect-client>=0.4.0
json2table>=1.1.5
10 changes: 5 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,21 @@
# TODO: change dependencies to be looser
setuptools.setup(
name="foundry_ml",
version="0.1.2",
version="0.2.0",
author="Aristana Scourtas, KJ Schmidt, Imogen Foster, Ribhav Bose, Zoa Katok, Ethan Truelove, Ian Foster, Ben Blaiszik",
author_email="[email protected]",
packages=setuptools.find_packages(),
description="Package to support simplified application of machine learning models to datasets in materials science",
long_description=long_description,
long_description_content_type="text/markdown",
install_requires=[
"mdf_forge>=0.7.6",
"globus-sdk>=1.2.1,<=2.0.3",
"dlhub_sdk==0.10.0",
"mdf_forge>=0.8.0",
"globus-sdk>=3,<4",
"dlhub_sdk>=0.10.2",
"numpy>=1.15.4",
"pandas>=0.23.4",
"pydantic>=1.6.1",
"mdf_connect_client>=0.3.8",
"mdf_connect_client>=0.4.0",
"h5py>=2.10.0",
"joblib>=1.1.0",
"json2table"
Expand Down
72 changes: 41 additions & 31 deletions tests/local_only_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,30 @@
from dlhub_sdk import DLHubClient
from mdf_connect_client import MDFConnectClient



#updated test dataset
test_dataset = "_test_foundry_iris_dev_v2.1"
expected_title = "Iris Dataset"


#Kept the Old metadata format in case we ever want to refer back
# Globus Auth scopes required by the Foundry stack; this list mirrors the one
# used in Foundry.__init__ so tests authenticate with identical permissions.
services = [
    "data_mdf",
    "mdf_connect",
    "search",
    "dlhub",
    "petrel",
    "transfer",
    "openid",
    "https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all",]

# Log in once at module import so every test shares the same client set.
auths = mdf_toolbox.login(services=services, make_clients=True)
# Request Search as an authorizer and not a client, then attach it under the
# special 'search_authorizer' key that the Foundry constructor expects.
search_auth = mdf_toolbox.login(services=['search'], make_clients=False)
auths['search_authorizer'] = search_auth['search']

# updated test dataset for publication
pub_test_dataset = "_test_foundry_iris_dev_v2.1"
pub_expected_title = "Iris Dataset"

# test dataset for all other tests
test_dataset = "foundry_experimental_band_gaps_v1.1"
expected_title = "Graph Network Based Deep Learning of Band Gaps - Experimental Band Gaps"


# Kept the Old metadata format in case we ever want to refer back
old_test_metadata = {
"inputs": ["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"],
"input_descriptions": ["sepal length in unit(cm)", "sepal width in unit(cm)", "petal length in unit(cm)",
Expand All @@ -31,8 +47,7 @@
"package_type": "tabular"
}


test_metadata = {
pub_test_metadata = {
"keys":[
{
"key": ["sepal length (cm)"],
Expand Down Expand Up @@ -89,40 +104,37 @@
'domain': ['materials science', 'chemistry'],
'n_items': 1000
}
# Globus endpoint for '_iris_dev'
test_data_source = "https://app.globus.org/file-manager?origin_id=e38ee745-6d04-11e5-ba46-22000b92c6ec&origin_path=%2Ffoundry-test%2Firis-dev%2F"

# Globus endpoint for '_iris_dev' for test publication
pub_test_data_source = "https://app.globus.org/file-manager?origin_id=e38ee745-6d04-11e5-ba46-22000b92c6ec&origin_path=%2Ffoundry-test%2Firis-dev%2F"

#Quick function to delete any downloaded test data

# Quick function to delete any downloaded test data
def _delete_test_data(foundry_obj):
path = os.path.join(foundry_obj.config.local_cache_dir, test_dataset)
if os.path.isdir(path):
shutil.rmtree(path)


def test_foundry_init():
    """Smoke-test client construction with pre-supplied authorizers.

    Every no_browser/no_local_server combination should yield working DLHub,
    Forge, and MDF Connect sub-clients.
    """
    f = Foundry(authorizers=auths)
    assert isinstance(f.dlhub_client, DLHubClient)
    assert isinstance(f.forge_client, Forge)
    assert isinstance(f.connect_client, MDFConnectClient)

    f2 = Foundry(authorizers=auths, no_browser=False, no_local_server=True)
    assert isinstance(f2.dlhub_client, DLHubClient)
    assert isinstance(f2.forge_client, Forge)
    assert isinstance(f2.connect_client, MDFConnectClient)

    f3 = Foundry(authorizers=auths, no_browser=True, no_local_server=False)
    assert isinstance(f3.dlhub_client, DLHubClient)
    assert isinstance(f3.forge_client, Forge)
    assert isinstance(f3.connect_client, MDFConnectClient)


@pytest.mark.xfail(reason="Test should have a local endpoint, will fail cloud CI")
def test_download_globus():


f = Foundry(no_browser=True, no_local_server=True)
f = Foundry(authorizers=auths, no_browser=True, no_local_server=True)

_delete_test_data(f)

Expand All @@ -133,8 +145,7 @@ def test_download_globus():


def test_globus_dataframe_load():

f = Foundry(no_browser=True, no_local_server=True)
f = Foundry(authorizers=auths, no_browser=True, no_local_server=True)

_delete_test_data(f)

Expand All @@ -150,40 +161,39 @@ def test_globus_dataframe_load():
_delete_test_data(f)



def test_publish():
# TODO: automate dealing with curation and cleaning after tests

f = Foundry(no_browser=True, no_local_server=True)
f = Foundry(authorizers=auths, index="mdf-test", no_browser=True, no_local_server=True)

timestamp = datetime.now().timestamp()
title = "scourtas_example_iris_test_publish_{:.0f}".format(timestamp)
short_name = "example_AS_iris_test_{:.0f}".format(timestamp)
authors = ["A Scourtas"]

res = f.publish(test_metadata, test_data_source, title, authors, short_name=short_name)
res = f.publish(pub_test_metadata, pub_test_data_source, title, authors, short_name=short_name)

# publish with short name
assert res['success']
assert res['source_id'] == "_test_example_iris_{:.0f}_v1.1".format(timestamp)

# TODO: publish with long title -- for some reason even when I change the title, it still says it's already pub'd
# title += "long"
# res = f.publish(test_metadata, test_data_source, title, authors)
# res = f.publish(pub_test_metadata, pub_test_data_source, title, authors)
# assert res['success']
# assert res['source_id'] == "_test_scourtas_example_iris_publish_{:.0f}_v1.1".format(timestamp)

# check that pushing same dataset without update flag fails
res = f.publish(test_metadata, test_data_source, title, authors, short_name=short_name)
res = f.publish(pub_test_metadata, pub_test_data_source, title, authors, short_name=short_name)
assert not res['success']

# check that using update flag allows us to update dataset
res = f.publish(test_metadata, test_data_source, title, authors, short_name=short_name, update=True)
res = f.publish(pub_test_metadata, pub_test_data_source, title, authors, short_name=short_name, update=True)
assert res['success']

# check that using update flag for new dataset fails
new_short_name = short_name + "_update"
res = f.publish(test_metadata, test_data_source, title, authors, short_name=new_short_name, update=True)
res = f.publish(pub_test_metadata, pub_test_data_source, title, authors, short_name=new_short_name, update=True)
assert not res['success']


Expand Down
10 changes: 5 additions & 5 deletions tests/test_foundry_gha.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
client_id = os.getenv('CLIENT_ID')
client_secret = os.getenv('CLIENT_SECRET')

services= [
services = [
"data_mdf",
"mdf_connect",
"search",
Expand All @@ -36,11 +36,11 @@

auths['search_authorizer'] = search_auth['search']

#updated test dataset
test_dataset = "_test_foundry_iris_dev_v2.1"
expected_title = "Iris Dataset"
# updated test dataset
test_dataset = "foundry_experimental_band_gaps_v1.1"
expected_title = "Graph Network Based Deep Learning of Band Gaps - Experimental Band Gaps"

#Kept the Old metadata format in case we ever want to refer back
# Kept the Old metadata format in case we ever want to refer back
old_test_metadata = {
"inputs": ["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"],
"input_descriptions": ["sepal length in unit(cm)", "sepal width in unit(cm)", "petal length in unit(cm)",
Expand Down

0 comments on commit cddad8c

Please sign in to comment.