Skip to content

Commit

Permalink
Merge pull request #178 from MLMI2-CSSI/dev
Browse files Browse the repository at this point in the history
add f.run() functionality
add compatibility with Globus SDK 3
add DOI visibility to dataset summary
  • Loading branch information
ascourtas authored May 11, 2022
2 parents bc268cb + 9df5519 commit cddad8c
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 97 deletions.
44 changes: 0 additions & 44 deletions .github/workflows/python-package.yml

This file was deleted.

26 changes: 20 additions & 6 deletions foundry/foundry.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class Foundry(FoundryMetadata):
xtract_tokens: Any

def __init__(
self, no_browser=False, no_local_server=False, index="mdf-test", authorizers=None, **data
self, no_browser=False, no_local_server=False, index="mdf", authorizers=None, **data
):
"""Initialize a Foundry client
Args:
Expand All @@ -73,8 +73,7 @@ def __init__(
if authorizers:
auths = authorizers
else:
auths = mdf_toolbox.login(
services=[
services = [
"data_mdf",
"mdf_connect",
"search",
Expand All @@ -84,12 +83,24 @@ def __init__(
"funcx",
"openid",
"https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all",
],
]
auths = mdf_toolbox.login(
services=services,
app_name="Foundry",
make_clients=True,
no_browser=no_browser,
no_local_server=no_local_server,
)
# request Search as an authorizer and not a client
search_auth = mdf_toolbox.login(
services=['search'],
app_name="Foundry",
make_clients=False,
no_browser=no_browser,
no_local_server=no_local_server,
)
# add special SearchAuthorizer object
auths['search_authorizer'] = search_auth['search']

self.forge_client = Forge(
index=index,
Expand Down Expand Up @@ -203,7 +214,7 @@ def list(self):
Returns
-------
(pandas.DataFrame): DataFrame with summary list of Foundry data packages including name, title, and publication year
(pandas.DataFrame): DataFrame with summary list of Foundry data packages including name, title, publication year, and DOI
"""
res = (
self.forge_client.match_field(
Expand All @@ -212,12 +223,14 @@ def list(self):
.search()
)


return pd.DataFrame(
[
{
"source_id": r["mdf"]["source_id"],
"name": r["dc"]["titles"][0]["title"],
"year": r["dc"].get("publicationYear", None),
"DOI": r["dc"]["identifier"]["identifier"],
}
for r in res
]
Expand Down Expand Up @@ -322,8 +335,9 @@ def _repr_html_(self) -> str:
authors = [creator['creatorName']
for creator in self.dc['creators']]
authors = '; '.join(authors)
DOI = "DOI: " + self.dc['identifier']['identifier']

buf = f'<h2>{title}</h2>{authors}'
buf = f'<h2>{title}</h2>{authors}<p>{DOI}</p>'

buf = f'{buf}<h3>Dataset</h3>{convert(json.loads(self.dataset.json(exclude={"dataframe"})))}'
# buf = f'{buf}<h3>MDF</h3>{convert(self.mdf)}'
Expand Down
13 changes: 7 additions & 6 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
globus-sdk>=1.2.1,<=2.0.3
dlhub_sdk==0.10.0
globus-sdk>=3,<4
dlhub_sdk>=0.10.2
requests>=2.18.4
tqdm>=4.19.4
six>=1.11.0
h5py>=2.10.0
pandas==1.2.2
numpy>=1.15.4
pandas>=0.23.4
pydantic>=1.6.1
joblib>=0.16.0
mdf_forge>=0.7.6
mdf-connect-client>=0.3.8
joblib>=1.1.0
mdf_forge>=0.8.0
mdf-connect-client>=0.4.0
json2table>=1.1.5
10 changes: 5 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,21 @@
# TODO: change dependencies to be looser
setuptools.setup(
name="foundry_ml",
version="0.1.2",
version="0.2.0",
author="Aristana Scourtas, KJ Schmidt, Imogen Foster, Ribhav Bose, Zoa Katok, Ethan Truelove, Ian Foster, Ben Blaiszik",
author_email="[email protected]",
packages=setuptools.find_packages(),
description="Package to support simplified application of machine learning models to datasets in materials science",
long_description=long_description,
long_description_content_type="text/markdown",
install_requires=[
"mdf_forge>=0.7.6",
"globus-sdk>=1.2.1,<=2.0.3",
"dlhub_sdk==0.10.0",
"mdf_forge>=0.8.0",
"globus-sdk>=3,<4",
"dlhub_sdk>=0.10.2",
"numpy>=1.15.4",
"pandas>=0.23.4",
"pydantic>=1.6.1",
"mdf_connect_client>=0.3.8",
"mdf_connect_client>=0.4.0",
"h5py>=2.10.0",
"joblib>=1.1.0",
"json2table"
Expand Down
72 changes: 41 additions & 31 deletions tests/local_only_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,30 @@
from dlhub_sdk import DLHubClient
from mdf_connect_client import MDFConnectClient



#updated test dataset
test_dataset = "_test_foundry_iris_dev_v2.1"
expected_title = "Iris Dataset"


#Kept the Old metadata format in case we ever want to refer back
# Globus Auth scopes required by the Foundry stack; this list mirrors the one
# used in Foundry.__init__ so tests authenticate with identical permissions.
services = [
    "data_mdf",
    "mdf_connect",
    "search",
    "dlhub",
    "petrel",
    "transfer",
    "openid",
    "https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all",]

# Log in once at module import so every test shares the same client set.
auths = mdf_toolbox.login(services=services, make_clients=True)
# Request Search as an authorizer and not a client, then attach it under the
# special 'search_authorizer' key that the Foundry constructor expects.
search_auth = mdf_toolbox.login(services=['search'], make_clients=False)
auths['search_authorizer'] = search_auth['search']

# updated test dataset for publication
pub_test_dataset = "_test_foundry_iris_dev_v2.1"
pub_expected_title = "Iris Dataset"

# test dataset for all other tests
test_dataset = "foundry_experimental_band_gaps_v1.1"
expected_title = "Graph Network Based Deep Learning of Band Gaps - Experimental Band Gaps"


# Kept the Old metadata format in case we ever want to refer back
old_test_metadata = {
"inputs": ["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"],
"input_descriptions": ["sepal length in unit(cm)", "sepal width in unit(cm)", "petal length in unit(cm)",
Expand All @@ -31,8 +47,7 @@
"package_type": "tabular"
}


test_metadata = {
pub_test_metadata = {
"keys":[
{
"key": ["sepal length (cm)"],
Expand Down Expand Up @@ -89,40 +104,37 @@
'domain': ['materials science', 'chemistry'],
'n_items': 1000
}
# Globus endpoint for '_iris_dev'
test_data_source = "https://app.globus.org/file-manager?origin_id=e38ee745-6d04-11e5-ba46-22000b92c6ec&origin_path=%2Ffoundry-test%2Firis-dev%2F"

# Globus endpoint for '_iris_dev' for test publication
pub_test_data_source = "https://app.globus.org/file-manager?origin_id=e38ee745-6d04-11e5-ba46-22000b92c6ec&origin_path=%2Ffoundry-test%2Firis-dev%2F"

#Quick function to delete any downloaded test data

# Quick function to delete any downloaded test data
def _delete_test_data(foundry_obj):
path = os.path.join(foundry_obj.config.local_cache_dir, test_dataset)
if os.path.isdir(path):
shutil.rmtree(path)


def test_foundry_init():
    """Smoke-test client construction with pre-supplied authorizers.

    Every no_browser/no_local_server combination should yield working DLHub,
    Forge, and MDF Connect sub-clients.
    """
    f = Foundry(authorizers=auths)
    assert isinstance(f.dlhub_client, DLHubClient)
    assert isinstance(f.forge_client, Forge)
    assert isinstance(f.connect_client, MDFConnectClient)

    f2 = Foundry(authorizers=auths, no_browser=False, no_local_server=True)
    assert isinstance(f2.dlhub_client, DLHubClient)
    assert isinstance(f2.forge_client, Forge)
    assert isinstance(f2.connect_client, MDFConnectClient)

    f3 = Foundry(authorizers=auths, no_browser=True, no_local_server=False)
    assert isinstance(f3.dlhub_client, DLHubClient)
    assert isinstance(f3.forge_client, Forge)
    assert isinstance(f3.connect_client, MDFConnectClient)


@pytest.mark.xfail(reason="Test should have a local endpoint, will fail cloud CI")
def test_download_globus():


f = Foundry(no_browser=True, no_local_server=True)
f = Foundry(authorizers=auths, no_browser=True, no_local_server=True)

_delete_test_data(f)

Expand All @@ -133,8 +145,7 @@ def test_download_globus():


def test_globus_dataframe_load():

f = Foundry(no_browser=True, no_local_server=True)
f = Foundry(authorizers=auths, no_browser=True, no_local_server=True)

_delete_test_data(f)

Expand All @@ -150,40 +161,39 @@ def test_globus_dataframe_load():
_delete_test_data(f)



def test_publish():
# TODO: automate dealing with curation and cleaning after tests

f = Foundry(no_browser=True, no_local_server=True)
f = Foundry(authorizers=auths, index="mdf-test", no_browser=True, no_local_server=True)

timestamp = datetime.now().timestamp()
title = "scourtas_example_iris_test_publish_{:.0f}".format(timestamp)
short_name = "example_AS_iris_test_{:.0f}".format(timestamp)
authors = ["A Scourtas"]

res = f.publish(test_metadata, test_data_source, title, authors, short_name=short_name)
res = f.publish(pub_test_metadata, pub_test_data_source, title, authors, short_name=short_name)

# publish with short name
assert res['success']
assert res['source_id'] == "_test_example_iris_{:.0f}_v1.1".format(timestamp)

# TODO: publish with long title -- for some reason even when I change the title, it still says it's already pub'd
# title += "long"
# res = f.publish(test_metadata, test_data_source, title, authors)
# res = f.publish(pub_test_metadata, pub_test_data_source, title, authors)
# assert res['success']
# assert res['source_id'] == "_test_scourtas_example_iris_publish_{:.0f}_v1.1".format(timestamp)

# check that pushing same dataset without update flag fails
res = f.publish(test_metadata, test_data_source, title, authors, short_name=short_name)
res = f.publish(pub_test_metadata, pub_test_data_source, title, authors, short_name=short_name)
assert not res['success']

# check that using update flag allows us to update dataset
res = f.publish(test_metadata, test_data_source, title, authors, short_name=short_name, update=True)
res = f.publish(pub_test_metadata, pub_test_data_source, title, authors, short_name=short_name, update=True)
assert res['success']

# check that using update flag for new dataset fails
new_short_name = short_name + "_update"
res = f.publish(test_metadata, test_data_source, title, authors, short_name=new_short_name, update=True)
res = f.publish(pub_test_metadata, pub_test_data_source, title, authors, short_name=new_short_name, update=True)
assert not res['success']


Expand Down
10 changes: 5 additions & 5 deletions tests/test_foundry_gha.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
client_id = os.getenv('CLIENT_ID')
client_secret = os.getenv('CLIENT_SECRET')

services= [
services = [
"data_mdf",
"mdf_connect",
"search",
Expand All @@ -36,11 +36,11 @@

auths['search_authorizer'] = search_auth['search']

#updated test dataset
test_dataset = "_test_foundry_iris_dev_v2.1"
expected_title = "Iris Dataset"
# updated test dataset
test_dataset = "foundry_experimental_band_gaps_v1.1"
expected_title = "Graph Network Based Deep Learning of Band Gaps - Experimental Band Gaps"

#Kept the Old metadata format in case we ever want to refer back
# Kept the Old metadata format in case we ever want to refer back
old_test_metadata = {
"inputs": ["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"],
"input_descriptions": ["sepal length in unit(cm)", "sepal width in unit(cm)", "petal length in unit(cm)",
Expand Down

0 comments on commit cddad8c

Please sign in to comment.