Make FH fetch_publication_data more stable #171

Status: Closed · wants to merge 3 commits
recirq/fermi_hubbard/publication.py · 36 changes: 20 additions & 16 deletions
@@ -13,10 +13,11 @@
 # limitations under the License.
 """Data specific to experiment published in arXiv:2010.07965."""

-from io import BytesIO
 from copy import deepcopy
+from io import BytesIO
 import os
-from typing import Callable, List, Optional, Tuple
+import re
+from typing import Callable, Optional, Tuple
 from urllib.request import urlopen
 from zipfile import ZipFile

@@ -217,7 +218,7 @@ def rainbow23_layouts(sites_count: int = 8) -> Tuple[ZigZagLayout]:

 def fetch_publication_data(
     base_dir: Optional[str] = None,
-    exclude: Optional[List[str]] = None,
+    exclude: Optional[Tuple[str]] = (),
 ) -> None:
     """Downloads and extracts publication data from the Dryad repository at
     https://doi.org/10.5061/dryad.crjdfn32v, saving to disk.
@@ -239,23 +240,26 @@ def fetch_publication_data(
     if base_dir is None:
         base_dir = "fermi_hubbard_data"

-    base_url = "https://datadryad.org/stash/downloads/file_stream/"
-    data = {
-        "gaussians_1u1d_nofloquet": "451326",
-        "gaussians_1u1d": "451327",
-        "trapping_2u2d": "451328",
-        "trapping_3u3d": "451329"
-    }
-    if exclude is not None:
-        data = {path: key for path, key in data.items() if path not in exclude}
-
-    for path, key in data.items():
+    fnames = {
+        "gaussians_1u1d_nofloquet", "gaussians_1u1d", "trapping_2u2d", "trapping_3u3d"
+    }.difference(exclude)
+
+    # Determine file IDs. Note these are not permanent on Dryad.
+    file_ids = {}
+    for line in urlopen("https://doi.org/10.5061/dryad.crjdfn32v").readlines():
Collaborator:
Is this parsing the human-readable HTML website? If so, this seems just as unstable. What if they add a newline between the filename.zip and the ID?

Contributor (Author):
Yes, this would break it. It may be that this is more stable than the file IDs: the file IDs have changed at least once since fetch_publication_data was added, but I don't think the HTML has. I think the only way to hermetically seal it is to tell Dryad to give individual files DOIs, or to use a different service that does.
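
One way to harden the scrape against the newline concern above is to match over the whole page rather than line by line. A minimal sketch, assuming only that the landing page still mentions each name.zip within a few hundred characters of its file_stream link; the helper name and window size are illustrative, not part of this PR:

import re
from urllib.request import urlopen

def find_file_ids(fnames, landing_url="https://doi.org/10.5061/dryad.crjdfn32v"):
    """Scrape Dryad file_stream IDs, tolerating newlines in the markup."""
    html = urlopen(landing_url).read().decode()
    file_ids = {}
    for fname in fnames:
        idx = html.find(fname + ".zip")
        if idx == -1:
            continue  # File not listed on the landing page.
        # Search a window around the file name, so the ID may sit on any
        # nearby line, before or after the name itself.
        window = html[max(0, idx - 500): idx + 500]
        ids = re.findall(r"file_stream/\d+", window)
        if ids:
            file_ids[fname] = ids[0]
    return file_ids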

+        for fname in fnames:
+            if fname + ".zip" in line.decode():
+                file_id, = re.findall(r"file_stream/[\d]*\d+", line.decode())
+                file_ids.update({fname: file_id})
+
+    # Download and extract files using IDs.
+    for path, key in file_ids.items():
         print(f"Downloading {path}...")
-        if os.path.exists(path=base_dir + os.path.sep + path):
+        if os.path.exists(path=os.path.join(base_dir, path)):
             print("Data already exists.\n")
             continue

-        with urlopen(base_url + key) as stream:
+        with urlopen("https://datadryad.org/stash/downloads/" + key) as stream:
             with ZipFile(BytesIO(stream.read())) as zfile:
                 zfile.extractall(base_dir)

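For reference, a small worked example of how the new ID scrape and download URL fit together; the HTML fragment is hypothetical and only illustrates the shape the regex expects:

import re

# Hypothetical landing-page fragment; the real Dryad markup may differ.
line = b'<a href="/stash/downloads/file_stream/451326">gaussians_1u1d.zip</a>'

if "gaussians_1u1d.zip" in line.decode():
    file_id, = re.findall(r"file_stream/[\d]*\d+", line.decode())
    print(file_id)  # file_stream/451326
    print("https://datadryad.org/stash/downloads/" + file_id)
    # https://datadryad.org/stash/downloads/file_stream/451326
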
recirq/fermi_hubbard/publication_test.py · 6 changes: 3 additions & 3 deletions
@@ -18,10 +18,10 @@

 def test_fetch_publication_data():
     base_dir = "fermi_hubbard_data"
-    fetch_publication_data(base_dir=base_dir, exclude=["trapping_3u3d"])
+    fetch_publication_data(base_dir=base_dir, exclude=("trapping_3u3d",))

     for path in ("gaussians_1u1d_nofloquet", "gaussians_1u1d", "trapping_2u2d"):
-        assert os.path.exists(base_dir + os.path.sep + path)
+        assert os.path.exists(os.path.join(base_dir, path))

     fetch_publication_data(base_dir=base_dir)
-    assert os.path.exists(base_dir + os.path.sep + "trapping_3u3d")
+    assert os.path.exists(os.path.join(base_dir, "trapping_3u3d"))
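
Since the test downloads real data into a fixed fermi_hubbard_data directory in the working tree, a variant using pytest's tmp_path fixture would keep runs isolated. A sketch under that assumption, not part of this PR:

import os

from recirq.fermi_hubbard.publication import fetch_publication_data

def test_fetch_publication_data_isolated(tmp_path):
    # tmp_path is the standard pytest fixture: a unique pathlib.Path per test.
    base_dir = str(tmp_path / "fermi_hubbard_data")
    fetch_publication_data(base_dir=base_dir, exclude=("trapping_3u3d",))
    for path in ("gaussians_1u1d_nofloquet", "gaussians_1u1d", "trapping_2u2d"):
        assert os.path.exists(os.path.join(base_dir, path))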