Skip to content

Commit

Permalink
Fix file metadata incomplete return bug
Browse files Browse the repository at this point in the history
If a file is listed or checked for existence and an UPA is added to the file
before the file is completely uploaded, the metadata will never be fully
created.
  • Loading branch information
MrCreosote committed May 17, 2022
1 parent acc6048 commit bc09575
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 5 deletions.
3 changes: 3 additions & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
### Version 1.3.5
- Fixed a bug that under some circumstances could cause incomplete file metadata to be returned.

### Version 1.3.4
- Alter the behavior of the bulk specification file writers to return an error if the
input `types` parameter is empty.
Expand Down
2 changes: 1 addition & 1 deletion staging_service/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
routes = web.RouteTableDef()
VERSION = "1.3.4"
VERSION = "1.3.5"

_DATATYPE_MAPPINGS = None

Expand Down
11 changes: 7 additions & 4 deletions staging_service/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ async def _generate_metadata(path: Path, source: str):
else:
data = {}
# first ouptut of md5sum is the checksum
data["source"] = source
if source:
data["source"] = source
try:
md5 = hashlib.md5(open(path.full_path, "rb").read()).hexdigest()
except:
Expand Down Expand Up @@ -165,7 +166,7 @@ async def some_metadata(path: Path, desired_fields=False, source=None):
os.stat(path.metadata_path).st_mtime < file_stats["mtime"] / 1000
):
# if metadata does not exist or older than file: regenerate
if source is None:
if source is None: # TODO BUGFIX this will overwrite any source in the file
source = _determine_source(path)
data = await _generate_metadata(path, source)
else: # metadata already exists and is up to date
Expand All @@ -174,9 +175,11 @@ async def some_metadata(path: Path, desired_fields=False, source=None):
data = await f.read()
data = decoder.decode(data)
# due to legacy code, some file has corrupted metadata file
# Also if a file is listed or checked for existence before the upload completes
# this code block will be triggered
expected_keys = ["source", "md5", "lineCount", "head", "tail"]
if set(expected_keys) > set(data.keys()):
if source is None:
if not set(expected_keys) <= set(data.keys()):
if source is None and "source" not in data:
source = _determine_source(path)
data = await _generate_metadata(path, source)
data = {**data, **file_stats}
Expand Down
80 changes: 80 additions & 0 deletions tests/test_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
""" Unit tests for the metadata handling routines. """

import json
import uuid

from collections.abc import Callable, Generator
from pathlib import Path as PyPath
from pytest import fixture

from staging_service.utils import Path
from staging_service.metadata import some_metadata

from tests.test_app import FileUtil

# TODO TEST add more unit tests here.


@fixture(scope="module")
def temp_dir() -> Generator[PyPath, None, None]:
with FileUtil() as fu:
childdir = PyPath(fu.make_dir(str(uuid.uuid4()))).resolve()

yield childdir

# FileUtil will auto delete after exiting


async def test_incomplete_metadata_file_update(temp_dir: Path):
"""
Tests the case where a file is listed or checked for existance prior to completing
upload, and then an UPA is added to the file. This previously caused incomplete metadata
to be returned as the logic for whether to run the metadata regeneration code based on
the contents of the current metadata file was incorrect.
See https://kbase-jira.atlassian.net/browse/PTV-1767
"""
await _incomplete_metadata_file_update(
temp_dir,
{"source": "some place", "UPA": "1/2/3"},
"some place"
)

await _incomplete_metadata_file_update(
temp_dir,
{"UPA": "1/2/3"},
"Unknown"
)

async def _incomplete_metadata_file_update(temp_dir, metadict, source):
target = Path(
str(temp_dir / "full"),
str(temp_dir / "meta"),
"user_path",
"myfilename",
"super_fake_jgi_path")

with open(target.full_path, "w") as p:
p.writelines(make_test_lines(1, 6))
with open(target.metadata_path, "w") as p:
p.write(json.dumps(metadict))

res = await some_metadata(target)

assert "mtime" in res # just check the file time is there
del res["mtime"]

assert res == {
"source": source,
"md5": "9f07da9655223b777121141ff7735f25",
"head": "".join(make_test_lines(1, 5)),
"tail": "".join(make_test_lines(2, 6)),
"UPA": "1/2/3",
"isFolder": False,
"lineCount": "5",
"name": "myfilename",
"path": "user_path",
"size": 1280
}

def make_test_lines(start, stop):
return [str(i) + "a" * (256 - len(str(i)) - 1) + "\n" for i in range(start, stop)]

0 comments on commit bc09575

Please sign in to comment.