diff --git a/exasol/bucketfs/saas_file_api.py b/exasol/bucketfs/saas_file_api.py
new file mode 100644
index 00000000..d1bf897c
--- /dev/null
+++ b/exasol/bucketfs/saas_file_api.py
@@ -0,0 +1,115 @@
+from typing import Protocol, Literal, Iterable, ByteString, BinaryIO
+from datetime import datetime
+
+
+class Unset:
+    """
+    This is a temporary class that will be replaced with relevant construct from
+    SaaS Open API, once it is available.
+
+    """
+    def __bool__(self) -> Literal[False]:
+        return False
+
+
+UNSET: Unset = Unset()
+
+SAAS_FOLDER = 'folder'
+
+
+class SaasFile:
+    """
+    This is a temporary class that will be replaced with the File definition in the
+    SaaS Open API, once it is available.
+    """
+    name: str
+    type: str
+    path: str
+    last_modified: datetime
+    size: Unset | int = UNSET
+    children: Unset | list["SaasFile"] = UNSET
+
+
+class SaasFileApi(Protocol):
+
+    def list_files(self) -> list[SaasFile]:
+        """
+        Returns the file system as a tree structure where nodes are SaasFile objects.
+        """
+
+    def create_folder(self, path: str) -> None:
+        """
+        Creates a folder in the SaaS file system.
+
+        :param path: The folder path.
+
+        Q. What happens if the parent is missing?
+        A. It will create it.
+
+        Q. What happens if the directory already exists?
+        A. It will do nothing, no error raised.
+        """
+
+    def delete_file(self, path: str) -> None:
+        """
+        Deletes a file in the SaaS file system.
+
+        :param path: Path of the file to be deleted.
+
+        Q. What happens if the path doesn't exist?
+        A. It does nothing, no error.
+
+        Q. What happens if the path points to a directory?
+        A. It does nothing, no error.
+        """
+
+    def delete_folder(self, path: str) -> None:
+        """
+        Deletes a folder in the SaaS file system.
+
+        :param path: Path of the folder to be deleted.
+
+        Q. Should the folder be empty?
+        A. Yes, it should be empty, otherwise it won't be deleted,
+           however, no error will be raised.
+
+        Q. What happens if the path points to a file?
+        A. Nothing, no error.
+
+        Q. What happens if the path points to nothing?
+        A. Nothing, no error is raised.
+        """
+
+    def upload_file(self, path: str, data: ByteString | BinaryIO) -> None:
+        """
+        Uploads a file to the SaaS file system.
+
+        :param path: Path in the SaaS file system where the file should be uploaded.
+        :param data: Either a binary array or a binary stream, e.g. a file opened in the binary mode.
+
+        Q. What happens if the parent is missing?
+        A. It will create it.
+
+        Q. What happens if the path points to an existing file?
+        A. That's fine, the file will be updated.
+
+        Q. What happens if the path points to an existing directory?
+        A. It will create a file and keep the directory.
+        """
+
+    def download_file(self, path: str, chunk_size: int = 8192) -> Iterable[ByteString]:
+        """
+        Downloads a file from the SaaS file system. The content of the file will be provided
+        in chunks of the specified size. The full content of the file can be constructed using
+        code similar to the line below.
+        content = b''.join(api.download_file(path))
+
+        :param path: Path of the file in the SaaS file system that should be downloaded.
+        :param chunk_size: Size of the chunks the file content will be delivered in.
+
+        Q. What happens if the path points to a directory?
+        A. Gets 404 Not Found error => raises FileNotFoundError exception.
+
+        Q. What happens if the path points to nothing?
+        A. The same.
+        """
diff --git a/exasol/bucketfs/saas_path.py b/exasol/bucketfs/saas_path.py
new file mode 100644
index 00000000..f7cf6447
--- /dev/null
+++ b/exasol/bucketfs/saas_path.py
@@ -0,0 +1,221 @@
+from typing import ByteString, BinaryIO, Iterable, Optional, Generator
+from datetime import datetime
+from pathlib import PurePath
+import errno
+import os
+from io import IOBase
+from exasol.bucketfs.saas_file_api import SaasFileApi, SaasFile, SAAS_FOLDER
+from exasol.bucketfs.pathlike import Pathlike
+
+
+def _is_file(node: SaasFile) -> bool:
+    """
+    The logic to determine if a node in the SaaS file system refers to a file
+    or a directory.
+    """
+    return bool(node.type != SAAS_FOLDER and not node.children)
+
+
+def _create_root(node_list: list[SaasFile]) -> SaasFile:
+    """
+    Creates the root node of the SaaS file system, that is not returned by the
+    SaaS File API.
+    """
+    root = SaasFile()
+    root.name = ''
+    root.path = ''
+    root.type = SAAS_FOLDER
+    # An empty file system has no nodes; max() without a default would raise ValueError.
+    root.last_modified = max((nd.last_modified for nd in node_list),
+                             default=datetime.min)
+    root.children = node_list
+    return root
+
+
+def _walk_node(node: SaasFile, path: Pathlike, top_down: bool) -> \
+        Generator[tuple[Pathlike, list[str], list[str]], None, None]:
+    """
+    Implements a recursive walk over the SaaS file system represented by a
+    Pathlike object.
+
+    :param node:        The current node.
+    :param path:        A Pathlike object corresponding to the current node
+                        (the correspondence is not checked).
+    :param top_down:    If True, the current node is yielded first, then its
+                        descendants. If False, it is the other way round.
+    """
+    dir_list = []
+    file_list = []
+    if node.children:
+        for child in node.children:
+            if _is_file(child):
+                file_list.append(child.name)
+            else:
+                dir_list.append(child.name)
+    if top_down:
+        yield path, dir_list, file_list
+    if node.children:
+        for child in node.children:
+            if not _is_file(child):
+                for paths, dirs, files in _walk_node(child, path / child.name, top_down):
+                    yield paths, dirs, files
+    if not top_down:
+        yield path, dir_list, file_list
+
+
+class SaaSBucketPath:
+    """
+    Implementation of the BucketFS Pathlike protocol for the SaaS file system.
+    """
+
+    def __init__(self, saas_path: str | PurePath, saas_file_api: SaasFileApi):
+        """
+        :param saas_path:       A pure path relative to the root of the SaaS file system.
+                                all Pure Path methods of the Pathlike protocol will be
+                                delegated to this object.
+
+        :param saas_file_api:   An object supporting the SaaS File API protocol.
+        """
+        self._path = PurePath(saas_path)
+        self._saas_file_api = saas_file_api
+
+    def _navigate(self) -> Optional[SaasFile]:
+        """
+        Reads the file structure from the SaaS file system and navigates to the node
+        corresponding to the pure path of this object. Returns None if such node doesn't
+        exist, otherwise returns this node.
+        """
+
+        node_list = self._saas_file_api.list_files()
+
+        # The tree returned from the SaaS File API has no root. It starts from a list of
+        # children of an implied root node. In case the path points to a root, this node
+        # has to be created.
+        if not self._path.parts:
+            return _create_root(node_list)
+
+        node: Optional[SaasFile] = None
+        for part in self._path.parts:
+            if not node_list:
+                return None
+            for node in node_list:
+                if node.name == part:
+                    node_list = node.children
+                    break
+            else:
+                return None
+        return node
+
+    @property
+    def name(self) -> str:
+        return self._path.name
+
+    @property
+    def suffix(self) -> str:
+        return self._path.suffix
+
+    @property
+    def root(self) -> str:
+        return self._path.root
+
+    @property
+    def parent(self) -> str:
+        return self._path.parent.name
+
+    def as_uri(self) -> str:
+        return self._path.as_uri()
+
+    def exists(self) -> bool:
+        return self._navigate() is not None
+
+    def is_dir(self) -> bool:
+        current_node = self._navigate()
+        return (current_node is not None) and (not _is_file(current_node))
+
+    def is_file(self) -> bool:
+        current_node = self._navigate()
+        return (current_node is not None) and _is_file(current_node)
+
+    def read(self, chunk_size: int = 8192) -> Iterable[ByteString]:
+        return self._saas_file_api.download_file(str(self._path), chunk_size)
+
+    def write(self, data: ByteString | BinaryIO | Iterable[ByteString]):
+        """
+        Uploads the data to the file this path points to.
+
+        :param data:    Either a bytes-like object, a binary stream, or an iterable
+                        of bytes-like chunks, which get concatenated before the upload.
+        """
+        if not isinstance(data, (IOBase, ByteString)) and isinstance(data, Iterable):
+            # Materialise the chunks before checking them. A one-shot iterator, e.g.
+            # a generator, would be exhausted by the check, leaving nothing to join.
+            chunks = list(data)
+            if all(isinstance(chunk, ByteString) for chunk in chunks):
+                data = b''.join(chunks)
+        return self._saas_file_api.upload_file(str(self._path), data)
+
+    def rm(self):
+        current_node = self._navigate()
+        if current_node is None:
+            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(self._path))
+        elif not _is_file(current_node):
+            raise IsADirectoryError(errno.EISDIR, os.strerror(errno.EISDIR), str(self._path))
+        self._saas_file_api.delete_file(str(self._path))
+
+    def rmdir(self, recursive: bool = False):
+        current_node = self._navigate()
+        if current_node is None:
+            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(self._path))
+        elif _is_file(current_node):
+            raise NotADirectoryError(errno.ENOTDIR, os.strerror(errno.ENOTDIR), str(self._path))
+        elif not current_node.children:
+            self._saas_file_api.delete_folder(str(self._path))
+        elif recursive:
+            self._rmdir_recursive(current_node)
+        else:
+            raise OSError(errno.ENOTEMPTY, os.strerror(errno.ENOTEMPTY), str(self._path))
+
+    def _rmdir_recursive(self, node: SaasFile):
+        if node.children:
+            for child in node.children:
+                self._rmdir_recursive(child)
+        if _is_file(node):
+            self._saas_file_api.delete_file(node.path)
+        else:
+            self._saas_file_api.delete_folder(node.path)
+
+    def joinpath(self, *path_segments) -> "Pathlike":
+        # The path segments can be of either this type or an os.PathLike.
+        cls = type(self)
+        seg_paths = [seg._path if isinstance(seg, cls) else seg for seg in path_segments]
+        new_path = self._path.joinpath(*seg_paths)
+        return cls(new_path, self._saas_file_api)
+
+    def walk(self, top_down: bool = True) -> Generator[tuple[Pathlike, list[str], list[str]], None, None]:
+        current_node = self._navigate()
+        if current_node is None:
+            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(self._path))
+
+        if not _is_file(current_node):
+            for output in _walk_node(current_node, self, top_down):
+                yield output
+
+    def iterdir(self) -> Generator[Pathlike, None, None]:
+        current_node = self._navigate()
+        if current_node is None:
+            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(self._path))
+        if _is_file(current_node):
+            raise NotADirectoryError(errno.ENOTDIR, os.strerror(errno.ENOTDIR), str(self._path))
+
+        if current_node.children:
+            for child in current_node.children:
+                yield type(self)(self._path / child.name, self._saas_file_api)
+
+    def __truediv__(self, other):
+        # The other object can be of either this type or an os.PathLike.
+        cls = type(self)
+        new_path = self._path / (other._path if isinstance(other, cls) else other)
+        return cls(new_path, self._saas_file_api)
+
+    def __str__(self):
+        return str(self._path)
diff --git a/test/unit/saas_file_mock.py b/test/unit/saas_file_mock.py
new file mode 100644
index 00000000..aa7cfe04
--- /dev/null
+++ b/test/unit/saas_file_mock.py
@@ -0,0 +1,91 @@
+from typing import Iterable, ByteString, BinaryIO
+from datetime import datetime
+import os
+from io import IOBase
+import shutil
+import errno
+from pathlib import Path
+
+from exasol.bucketfs.saas_file_api import SaasFile, UNSET, SAAS_FOLDER
+
+
+class SaasFileApiMock:
+    """
+    Implementation of the SaaS File API backed by the normal file system.
+    """
+
+    def __init__(self, root: str):
+        self.root = Path(root)
+        if not self.root.is_dir():
+            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(root))
+
+    def _get_full_path(self, path: str | Path):
+        return self.root / path
+
+    def list_files(self) -> list[SaasFile]:
+        def get_children(node: Path):
+            children: list[SaasFile] = []
+            for path in node.glob('*'):
+                child = SaasFile()
+                child.name = path.name
+                # Report the path relative to the root, i.e. in the form
+                # the other API methods accept.
+                child.path = path.relative_to(self.root).as_posix()
+                st = path.stat()
+                child.last_modified = datetime.fromtimestamp(st.st_mtime)
+                if path.is_dir():
+                    child.type = SAAS_FOLDER
+                    child.size = UNSET
+                    child.children = get_children(path)
+                else:
+                    child.type = path.suffix
+                    child.size = st.st_size
+                    child.children = UNSET
+                children.append(child)
+            return children
+        return get_children(self.root)
+
+    def create_folder(self, path: str) -> None:
+        # The deviation from the API is in the case when path points
+        # to an existing file. The API allows having both a directory and a file
+        # with the same name, but the Path object doesn't.
+        self._get_full_path(path).mkdir(parents=True, exist_ok=True)
+
+    def delete_file(self, path: str) -> None:
+        full_path = self._get_full_path(path)
+        # Deleting a directory must be a silent no-op. unlink() reports that case
+        # as IsADirectoryError on Linux but PermissionError on macOS and Windows.
+        if not full_path.is_dir():
+            full_path.unlink(missing_ok=True)
+
+    def delete_folder(self, path: str) -> None:
+        try:
+            self._get_full_path(path).rmdir()
+        except OSError:
+            pass
+
+    def upload_file(self, path: str, data: ByteString | BinaryIO) -> None:
+        full_path = self._get_full_path(path)
+        if not full_path.parent.exists():
+            full_path.parent.mkdir(parents=True)
+        with full_path.open('wb') as f:
+            if isinstance(data, IOBase):
+                shutil.copyfileobj(data, f)
+            elif isinstance(data, ByteString):
+                f.write(data)
+            else:
+                raise ValueError('upload_file called with unrecognised data type. '
+                                 'A valid data should be either ByteString or BinaryIO')
+
+    def download_file(self, path: str, chunk_size: int = 8192) -> Iterable[ByteString]:
+        full_path = self._get_full_path(path)
+        if full_path.exists():
+            if full_path.is_file():
+                with full_path.open('rb') as f:
+                    while True:
+                        data = f.read(chunk_size)
+                        if data:
+                            yield data
+                        else:
+                            return
+        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(path))
diff --git a/test/unit/test_saas_path.py b/test/unit/test_saas_path.py
new file mode 100644
index 00000000..d9de0d16
--- /dev/null
+++ b/test/unit/test_saas_path.py
@@ -0,0 +1,230 @@
+from pathlib import Path
+import pytest
+from itertools import chain
+from exasol.bucketfs.saas_path import SaaSBucketPath
+from test.unit.saas_file_mock import SaasFileApiMock
+
+
+@pytest.fixture
+def api_mock(tmpdir) -> SaasFileApiMock:
+    dir1 = tmpdir.mkdir('dir1')
+    dir2 = tmpdir.mkdir('dir2')
+    dir11 = dir1.mkdir('dir11')
+    dir12 = dir1.mkdir('dir12')
+    for d, d_id in zip([tmpdir, dir1, dir2, dir11, dir12], [0, 1, 2, 11, 12]):
+        for i in range(2):
+            file_name = f'file{d_id}{i}.dat'
+            dat = bytes([d_id * i] * 24)
+            with open(str(d / file_name), 'wb') as f:
+                f.write(dat)
+    return SaasFileApiMock(tmpdir)
+
+
+@pytest.mark.parametrize("test_path, should_exist", [
+    ('dir1/file11.dat', True),
+    ('dir1/dir11', True),
+    ('dir1/file19.dat', False),
+    ('dir1/dir3', False)
+])
+def test_file_exists(api_mock, test_path, should_exist):
+    path = SaaSBucketPath(test_path, saas_file_api=api_mock)
+    assert path.exists() == should_exist
+
+
+@pytest.mark.parametrize("test_path, is_dir", [
+    ('dir1/file11.dat', False),
+    ('dir1/dir11', True),
+    ('dir1/file19.dat', False),
+    ('dir1/dir3', False)
+])
+def test_is_dir(api_mock, test_path, is_dir):
+    path = SaaSBucketPath(test_path, saas_file_api=api_mock)
+    assert path.is_dir() == is_dir
+
+
+@pytest.mark.parametrize("test_path, is_file", [
+    ('dir1/file11.dat', True),
+    ('dir1/dir11', False),
+    ('dir1/file19.dat', False),
+    ('dir1/dir3', False)
+])
+def test_is_file(api_mock, test_path, is_file):
+    path = SaaSBucketPath(test_path, saas_file_api=api_mock)
+    assert path.is_file() == is_file
+
+
+def test_root_of_empty_file_system(tmpdir):
+    path = SaaSBucketPath('', saas_file_api=SaasFileApiMock(tmpdir))
+    assert path.is_dir()
+
+
+def test_rm(api_mock):
+    path = SaaSBucketPath('dir1/dir12/file120.dat', saas_file_api=api_mock)
+    path.rm()
+    assert not path.exists()
+
+
+def test_rm_not_exist(api_mock):
+    path = SaaSBucketPath('dir1/dir12/file125.dat', saas_file_api=api_mock)
+    with pytest.raises(FileNotFoundError):
+        path.rm()
+
+
+def test_rm_directory(api_mock):
+    path = SaaSBucketPath('dir1/dir12', saas_file_api=api_mock)
+    with pytest.raises(IsADirectoryError):
+        path.rm()
+
+
+def test_rmdir(api_mock):
+    for i in range(2):
+        SaaSBucketPath(f'dir1/dir12/file12{i}.dat', saas_file_api=api_mock).rm()
+    path = SaaSBucketPath('dir1/dir12', saas_file_api=api_mock)
+    path.rmdir(recursive=False)
+    assert not path.exists()
+
+
+def test_rmdir_recursive(api_mock):
+    path = SaaSBucketPath('dir1', saas_file_api=api_mock)
+    path.rmdir(recursive=True)
+    assert not path.exists()
+
+
+def test_rmdir_not_empty(api_mock):
+    path = SaaSBucketPath('dir1', saas_file_api=api_mock)
+    with pytest.raises(OSError):
+        path.rmdir(recursive=False)
+
+
+def test_rmdir_not_exist(api_mock):
+    path = SaaSBucketPath('dir1/dir5', saas_file_api=api_mock)
+    with pytest.raises(FileNotFoundError):
+        path.rmdir()
+
+
+def test_rmdir_file(api_mock):
+    path = SaaSBucketPath('dir1/dir12/file120.dat', saas_file_api=api_mock)
+    with pytest.raises(NotADirectoryError):
+        path.rmdir()
+
+
+def test_joinpath(api_mock):
+    path1 = SaaSBucketPath('dir1', saas_file_api=api_mock)
+    path2 = 'dir11'
+    path3 = SaaSBucketPath('dir111/dir1111', saas_file_api=api_mock)
+    path4 = Path('dir11111/file111110.dat')
+    path = path1.joinpath(path2, path3, path4)
+    assert isinstance(path, SaaSBucketPath)
+    assert str(path) == 'dir1/dir11/dir111/dir1111/dir11111/file111110.dat'
+
+
+def test_truediv(api_mock):
+    path1 = SaaSBucketPath('dir1', saas_file_api=api_mock)
+    path2 = 'dir11'
+    path3 = SaaSBucketPath('dir111/dir1111', saas_file_api=api_mock)
+    path4 = Path('dir11111/file111110.dat')
+    path = path1 / path2 / path3 / path4
+    assert isinstance(path, SaaSBucketPath)
+    assert str(path) == 'dir1/dir11/dir111/dir1111/dir11111/file111110.dat'
+
+
+def test_walk_top_down(api_mock):
+    path = SaaSBucketPath('', saas_file_api=api_mock)
+    content = [','.join(chain([pth.name, '/'], sorted(dirs), sorted(files)))
+               for pth, dirs, files in path.walk(top_down=True)]
+    expected_content = [
+        ',/,dir1,dir2,file00.dat,file01.dat',
+        'dir1,/,dir11,dir12,file10.dat,file11.dat',
+        'dir11,/,file110.dat,file111.dat',
+        'dir12,/,file120.dat,file121.dat',
+        'dir2,/,file20.dat,file21.dat'
+    ]
+    assert set(content) == set(expected_content)
+    idx = [content.index(expected_content[i]) for i in range(3)]
+    assert idx == sorted(idx)
+
+
+def test_walk_bottom_up(api_mock):
+    path = SaaSBucketPath('', saas_file_api=api_mock)
+    content = [','.join(chain([pth.name, '/'], sorted(dirs), sorted(files)))
+               for pth, dirs, files in path.walk(top_down=False)]
+    expected_content = [
+        'dir11,/,file110.dat,file111.dat',
+        'dir1,/,dir11,dir12,file10.dat,file11.dat',
+        ',/,dir1,dir2,file00.dat,file01.dat',
+        'dir12,/,file120.dat,file121.dat',
+        'dir2,/,file20.dat,file21.dat'
+    ]
+    assert set(content) == set(expected_content)
+    idx = [content.index(expected_content[i]) for i in range(3)]
+    assert idx == sorted(idx)
+
+
+def test_iterdir(api_mock):
+    path = SaaSBucketPath('dir1', saas_file_api=api_mock)
+    content = set(str(node) for node in path.iterdir())
+    expected_content = {
+        'dir1/dir11',
+        'dir1/dir12',
+        'dir1/file10.dat',
+        'dir1/file11.dat'
+    }
+    assert content == expected_content
+
+
+def test_read(api_mock):
+    path = SaaSBucketPath('dir1/dir12/file121.dat', saas_file_api=api_mock)
+    expected_chunk = bytes([12] * 8)
+    # The fixture writes 24 bytes, hence exactly three chunks of 8 are expected.
+    assert list(path.read(chunk_size=8)) == [expected_chunk] * 3
+
+
+def test_read_not_found(api_mock):
+    path = SaaSBucketPath('dir1/file12.dat', saas_file_api=api_mock)
+    with pytest.raises(FileNotFoundError):
+        list(path.read())
+
+
+@pytest.mark.parametrize("file_name", ['file23.dat', 'file20.dat'])
+def test_write_bytes(api_mock, file_name):
+    data = b'abcd'
+    path = SaaSBucketPath(f'dir2/{file_name}', saas_file_api=api_mock)
+    path.write(data)
+    data_back = next(iter(path.read(100)))
+    assert data_back == data
+
+
+def test_write_chunks(api_mock):
+    data_chunks = [b'abc', b'def', b'gh']
+    path = SaaSBucketPath('dir2/file23.dat', saas_file_api=api_mock)
+    path.write(data_chunks)
+    data_back = next(iter(path.read(100)))
+    assert data_back == b'abcdefgh'
+
+
+def test_write_generator(api_mock):
+    # A generator can be iterated only once.
+    data_chunks = (chunk for chunk in (b'abc', b'def', b'gh'))
+    path = SaaSBucketPath('dir2/file23.dat', saas_file_api=api_mock)
+    path.write(data_chunks)
+    assert b''.join(path.read(100)) == b'abcdefgh'
+
+
+def test_write_file(api_mock):
+    path = SaaSBucketPath('dir2/file_copy.dat', saas_file_api=api_mock)
+    source_file = api_mock.root / 'dir2/file21.dat'
+    with open(source_file, 'rb') as f:
+        path.write(f)
+    with open(source_file, 'rb') as f:
+        assert next(iter(path.read(100))) == f.read()
+
+
+def test_write_and_create_parent(api_mock):
+    path = SaaSBucketPath('dir2/dir21/file_copy.dat', saas_file_api=api_mock)
+    assert not path.exists()
+    source_file = api_mock.root / 'dir2/file21.dat'
+    with open(source_file, 'rb') as f:
+        path.write(f)
+    assert path.exists()
+    with open(source_file, 'rb') as f:
+        assert next(iter(path.read(100))) == f.read()