
Commit

fix?
Signed-off-by: wenfeiy-db <[email protected]>
wenfeiy-db committed Oct 26, 2023
1 parent f0284d6 commit 8a5f34a
Showing 12 changed files with 60 additions and 47 deletions.
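
Nearly every hunk below makes the same two-line change in a flavor's save path: the import and the call switch from `get_total_size` to `get_total_file_size`, and (in most flavors) the size lookup stays wrapped in a try/except so a failure only logs rather than aborting the save. Condensed into a standalone sketch — the helper name and the `Model`/`MLMODEL_FILE_NAME` imports here are illustrative, not part of the commit:

import logging
import os

from mlflow.models import Model
from mlflow.models.model import MLMODEL_FILE_NAME
from mlflow.utils.file_utils import get_total_file_size

_logger = logging.getLogger(__name__)


def _record_model_size(mlflow_model: Model, path: str) -> None:
    # Best-effort size computation: log and continue if anything goes wrong,
    # then write the MLmodel file as each flavor already does.
    try:
        mlflow_model.model_size_bytes = get_total_file_size(str(path))
    except Exception as e:
        _logger.info(f"Fail to get the total size of {path} because of error :{e}")
    mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))
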
4 changes: 2 additions & 2 deletions mlflow/langchain/__init__.py
@@ -48,7 +48,7 @@
_PythonEnv,
_validate_env_arguments,
)
from mlflow.utils.file_utils import write_to, get_total_size
from mlflow.utils.file_utils import write_to, get_total_file_size
from mlflow.utils.model_utils import (
_add_code_from_conf_to_system_path,
_get_flavor_configuration,
@@ -296,7 +296,7 @@ def load_retriever(persist_directory):
**flavor_conf,
)
try:
mlflow_model.model_size_bytes = get_total_size(str(path))
mlflow_model.model_size_bytes = get_total_file_size(str(path))
except Exception as e:
logger.info(f"Fail to get the total size of {str(path)} because of error :{e}")
mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))
4 changes: 2 additions & 2 deletions mlflow/lightgbm/__init__.py
@@ -69,7 +69,7 @@
_PythonEnv,
_validate_env_arguments,
)
from mlflow.utils.file_utils import write_to, get_total_size
from mlflow.utils.file_utils import write_to, get_total_file_size
from mlflow.utils.mlflow_tags import (
MLFLOW_DATASET_CONTEXT,
)
@@ -214,7 +214,7 @@ def save_model(
code=code_dir_subpath,
)
try:
mlflow_model.model_size_bytes = get_total_size(str(path))
mlflow_model.model_size_bytes = get_total_file_size(str(path))
except Exception as e:
_logger.info(f"Fail to get the total size of {str(path)} because of error :{e}")
mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))
4 changes: 2 additions & 2 deletions mlflow/pyfunc/__init__.py
@@ -288,7 +288,7 @@
_copy_file_or_tree,
get_or_create_nfs_tmp_dir,
get_or_create_tmp_dir,
get_total_size,
get_total_file_size,
write_to,
)
from mlflow.utils.model_utils import (
@@ -2181,7 +2181,7 @@ def _save_model_with_loader_module_and_data_path(
model_config=model_config,
)
try:
mlflow_model.model_size_bytes = get_total_size(str(path))
mlflow_model.model_size_bytes = get_total_file_size(str(path))
except Exception as e:
_logger.info(f"Fail to get the total size of {str(path)} because of error :{e}")
mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))
4 changes: 2 additions & 2 deletions mlflow/pyfunc/model.py
@@ -32,7 +32,7 @@
_process_pip_requirements,
_PythonEnv,
)
from mlflow.utils.file_utils import TempDir, _copy_file_or_tree, get_total_size, write_to
from mlflow.utils.file_utils import TempDir, _copy_file_or_tree, get_total_file_size, write_to
from mlflow.utils.model_utils import _get_flavor_configuration
from mlflow.utils.requirements_utils import _get_pinned_requirement

@@ -317,7 +317,7 @@ def _save_model_with_class_artifacts_params(
model_config=model_config,
**custom_model_config_kwargs,
)
mlflow_model.model_size_bytes = get_total_size(str(path))
mlflow_model.model_size_bytes = get_total_file_size(str(path))
mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))

if conda_env is None:
4 changes: 2 additions & 2 deletions mlflow/pytorch/__init__.py
@@ -50,7 +50,7 @@
)
from mlflow.utils.file_utils import (
TempDir,
write_to, get_total_size,
write_to, get_total_file_size,
)
from mlflow.utils.model_utils import (
_add_code_from_conf_to_system_path,
@@ -549,7 +549,7 @@ class LinearNNModel(torch.nn.Module):
python_env=_PYTHON_ENV_FILE_NAME,
)
try:
mlflow_model.model_size_bytes = get_total_size(str(path))
mlflow_model.model_size_bytes = get_total_file_size(str(path))
except Exception as e:
_logger.info(f"Fail to get the total size of {str(path)} because of error :{e}")
mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))
4 changes: 2 additions & 2 deletions mlflow/sentence_transformers/__init__.py
@@ -33,7 +33,7 @@
_PythonEnv,
_validate_env_arguments,
)
from mlflow.utils.file_utils import write_to, get_total_size
from mlflow.utils.file_utils import write_to, get_total_file_size
from mlflow.utils.model_utils import (
_add_code_from_conf_to_system_path,
_download_artifact_from_uri,
@@ -170,7 +170,7 @@ class that describes the model's inputs and outputs. If not specified but an
code=code_dir_subpath,
)
try:
mlflow_model.model_size_bytes = get_total_size(str(path))
mlflow_model.model_size_bytes = get_total_file_size(str(path))
except Exception as e:
_logger.info(f"Fail to get the total size of {str(path)} because of error :{e}")
mlflow_model.save(str(path.joinpath(MLMODEL_FILE_NAME)))
4 changes: 2 additions & 2 deletions mlflow/sklearn/__init__.py
@@ -65,7 +65,7 @@
_PythonEnv,
_validate_env_arguments,
)
from mlflow.utils.file_utils import write_to, get_total_size
from mlflow.utils.file_utils import write_to, get_total_file_size
from mlflow.utils.mlflow_tags import (
MLFLOW_AUTOLOGGING,
MLFLOW_DATASET_CONTEXT,
@@ -290,7 +290,7 @@ def save_model(
code=code_path_subdir,
)
try:
mlflow_model.model_size_bytes = get_total_size(str(path))
mlflow_model.model_size_bytes = get_total_file_size(str(path))
except Exception as e:
_logger.info(f"Fail to get the total size of {str(path)} because of error :{e}")
mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))
4 changes: 2 additions & 2 deletions mlflow/tensorflow/__init__.py
@@ -64,7 +64,7 @@
_PythonEnv,
_validate_env_arguments,
)
from mlflow.utils.file_utils import write_to, get_total_size
from mlflow.utils.file_utils import write_to, get_total_file_size
from mlflow.utils.model_utils import (
_add_code_from_conf_to_system_path,
_get_flavor_configuration,
@@ -457,7 +457,7 @@ def save_model(

# add model file size to mlflow_model
try:
mlflow_model.model_size_bytes = get_total_size(str(path))
mlflow_model.model_size_bytes = get_total_file_size(str(path))
except Exception as e:
_logger.info(f"Fail to get the total size of {str(path)} because of error :{e}")

4 changes: 2 additions & 2 deletions mlflow/transformers/__init__.py
@@ -65,7 +65,7 @@
_PythonEnv,
_validate_env_arguments,
)
from mlflow.utils.file_utils import get_total_size, write_to
from mlflow.utils.file_utils import get_total_file_size, write_to
from mlflow.utils.model_utils import (
_add_code_from_conf_to_system_path,
_download_artifact_from_uri,
@@ -530,7 +530,7 @@ def save_model(
**flavor_conf,
)
try:
mlflow_model.model_size_bytes = get_total_size(str(path))
mlflow_model.model_size_bytes = get_total_file_size(str(path))
except Exception as e:
_logger.info(f"Fail to get the total size of {str(path)} because of error :{e}")
mlflow_model.save(str(path.joinpath(MLMODEL_FILE_NAME)))
11 changes: 1 addition & 10 deletions mlflow/utils/file_utils.py
@@ -971,7 +971,7 @@ def chdir(path: str) -> None:
os.chdir(cwd)


def get_total_size(path: str) -> int:
def get_total_file_size(path: str) -> int:
"""
Return the size of all files under given path, including files in subdirectories.
@@ -992,12 +992,3 @@ def get_total_size(path: str) -> int:
full_paths = [os.path.join(cur_path, file) for file in files]
total_size += sum([os.path.getsize(file) for file in full_paths])
return total_size


# def get_total_size_safe(path: str, logger) -> int:
# try:
# return get_total_size(path)
# except Exception:
#
# return None
#
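
For reference, a minimal usage sketch of the renamed helper, written against the behavior shown in this diff and exercised by the tests below; the temporary paths and byte counts are illustrative:

import os
import tempfile

from mlflow.exceptions import MlflowException
from mlflow.utils.file_utils import get_total_file_size

with tempfile.TemporaryDirectory() as root:
    os.mkdir(os.path.join(root, "subdir"))
    with open(os.path.join(root, "a.txt"), "wb") as fp:
        fp.write(b"hello")  # 5 bytes
    with open(os.path.join(root, "subdir", "b.txt"), "wb") as fp:
        fp.write(b"world!")  # 6 bytes

    # Files in subdirectories are included in the total.
    assert get_total_file_size(root) == 11

    # A file path (rather than a directory) raises MlflowException,
    # as does a path that does not exist.
    try:
        get_total_file_size(os.path.join(root, "a.txt"))
    except MlflowException:
        pass
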
4 changes: 2 additions & 2 deletions mlflow/xgboost/__init__.py
@@ -70,7 +70,7 @@
_PythonEnv,
_validate_env_arguments,
)
from mlflow.utils.file_utils import write_to, get_total_size
from mlflow.utils.file_utils import write_to, get_total_file_size
from mlflow.utils.mlflow_tags import (
MLFLOW_DATASET_CONTEXT,
)
@@ -185,7 +185,7 @@ def save_model(
code=code_dir_subpath,
)
try:
mlflow_model.model_size_bytes = get_total_size(str(path))
mlflow_model.model_size_bytes = get_total_file_size(str(path))
except Exception as e:
_logger.info(f"Fail to get the total size of {str(path)} because of error :{e}")
mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))
56 changes: 39 additions & 17 deletions tests/utils/test_file_utils.py
@@ -25,7 +25,7 @@
_copy_file_or_tree,
_handle_readonly_on_windows,
get_parent_dir,
get_total_size,
get_total_file_size,
local_file_uri_to_path,
read_parquet_as_pandas_df,
write_pandas_df_as_parquet,
@@ -378,25 +378,25 @@ def test_get_total_size_basic(tmp_path):
subdir = os.path.join(root, "subdir")
os.mkdir(subdir)
files = {
"file1.txt": "hello world", # 11 bytes
"file2.txt": "This is mlflow testing.", # 23 bytes
"file1.txt": b"hello world", # 11 bytes
"file2.txt": b"This is mlflow testing.", # 23 bytes
}
for name, content in files.items():
with open(os.path.join(root, name), "w") as fp:
with open(os.path.join(root, name), "wb") as fp:
fp.write(content)
with open(os.path.join(subdir, "file3.txt"), "w") as fp:
fp.write("One file under subdir.") # 22 bytes
with open(os.path.join(subdir, "file3.txt"), "wb") as fp:
fp.write(b"One file under subdir.") # 22 bytes

assert get_total_size(root) == 56
assert get_total_size(subdir) == 22
assert get_total_file_size(root) == 56
assert get_total_file_size(subdir) == 22

path_not_exists = os.path.join(root, "does_not_exist")
with pytest.raises(MlflowException, match=f"The given {path_not_exists} does not exist.",):
get_total_size(path_not_exists)
with pytest.raises(MlflowException, match=f"does not exist.",):
get_total_file_size(path_not_exists)

path_file = os.path.join(root, "file1.txt")
with pytest.raises(MlflowException, match=f"The given {path_file} is not a directory.",):
get_total_size(path_file)
with pytest.raises(MlflowException, match=f"is not a directory.",):
get_total_file_size(path_file)


@pytest.fixture
@@ -412,10 +412,30 @@ def small_qa_pipeline():


def test_get_total_size_transformers(small_qa_pipeline, tmp_path):
small_qa_pipeline.model.save_pretrained(save_directory=tmp_path.joinpath("model"))
small_qa_pipeline.tokenizer.save_pretrained(tmp_path.joinpath("components").joinpath("tokenizer"))
model_dir = tmp_path.joinpath("model")
small_qa_pipeline.model.save_pretrained(save_directory=model_dir)
tokenizer_dir = tmp_path.joinpath("components").joinpath("tokenizer")
small_qa_pipeline.tokenizer.save_pretrained(tokenizer_dir)

assert get_total_size(str(tmp_path)) == 99646933
expected_size = 0
for folder in [model_dir, tokenizer_dir]:
folder = str(folder)
expected_size += _calcualte_expected_size(folder)

assert get_total_file_size(str(tmp_path)) == expected_size


def _calcualte_expected_size(folder):
# this helper function does not consider subdirectories
expected_size = 0
for path in os.listdir(folder):
path = os.path.join(folder, path)
print(path, os.path.isfile(path))
if not os.path.isfile(path):
continue
with open(path, "rb") as fp:
expected_size += len(fp.read())
return expected_size


def test_get_total_size_sklearn(tmp_path):
Expand All @@ -434,10 +454,12 @@ def test_get_total_size_sklearn(tmp_path):
os.mkdir(pickle_dir)
with open(os.path.join(pickle_dir, "model.pkl"), "wb") as out:
pickle.dump(linear_lr, out, protocol=pickle.DEFAULT_PROTOCOL)
assert get_total_size(pickle_dir) == 906
expected_size = _calcualte_expected_size(pickle_dir)
assert get_total_file_size(pickle_dir) == expected_size

cloudpickle_dir = os.path.join(path, "cloudpickle_model")
os.mkdir(cloudpickle_dir)
with open(os.path.join(cloudpickle_dir, "model.pkl"), "wb") as out:
cloudpickle.dump(linear_lr, out, protocol=pickle.DEFAULT_PROTOCOL)
assert get_total_size(cloudpickle_dir) == 906
expected_size = _calcualte_expected_size(cloudpickle_dir)
assert get_total_file_size(cloudpickle_dir) == expected_size
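
Per its comment, the _calcualte_expected_size helper added in this commit sums only the files at the top level of each folder. A recursive variant — hypothetical, not part of this commit — would mirror what get_total_file_size itself computes:

import os

def _expected_size_recursive(folder: str) -> int:
    # Hypothetical counterpart to _calcualte_expected_size that also descends
    # into subdirectories, like get_total_file_size does.
    total = 0
    for cur_path, _dirs, files in os.walk(folder):
        total += sum(os.path.getsize(os.path.join(cur_path, f)) for f in files)
    return total
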
