Skip to content

Commit

Permalink
amid testing metadata overhaul; pepkit#281
Browse files Browse the repository at this point in the history
  • Loading branch information
vreuter committed Mar 28, 2019
1 parent db00139 commit 1a3d763
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 84 deletions.
100 changes: 64 additions & 36 deletions peppy/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,13 @@
from .const import \
ASSAY_KEY, DATA_SOURCE_COLNAME, DEFAULT_COMPUTE_RESOURCES_NAME, \
NAME_TABLE_ATTR, DERIVATIONS_DECLARATION, IMPLICATIONS_DECLARATION, \
METADATA_KEY, SAMPLE_ANNOTATIONS_KEY, SAMPLE_SUBANNOTATIONS_KEY, \
METADATA_KEY, SAMPLE_SUBANNOTATIONS_KEY, \
SAMPLE_NAME_COLNAME
from .exceptions import PeppyError
from .sample import merge_sample, Sample
from .utils import \
add_project_sample_constants, copy, fetch_samples, is_url, non_null_value, \
warn_derived_cols, warn_implied_cols
add_project_sample_constants, copy, fetch_samples, infer_delimiter, is_url, \
non_null_value, warn_derived_cols, warn_implied_cols


MAX_PROJECT_SAMPLES_REPR = 12
Expand Down Expand Up @@ -218,12 +218,12 @@ def __init__(self, config_file, subproject=None, dry=False,
self_table_attr = "_" + NAME_TABLE_ATTR
if path_anns_file:
_LOGGER.debug("Reading sample annotations sheet: '%s'", path_anns_file)
setattr(self, self_table_attr, self.parse_sample_sheet(path_anns_file))
self[self_table_attr] = self.parse_sample_sheet(path_anns_file)
else:
_LOGGER.warning("No sample annotations sheet in config")
setattr(self, self_table_attr, None)
self[self_table_attr] = None

setattr(self, SAMPLE_SUBANNOTATIONS_KEY, None)
self[SAMPLE_SUBANNOTATIONS_KEY] = None

# Basic sample maker will handle name uniqueness check.
if defer_sample_construction or self._sample_table is None:
Expand Down Expand Up @@ -265,15 +265,6 @@ def __setitem__(self, key, value):
value = _Metadata(value)
super(Project, self).__setitem__(key, value)

@property
def subproject(self):
"""
Return currently active subproject or None if none was activated
:return str: currently active subproject
"""
return self._subproject

@property
def constants(self):
"""
Expand Down Expand Up @@ -417,11 +408,20 @@ def sample_subannotation(self):

@property
def sample_table(self):
"""
Return (possibly first parsing/building) the table of samples.
:return pandas.core.frame.DataFrame: table of samples' metadata
"""
from copy import copy as cp
if self._sample_table is None:
self._sample_table = \
self.parse_sample_sheet(getattr(self.metadata, NAME_TABLE_ATTR))
return cp(self._sample_table)
key = NAME_TABLE_ATTR
attr = "_" + key
if self.get(attr) is None:
if key not in self[METADATA_KEY]:
return None
sheetfile = self[METADATA_KEY][NAME_TABLE_ATTR]
self[attr] = self.parse_sample_sheet(sheetfile)
return cp(self[attr])

@property
def sheet(self):
Expand All @@ -434,6 +434,33 @@ def sheet(self):
format(NAME_TABLE_ATTR), DeprecationWarning)
return getattr(self, NAME_TABLE_ATTR)

@property
def subproject(self):
    """
    Name of the subproject that is currently active, if any.

    :return str: name of currently active subproject; null if no
        subproject was activated
    """
    return self._subproject

@property
def subsample_table(self):
    """
    Provide the table of subsamples, reading it from disk if not yet stored.

    The parsed table is stored on this object under the subannotations key
    prefixed with an underscore; each access hands back a copy of it.

    :return pandas.core.frame.DataFrame: table of subsamples' metadata;
        null if nothing is stored yet and the metadata section declares
        no subannotations file
    """
    from copy import copy as cp
    subann_key = SAMPLE_SUBANNOTATIONS_KEY
    store_name = "_" + subann_key
    if self.get(store_name) is None:
        if subann_key not in self[METADATA_KEY]:
            # Nothing stored and nothing declared from which to parse.
            return None
        table_path = self[METADATA_KEY][subann_key]
        # All values are read as text, and empty cells stay empty strings
        # rather than becoming NaN (keep_default_na=False).
        self[store_name] = pd.read_csv(
            table_path,
            sep=infer_delimiter(table_path),
            dtype=str,
            index_col=False,
            engine="python",
            keep_default_na=False,
        )
    return cp(self[store_name])

@property
def templates_folder(self):
"""
Expand Down Expand Up @@ -610,7 +637,7 @@ def finalize_pipelines_directory(self, pipe_path=""):
_LOGGER.debug("Got {} as pipelines path(s) ({})".
format(pipe_path, type(pipe_path)))
pipe_path = []
setattr(self.metadata, NEW_PIPES_KEY, pipe_path)
self[METADATA_KEY][NEW_PIPES_KEY] = pipe_path

def get_arg_string(self, pipeline_name):
"""
Expand Down Expand Up @@ -700,7 +727,7 @@ def _set_basic_samples(self):
_LOGGER.info("Reading subannotations: %s", sub_ann)
subann_table = pd.read_csv(
sub_ann, sep=None, engine="python", dtype=str)
setattr(self, SAMPLE_SUBANNOTATIONS_KEY, subann_table)
self[SAMPLE_SUBANNOTATIONS_KEY] = subann_table
_LOGGER.debug("Subannotations shape: {}".format(subann_table.shape))
else:
_LOGGER.debug("Alleged path to sample subannotations data is "
Expand Down Expand Up @@ -833,8 +860,7 @@ def parse_config_file(self, subproject=None):
for var in self.required_metadata:
if var not in self.metadata:
raise ValueError("Missing required metadata item: '{}'".format(var))
setattr(self.metadata, var,
os.path.expandvars(getattr(self.metadata, var)))
self[METADATA_KEY][var] = os.path.expandvars(self.metadata.get(var))

_LOGGER.debug("{} metadata: {}".format(self.__class__.__name__,
self.metadata))
Expand All @@ -850,14 +876,12 @@ def parse_config_file(self, subproject=None):
}

for key, value in config_vars.items():
if hasattr(self.metadata, key):
if not os.path.isabs(getattr(self.metadata, key)):
v = os.path.join(
self.output_dir, getattr(self.metadata, key))
setattr(self.metadata, key, v)
if key in self.metadata:
if not os.path.isabs(self.metadata[key]):
self.metadata[key] = \
os.path.join(self.output_dir, self.metadata[key])
else:
outpath = os.path.join(self.output_dir, value)
setattr(self.metadata, key, outpath)
self.metadata[key] = os.path.join(self.output_dir, value)

# Variables which are relative to the config file
# All variables in these sections should be relative to project config.
Expand Down Expand Up @@ -950,8 +974,7 @@ def parse_sample_sheet(sample_file, dtype=str):
# See https://github.com/pepkit/peppy/issues/159 for the original issue
# and https://github.com/pepkit/peppy/pull/160 for the pull request
# that resolved it.
ext = os.path.splitext(sample_file)[1][1:].lower()
sep = {"txt": "\t", "tsv": "\t", "csv": ","}.get(ext)
sep = infer_delimiter(sample_file)
try:
df = pd.read_csv(sample_file, sep=sep, dtype=dtype, index_col=False,
engine="python", keep_default_na=False)
Expand Down Expand Up @@ -979,10 +1002,15 @@ def __init__(self, missing_section, path_config_file=None):
class MissingSampleSheetError(PeppyError):
""" Represent case in which sample sheet is specified but nonexistent. """
def __init__(self, sheetfile):
super(Project.MissingSampleSheetError, self).__init__(
"Missing sample annotation sheet ({}); a project need not use "
"a sample sheet, but if it does the file must exist."
.format(sheetfile))
parent_folder = os.path.dirname(sheetfile)
contents = os.listdir(parent_folder) \
if os.path.isdir(parent_folder) else []
msg = "Missing sample annotation sheet ({}); a project need not use " \
"a sample sheet, but if it does the file must exist.".\
format(sheetfile)
if contents:
msg += " Contents of parent folder: {}".format(", ".join(contents))
super(Project.MissingSampleSheetError, self).__init__(msg)

@staticmethod
def _omit_from_repr(k, cls):
Expand Down
12 changes: 11 additions & 1 deletion peppy/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,17 @@ def import_from_source(module_filepath):
return mod


def infer_delimiter(filepath):
    """
    From extension infer delimiter used in a separated values file.

    :param str filepath: path to file about which to make inference
    :return str | NoneType: delimiter if inference succeeded; else null
    """
    # Drop the leading dot and ignore case, so that ".CSV" matches "csv".
    ext = os.path.splitext(filepath)[1][1:].lower()
    # An unrecognized or absent extension yields None rather than an error.
    return {"txt": "\t", "tsv": "\t", "csv": ","}.get(ext)


def is_null_like(x):
"""
Determine whether an object is effectively null.
Expand Down Expand Up @@ -414,7 +425,6 @@ def _warn_cols_to_attrs(prefix):
"as {pfx}_attributes".format(pfx=prefix), DeprecationWarning)



class CommandChecker(object):
"""
Validate PATH availability of executables referenced by a config file.
Expand Down
30 changes: 15 additions & 15 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from peppy import \
setup_peppy_logger, Project, SAMPLE_NAME_COLNAME
from peppy.const import METADATA_KEY, NAME_TABLE_ATTR
from peppy.const import METADATA_KEY, NAME_TABLE_ATTR, SAMPLE_SUBANNOTATIONS_KEY


_LOGGER = logging.getLogger("peppy")
Expand All @@ -34,7 +34,7 @@
{tab_key}: samples.csv
output_dir: test
pipeline_interfaces: pipelines
subsample_table: merge.csv
{subtab_key}: merge.csv
derived_attributes: [{{derived_attribute_names}}]
Expand All @@ -50,7 +50,8 @@
phenome: hg72
b:
genome: hg38
""".format(md_key=METADATA_KEY, tab_key=NAME_TABLE_ATTR).splitlines(True)
""".format(md_key=METADATA_KEY, tab_key=NAME_TABLE_ATTR,
subtab_key=SAMPLE_SUBANNOTATIONS_KEY).splitlines(True)
# Will populate the corresponding string format entry in project config lines.
DERIVED_COLNAMES = ["file", "file2", "dcol1", "dcol2",
"nonmerged_col", "nonmerged_col", "data_source"]
Expand Down Expand Up @@ -199,11 +200,14 @@ def conf_logs(request):
_LOGGER = logging.getLogger("peppy.{}".format(__name__))



@pytest.fixture(scope="function")
def sample_annotation_lines():
return SAMPLE_ANNOTATION_LINES
"""
Return fixed collection of lines for sample annotations sheet.
:return Iterable[str]: collection of lines for sample annotations sheet
"""
return SAMPLE_ANNOTATION_LINES


@pytest.fixture(scope="function")
Expand Down Expand Up @@ -234,7 +238,6 @@ def path_empty_project(request, tmpdir):
return conf_path



def interactive(
prj_lines=PROJECT_CONFIG_LINES,
iface_lines=PIPELINE_INTERFACE_CONFIG_LINES,
Expand Down Expand Up @@ -287,7 +290,6 @@ def interactive(
return prj



class _DataSourceFormatMapping(dict):
"""
Partially format text with braces. This helps since bracing is the
Expand All @@ -298,7 +300,6 @@ def __missing__(self, derived_attribute):
return "{" + derived_attribute + "}"



def _write_temp(lines, dirpath, fname):
"""
Note that delete flag is a required argument since it's potentially
Expand Down Expand Up @@ -337,14 +338,12 @@ def _write_temp(lines, dirpath, fname):
return filepath



@pytest.fixture(scope="function")
def project_config_lines():
""" Provide safer iteration over the lines for Project config file. """
return PROJECT_CONFIG_LINES



@pytest.fixture(scope="function")
def path_project_conf(tmpdir, project_config_lines):
"""
Expand All @@ -359,7 +358,6 @@ def path_project_conf(tmpdir, project_config_lines):
project_config_lines, tmpdir.strpath, P_CONFIG_FILENAME)



@pytest.fixture(scope="function")
def proj_conf_data(path_project_conf):
"""
Expand All @@ -373,7 +371,6 @@ def proj_conf_data(path_project_conf):
return yaml.safe_load(conf_file)



@pytest.fixture(scope="function")
def path_sample_anns(tmpdir, sample_annotation_lines):
"""
Expand All @@ -385,15 +382,18 @@ def path_sample_anns(tmpdir, sample_annotation_lines):
:return str: path to the sample annotations file that was written
"""
filepath = _write_temp(
sample_annotation_lines, tmpdir.strpath, ANNOTATIONS_FILENAME)
sample_annotation_lines, tmpdir.strpath, ANNOTATIONS_FILENAME)
return filepath



@pytest.fixture(scope="function")
def p_conf_fname():
return P_CONFIG_FILENAME
"""
Return fixed name of project config file.
:return str: name of project config file
"""
return P_CONFIG_FILENAME


@pytest.fixture(scope="class")
Expand Down
Loading

0 comments on commit 1a3d763

Please sign in to comment.