Skip to content

Commit

Permalink
update get lib type and get read orient
Browse files (browse the repository at this point in the history)
  • Loading branch information
balajtimate committed Jan 8, 2024
1 parent 5f8f62d commit 1e992e1
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 33 deletions.
65 changes: 39 additions & 26 deletions htsinfer/get_library_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def _evaluate_mate_relationship(
ids_2: As `ids_1` but for the putative second mate file.
"""
self.results.relationship = StatesTypeRelationship.not_mates
if ids_1 == ids_2:
if ids_1 == ids_2 and len(ids_1) > 0 and len(ids_2) > 0:
if (
self.results.file_1 == StatesType.first_mate and
self.results.file_2 == StatesType.second_mate
Expand All @@ -136,7 +136,7 @@ def _evaluate_mate_relationship(
self.mapping.evaluate()
self._align_mates()
else:
LOGGER.warning(
LOGGER.debug(
"Library source is not determined, "
"mate relationship cannot be inferred by alignment."
)
Expand Down Expand Up @@ -181,17 +181,23 @@ def _align_mates(self):
seq_id2 = read2.query_name
if seq_id2 != previous_seq_id2 \
and previous_seq_id2 is not None:
if self._compare_alignments(mate1[read_counter], reads2):
if read_counter < len(mate1) and self._compare_alignments(
mate1[read_counter], reads2
):
concordant += 1
reads2.clear()
read_counter += 1
if read2.reference_end:
reads2.append(read2)
previous_seq_id2 = seq_id2

if self._compare_alignments(mate1[read_counter], reads2):
if read_counter < len(mate1) and self._compare_alignments(
mate1[read_counter], reads2
):
concordant += 1

LOGGER.debug(f"Number of mapped reads file 1: {len(mate1)}")
LOGGER.debug(f"Number of mapped reads file 2: {read_counter}")
LOGGER.debug(f"Number of concordant reads: {concordant}")
self._update_relationship(concordant, read_counter)

samfile1.close()
Expand Down Expand Up @@ -325,12 +331,14 @@ def evaluate(self) -> None:

if self.seq_id_format is None:
self.result = StatesType.not_available
raise MetadataWarning(
LOGGER.warning(
"Could not determine sequence identifier format."
)
LOGGER.debug(
f"Sequence identifier format: {self.seq_id_format.name}"
)
else:
LOGGER.debug(
"Sequence identifier format: "
f"{self.seq_id_format.name}"
)

# Ensure that remaining records are compatible with sequence
# identifier format and library type determined from first
Expand All @@ -339,28 +347,33 @@ def evaluate(self) -> None:
"Checking consistency of remaining reads with initially "
"determined identifier format and library type..."
)
for record in seq_iter:
records += 1
try:
self._get_read_type(
seq_id=record[0],
regex=self.seq_id_format.value,
)
except (
InconsistentFastqIdentifiers,
UnknownFastqIdentifier,
) as exc:
self.result = StatesType.not_available
raise MetadataWarning(
f"{type(exc).__name__}: {str(exc)}"
) from exc
if self.seq_id_format is not None:
for record in seq_iter:
records += 1
try:
self._get_read_type(
seq_id=record[0],
regex=self.seq_id_format.value,
)
except (
InconsistentFastqIdentifiers,
UnknownFastqIdentifier,
) as exc:
self.result = StatesType.not_available
raise MetadataWarning(
f"{type(exc).__name__}: {str(exc)}"
) from exc
LOGGER.debug(f"Total records processed: {records}")
else:
LOGGER.debug(
"Could not determine sequence identifier format. "
"Skipping consistency check for the remaining reads."
)

except (OSError, ValueError) as exc:
self.result = StatesType.not_available
raise FileProblem(f"{type(exc).__name__}: {str(exc)}") from exc

LOGGER.debug(f"Total records processed: {records}")

def _get_read_type(
self,
seq_id: str,
Expand Down
2 changes: 1 addition & 1 deletion htsinfer/get_read_orientation.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def evaluate(self) -> ResultsOrientation:
or self.library_source.file_2.short_name is not None):
self.mapping.evaluate()
else:
LOGGER.warning(
LOGGER.debug(
"Library source is not determined, "
"read orientation cannot be inferred by alignment."
)
Expand Down
6 changes: 0 additions & 6 deletions tests/test_get_library_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,12 +214,6 @@ def test_evaluate_single(self):
test_instance.evaluate()
assert test_instance.result == StatesType.single

def test_evaluate_unknown_seq_id(self):
    """Expect ``MetadataWarning`` when evaluating a file whose sequence
    identifiers are of an unknown format.
    """
    fastq_type = GetFastqType(path=FILE_UNKNOWN_SEQ_ID)
    # The evaluation itself must raise; constructing the instance must not.
    with pytest.raises(MetadataWarning):
        fastq_type.evaluate()

def test_evaluate_inconsistent_identifiers_single_mate(self):
"""Raise ``MetadataWarning`` by passing a file with inconsistent
identifiers, suggesting a single-end library first, then a paired-end
Expand Down

0 comments on commit 1e992e1

Please sign in to comment.