aligner_result.h

/*
 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
 *
 * This file is part of Bowtie 2.
 *
 * Bowtie 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Bowtie 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef ALIGNER_RESULT_H_
#define ALIGNER_RESULT_H_

#include <utility>
#include <limits>
#include <vector>
#include "mem_ids.h"
#include "ref_coord.h"
#include "read.h"
#include "filebuf.h"
#include "ds.h"
#include "edit.h"
#include "limit.h"

typedef int64_t TAlScore;

#define VALID_AL_SCORE(x)   ((x).score_ > MIN_I64)
#define VALID_SCORE(x)      ((x) > MIN_I64)
#define INVALIDATE_SCORE(x) ((x) = MIN_I64)

/**
 * A generic score object for an alignment.  Used for accounting during
 * SW and elsewhere.  Encapsulates the score, the number of N positions
 * and the number gaps in the alignment.
 *
 * The scale for 'score' is such that a perfect alignment score is 0
 * and a score with non-zero penalty is less than 0.  So differences
 * between scores work as expected, but interpreting an individual
 * score (larger is better) as a penalty (smaller is better) requires
 * taking the absolute value.
 */
class AlnScore {

public:

	/**
	 * Gapped scores are invalid until proven valid.
	 */
	inline AlnScore() {
		reset();
		invalidate();
		assert(!valid());
	}

	/**
	 * Gapped scores are invalid until proven valid.
	 */
	inline AlnScore(TAlScore score) {
		score_ = score;
	}
	
	/**
	 * Reset the score.
	 */
	void reset() {
		score_ = 0;
	}

	/**
	 * Return an invalid SwScore.
	 */
	inline static AlnScore INVALID() {
		AlnScore s;
		s.invalidate();
		assert(!s.valid());
		return s;
	}
	
	/**
	 * Return true iff this score has a valid value.
	 */
	inline bool valid() const {
		return score_ != MIN_I64;
	}

	/**
	 * Make this score invalid (and therefore <= all other scores).
	 */
	inline void invalidate() {
		score_ = MIN_I64;
		assert(!valid());
	}
	

	/**
	 * Scores are equal iff they're bitwise equal.
	 */
	inline bool operator==(const AlnScore& o) const {
		// Profiling shows cache misses on following line
		return VALID_AL_SCORE(*this) && VALID_AL_SCORE(o) && score_ == o.score_;
	}

	/**
	 * Return true iff the two scores are unequal.
	 */
	inline bool operator!=(const AlnScore& o) const {
		return !(*this == o);
	}

	/**
	 * Return true iff this score is >= score o.
	 */
	inline bool operator>=(const AlnScore& o) const {
		if(!VALID_AL_SCORE(o)) {
			if(!VALID_AL_SCORE(*this)) {
				// both invalid
				return false;
			} else {
				// I'm valid, other is invalid
				return true;
			}
		} else if(!VALID_AL_SCORE(*this)) {
			// I'm invalid, other is valid
			return false;
		}
		return score_ >= o.score_;
	}

	/**
	 * Return true iff this score is < score o.
	 */
	inline bool operator<(const AlnScore& o) const {
		return !operator>=(o);
	}

	/**
	 * Return true iff this score is <= score o.
	 */
	inline bool operator<=(const AlnScore& o) const {
		return operator<(o) || operator==(o);
	}
    
    /**
     * Return true iff this score is < score o.
     */
    inline bool operator>(const AlnScore& o) const {
        return !operator<=(o);
    }

	TAlScore score()   const { return  score_; }

    // Score accumulated so far (penalties are subtracted starting at 0)
	TAlScore score_;
};

static inline ostream& operator<<(ostream& os, const AlnScore& o) {
	os << o.score();
	return os;
}

// Forward declaration
class BitPairReference;


/**
 * Encapsulates an alignment result.  The result comprises:
 *
 * 1. All the nucleotide edits for both mates ('ned').
 * 2. All "edits" where an ambiguous reference char is resolved to an
 *    unambiguous char ('aed').
 * 3. The score for the alginment, including summary information about the
 *    number of gaps and Ns involved.
 * 4. The reference id, strand, and 0-based offset of the leftmost character
 *    involved in the alignment.
 * 5. Information about trimming prior to alignment and whether it was hard or
 *    soft.
 * 6. Information about trimming during alignment and whether it was hard or
 *    soft.  Local-alignment trimming is usually soft when aligning nucleotide
 *    reads.
 *
 * Note that the AlnRes, together with the Read and an AlnSetSumm (*and* the
 * opposite mate's AlnRes and Read in the case of a paired-end alignment),
 * should contain enough information to print an entire alignment record.
 *
 * TRIMMING
 *
 * Accounting for trimming is tricky.  Trimming affects:
 *
 * 1. The values of the trim* and pretrim* fields.
 * 2. The offsets of the Edits in the EList<Edit>s.
 * 3. The read extent, if the trimming is soft.
 * 4. The read extent and the read sequence and length, if trimming is hard.
 *
 * Handling 1. is not too difficult.  2., 3., and 4. are handled in setShape().
 */
class AlnRes {

public:

	AlnRes()
	{
		reset();
	}
    
    AlnRes(const AlnRes& other)
    {
        score_ = other.score_;
        max_score_ = other.max_score_;
        uid_ = other.uid_;
        tid_ = other.tid_;
        taxRank_ = other.taxRank_;
        summedHitLen_ = other.summedHitLen_;
		readPositions_ = other.readPositions_;
		isFw_ = other.isFw_;
    }
    
    AlnRes& operator=(const AlnRes& other) {
        if(this == &other) return *this;
        score_ = other.score_;
        max_score_ = other.max_score_;
        uid_ = other.uid_;
        tid_ = other.tid_;
        taxRank_ = other.taxRank_;
        summedHitLen_ = other.summedHitLen_;
		readPositions_ = other.readPositions_;
		isFw_ = other.isFw_;
        return *this;
    }
    
    ~AlnRes() {}

	/**
	 * Clear all contents.
	 */
	void reset() {
        score_ = 0;
        max_score_ = 0;
        uid_ = "";
        tid_ = 0;
        taxRank_ = RANK_UNKNOWN;
        summedHitLen_ = 0.0;
		readPositions_.clear();
    }
    
	/**
	 * Set alignment score for this alignment.
	 */
	void setScore(TAlScore score) {
		score_ = score;
	}

	TAlScore           score()          const { return score_;     }
    TAlScore           max_score()      const { return max_score_; }
    string             uid()            const { return uid_;   }
    uint64_t           taxID()          const { return tid_;   }
    uint8_t            taxRank()        const { return taxRank_; }
    double             summedHitLen()   const { return summedHitLen_; }

	const EList<pair<uint32_t,uint32_t> >& readPositionsPtr() const { return readPositions_; }

	const pair<uint32_t,uint32_t> readPositions(size_t i) const { return readPositions_[i]; }
	size_t nReadPositions() const { return readPositions_.size(); }

	bool               isFw()           const { return isFw_;      }
    
   /**
	 * Print the sequence for the read that aligned using A, C, G and
	 * T.  This will simply print the read sequence (or its reverse
	 * complement).
	 */
 	void printSeq(
		const Read& rd,
		const BTDnaString* dns,
		BTString& o) const
    {
        assert(!rd.patFw.empty());
        ASSERT_ONLY(size_t written = 0);
        // Print decoded nucleotides
        assert(dns != NULL);
        size_t len = dns->length();
        size_t st = 0;
        size_t en = len;
        for(size_t i = st; i < en; i++) {
            int c = dns->get(i);
            assert_range(0, 3, c);
            o.append("ACGT"[c]);
            ASSERT_ONLY(written++);
        }
    }

	/**
	 * Print the quality string for the read that aligned.  This will
	 * simply print the read qualities (or their reverse).
	 */
 	void printQuals(
		const Read& rd,
		const BTString* dqs,
		BTString& o) const
    {
        assert(dqs != NULL);
        size_t len = dqs->length();
        // Print decoded qualities from upstream to downstream Watson
        for(size_t i = 1; i < len-1; i++) {
            o.append(dqs->get(i));
        }
    }
	

	/**
	 * Initialize new AlnRes.
	 */
	void init(
              TAlScore score,           // alignment score
              TAlScore max_score,
              const string& uniqueID,
              uint64_t taxID,
              uint8_t taxRank,
			  double summedHitLen,
			  const EList<pair<uint32_t, uint32_t> >& readPositions,
			  bool isFw)
    {
        score_  = score;
        max_score_ = max_score;
        uid_ = uniqueID;
        tid_ = taxID;
        taxRank_ = taxRank;
        summedHitLen_ = summedHitLen;
		readPositions_ = readPositions;
		isFw_ = isFw;
    }

protected:
	TAlScore     score_;        //
    TAlScore     max_score_;
    string       uid_;
    uint64_t     tid_;
    uint8_t      taxRank_;
    double       summedHitLen_; // sum of the length of all partial hits, divided by the number of genome matches
	bool         isFw_;
  
	EList<pair<uint32_t, uint32_t> > readPositions_;
};

typedef uint64_t TNumAlns;

/**
 * Encapsulates a concise summary of a set of alignment results for a
 * given pair or mate.  Referring to the fields of this object should
 * provide enough information to print output records for the read.
 */
class AlnSetSumm {

public:

	AlnSetSumm() { reset(); }

	/**
	 * Given an unpaired read (in either rd1 or rd2) or a read pair
	 * (mate 1 in rd1, mate 2 in rd2).
	 */
	explicit AlnSetSumm(
		const Read* rd1,
		const Read* rd2,
		const EList<AlnRes>* rs)
	{
		init(rd1, rd2, rs);
	}

	explicit AlnSetSumm(
		AlnScore best,
		AlnScore secbest)
	{
		init(best, secbest);
	}
	
	/**
	 * Set to uninitialized state.
	 */
	void reset() {
		best_.invalidate();
		secbest_.invalidate();
	}
	
    /**
     * Given all the paired and unpaired results involving mates #1 and #2,
     * calculate best and second-best scores for both mates.  These are
     * used for future MAPQ calculations.
     */
	void init(
		const Read* rd1,
		const Read* rd2,
		const EList<AlnRes>* rs)
    {
        assert(rd1 != NULL || rd2 != NULL);
        assert(rs != NULL);
        AlnScore best, secbest;
        size_t szs = 0;
        best.invalidate();    secbest.invalidate();
        szs = rs->size();
        //assert_gt(szs[j], 0);
        for(size_t i = 0; i < rs->size(); i++) {
            AlnScore sc = (*rs)[i].score();
            if(sc > best) {
                secbest = best;
                best = sc;
                assert(VALID_AL_SCORE(best));
            } else if(sc > secbest) {
                secbest = sc;
                assert(VALID_AL_SCORE(best));
                assert(VALID_AL_SCORE(secbest));
            }
        }
        if(szs > 0) {
            init(best, secbest);
        } else {
            reset();
        }
    }

	
	/**
	 * Initialize given fields.  See constructor for how fields are set.
	 */
	void init(
		AlnScore best,
		AlnScore secbest)
	{
		best_         = best;
		secbest_      = secbest;
		assert(repOk());
	}
	
	/**
	 * Return true iff there is at least a best alignment
	 */
	bool empty() const {
		assert(repOk());
		return !VALID_AL_SCORE(best_);
	}
	
#ifndef NDEBUG
	/**
	 * Check that the summary is internally consistent.
	 */
	bool repOk() const {
		return true;
	}
#endif
	
	AlnScore best()         const { return best_;         }
	AlnScore secbest()      const { return secbest_;      }


protected:
	
	AlnScore best_;         // best full-alignment score found for this read
	AlnScore secbest_;      // second-best
};

#endif