Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gallery example for profile alignments #692

Draft
wants to merge 9 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 186 additions & 0 deletions doc/examples/scripts/sequence/profile/scop_profile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
"""
Note that in order to get a representative sequence profile, it is crucial to have a
high number of sequences the target sequence can be aligned to.
"""

import matplotlib.pyplot as plt
import numpy as np
import requests
import biotite.database.uniprot as uniprot
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.sequence.io.fasta as fasta
from biotite.database import RequestError

TARGET_UNIPROT_ID = "P04746"
TARGET_SCOP_FAMILY_ID = "4003138"
QUERY_UNIPROT_ID = "P00722"

# TARGET_UNIPROT_ID = "P09467"
# TARGET_SCOP_FAMILY_ID = "4002766"
# QUERY_UNIPROT_ID = "P73922"


target_sequence = fasta.get_sequence(
fasta.FastaFile.read(uniprot.fetch(TARGET_UNIPROT_ID, "fasta", "."))
)
query_sequence = fasta.get_sequence(
fasta.FastaFile.read(uniprot.fetch(QUERY_UNIPROT_ID, "fasta", "."))
)

aa_matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment = align.align_optimal(
query_sequence,
target_sequence,
aa_matrix,
gap_penalty=(-10, -1),
local=False,
max_number=1,
)[0]

fig, ax = plt.subplots(figsize=(8.0, 1.0))
graphics.plot_alignment_similarity_based(
ax, alignment, matrix=aa_matrix, labels=[TARGET_UNIPROT_ID, QUERY_UNIPROT_ID]
)


########################################################################################
# https://www.ebi.ac.uk/pdbe/scop/download


def get_sequence_family(scop_id):
scop_domains = requests.get(
f"https://www.ebi.ac.uk/pdbe/scop/api/domains/{scop_id}"
).json()
sequences = {}
for scop_domain in scop_domains["domains"]:
uniprot_id = scop_domain["uniprot_id"]
try:
sequence = fasta.get_sequence(
fasta.FastaFile.read(uniprot.fetch(uniprot_id, "fasta", "."))
)
except RequestError:
# The UniProt ID is not in the database -> skip this domain
continue
for start, end in scop_domain["protein_regions"]:
region = sequence[start : end + 1]
sequences[uniprot_id] = region
# Most domains have only one region within the sequence
# For simplicity we only consider the first region
break
return sequences


def merge_pairwise_alignments(alignments):
traces = []
sequences = []
for alignment in alignments:
trace = alignment.trace
# Remove gaps in first sequence
trace = trace[trace[:, 0] != -1]
traces.append(trace[:, 1])
sequences.append(alignment.sequences[1])
return align.Alignment(sequences, np.stack(traces, axis=-1))


sequences = get_sequence_family(TARGET_SCOP_FAMILY_ID)
# This is not a 'true' MSA, in the sense that it only merges the pairwise alignments
# with respect to the target sequence
pseudo_msa = merge_pairwise_alignments(
[
align.align_optimal(
target_sequence,
sequence,
aa_matrix,
gap_penalty=(-10, -1),
max_number=1,
)[0]
for sequence in sequences.values()
]
)

labels = np.array(list(sequences.keys()))

fig, ax = plt.subplots(figsize=(8.0, 24.0))
graphics.plot_alignment_type_based(ax, pseudo_msa, labels=labels, show_numbers=True)

########################################################################################
# :footcite:`Robinson1991`

# Must have the same order as `ProteinSequence.alphabet`
AA_FREQUENCY = {
"A": 35155,
"C": 8669,
"D": 24161,
"E": 28354,
"F": 17367,
"G": 33229,
"H": 9906,
"I": 23161,
"K": 25872,
"L": 40625,
"M": 10101,
"N": 20212,
"P": 23435,
"Q": 19208,
"R": 23105,
"S": 32070,
"T": 26311,
"V": 29012,
"W": 5990,
"Y": 14488,
# Set ambiguous amino acid count to 1 to avoid division by zero in log odds matrix
"B": 1,
"Z": 1,
"X": 1,
"*": 1,
}


profile = seq.SequenceProfile.from_alignment(pseudo_msa)
background_frequencies = np.array(list(AA_FREQUENCY.values()))
# Normalize background frequencies
background_frequencies = background_frequencies / background_frequencies.sum()

score_matrix = profile.log_odds_matrix(background_frequencies, pseudocount=1).T
score_matrix *= 10
score_matrix = score_matrix.astype(np.int32)

profile_seq = seq.PositionalSequence(profile.to_consensus())
positional_matrix = align.SubstitutionMatrix(
seq.ProteinSequence.alphabet, profile_seq.alphabet, score_matrix=score_matrix
)
profile_alignment = align.align_optimal(
query_sequence,
profile_seq,
positional_matrix,
gap_penalty=(-10, -1),
local=False,
terminal_penalty=False,
max_number=1,
)[0]


########################################################################################
# Map the profile alignment to the original target sequence.
# As the pseudo-MSA was designed to have the same length as the target sequence,
# the sequence needs to be exchanged only, the trace remains the same
refined_alignment = align.Alignment(
[query_sequence, target_sequence], profile_alignment.trace
)

fig, ax = plt.subplots(figsize=(8.0, 4.0))
graphics.plot_alignment_similarity_based(
ax,
refined_alignment,
matrix=aa_matrix,
labels=[TARGET_UNIPROT_ID, QUERY_UNIPROT_ID],
)


plt.show()

########################################################################################
#
# .. footbibliography::
4 changes: 2 additions & 2 deletions doc/tutorial/sequence/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,5 @@ For an unambiguous DNA sequence, the alphabet comprises the four nucleobases.
align_optimal
align_heuristic
align_multiple
annotations
profiles
profiles
annotations
159 changes: 159 additions & 0 deletions doc/tutorial/sequence/profiles.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
.. include:: /tutorial/preamble.rst

Profiles and position-specific matrices
=======================================
Often sequences are not viewed in isolation:
For example, if you investigate a protein family, you do not handle a single sequence,
but an arbitrarily large collection of highly similar sequences.
At some point handling those sequences as a mere collection of individual sequences
becomes impractical for answering common questions, like which are the prevalent
amino acids at a certain positions of the sequences.

Sequence profiles
-----------------

.. currentmodule:: biotite.sequence

This is where *sequence profiles* come into play:
The condense the a collection of aligned sequences into a matrix that tracks the
frequency of each symbol at each position of the alignment.
Hence, asking the questions such as '*How frequent is an alanine at the n-th position?*'
becomes a trivial indexing operation.

As example for profiles, we will reuse the cyclotide sequence family from the
:doc:`previous chapter <align_multiple>`.

.. jupyter-execute::

from tempfile import NamedTemporaryFile
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez

query = (
entrez.SimpleQuery("Cyclotide") &
entrez.SimpleQuery("cter") &
entrez.SimpleQuery("srcdb_swiss-prot", field="Properties") ^
entrez.SimpleQuery("Precursor")
)
uids = entrez.search(query, "protein")
temp_file = NamedTemporaryFile(suffix=".fasta", delete=False)
fasta_file = fasta.FastaFile.read(
entrez.fetch_single_file(uids, temp_file.name, "protein", "fasta")
)
sequences = {
# The cyclotide variant is the last character in the header
header[-1]: seq for header, seq in fasta.get_sequences(fasta_file).items()
}
# Extract cyclotide N as query sequence for later
query = sequences.pop("N")

To create a profile, we first need to align the sequences, so corresponding symbols
appear the same position.
Then we can create a :class:`SequenceProfile` object from the :class:`.Alignment`, which
simply counts for each alignment column (i.e. the sequence position) the number of
occurrences for each symbol.

.. jupyter-execute::

import biotite.sequence as seq
import biotite.sequence.align as align

alignment, _, _, _ = seq.align.align_multiple(
list(sequences.values()),
align.SubstitutionMatrix.std_protein_matrix(),
gap_penalty=-5,
)
profile = seq.SequenceProfile.from_alignment(alignment)
print(profile)

Each row in the displayed count matrix
(accessible via :attr:`SequenceProfile.symbols`) refers to a single position, i.e. a
column in the input MSA, and each column refers to a symbol in the underlying alphabet
(accessible via :attr:`SequenceProfile.alphabet`).
For completeness it should be noted that :attr:`SequenceProfile.gaps` also tracks the
gaps for each position in the alignment, but we will not further use them in this
tutorial.

Note that the information about the individual sequences is lost in the condensation
process: There is no way to reconstruct the original sequences from the profile.
However, we can still extract a consensus sequence from the profile, which is a
sequence that represents the most frequent symbol at each position.

.. jupyter-execute::

print(profile.to_consensus())

Profile visualization as sequence logo
--------------------------------------

.. currentmodule:: biotite.sequence.align

A common way to visualize a sequence profile is a sequence logo.
It depicts each profile position as a stack of letters:
The degree of conversation (more precisely the
`Shannon entropy <https://en.wikipedia.org/wiki/Entropy_(information_theory)>`_)
is the height of a stack and each letter's height in the stack is proportional to its
frequency at the respective position.

.. jupyter-execute::

import matplotlib.pyplot as plt
from biotite.sequence.graphics import plot_sequence_logo

fig, ax = plt.subplots(figsize=(8.0, 2.0), constrained_layout=True)
plot_sequence_logo(ax, profile)
ax.set_xlabel("Residue position")
ax.set_ylabel("Bits")

Position-specific scoring matrices
----------------------------------

.. currentmodule:: biotite.sequence.align

Sequence profiles can achieve even more:
The substitution matrices we have seen so far assign a score to a pair of two symbols,
regardless of their position in a sequence.
However, as discussed above, the position is crucial information to determine how
conserved a certain symbol is in a family of sequences.

Hence, we extend the concept of substitution matrices to *position-specific* matrices,
which assign a score to a symbol and a position (instead of another symbols).

.. jupyter-execute::

# For a real analysis each amino acid background frequencies should be provided
log_odds = profile.log_odds_matrix(pseudocount=1)

Now, we encounter a problem:
To create a :class:`SubstitutionMatrix` object from the log-odds matrix, we require two
:class:`.Alphabet` objects:
One is taken from the query sequence to be aligned to the profile, but which alphabet
do we use for the positional axis of the matrix?
Likewise, the alignment functions (e.g. :func:`align_optimal()`) require a two sequences
to be aligned, but we only have one query sequence.

To solve this problem :mod:`biotite.sequence` provides the :class:`.PositionalSequence`
which acts as a placeholder for the second sequence in the alignment.
Its alphabet contains a unique symbol for each position, i.e. the alphabet has the
sought length.

.. jupyter-execute::

positional_seq = seq.PositionalSequence(profile.to_consensus())
matrix = align.SubstitutionMatrix(
positional_seq.alphabet,
profile.alphabet,
# Substitution matrices require integer scores
# Multiply by 10 to increase value range and convert to integer
(log_odds * 10).astype(int)
)
alignment = align.align_optimal(
positional_seq, query, matrix, gap_penalty=-5, max_number=1
)[0]
print(alignment)

Only the length of the input sequence passed to :class:`.PositionalSequence` matters for
the alignment to work.
The consensus sequence of the profile was merely chosen for cosmetic reasons, i.e.
to have a meaningful string representation of the positional sequence and thus the
alignment.
Loading