biotite-dev · padix-key · Sep 2, 2024 · Sep 8, 2024 · Sep 8, 2024 · Sep 8, 2024
diff --git a/doc/examples/scripts/sequence/profile/scop_profile.py b/doc/examples/scripts/sequence/profile/scop_profile.py
@@ -0,0 +1,186 @@
+"""
+Note that in order to get a representative sequence profile, it is crucial to have a
+high number of sequences the target sequence can be aligned to.
+"""
+
+import matplotlib.pyplot as plt
+import numpy as np
+import requests
+import biotite.database.uniprot as uniprot
+import biotite.sequence as seq
+import biotite.sequence.align as align
+import biotite.sequence.graphics as graphics
+import biotite.sequence.io.fasta as fasta
+from biotite.database import RequestError
+
+TARGET_UNIPROT_ID = "P04746"
+TARGET_SCOP_FAMILY_ID = "4003138"
+QUERY_UNIPROT_ID = "P00722"
+
+# TARGET_UNIPROT_ID = "P09467"
+# TARGET_SCOP_FAMILY_ID = "4002766"
+# QUERY_UNIPROT_ID = "P73922"
+
+
+target_sequence = fasta.get_sequence(
+    fasta.FastaFile.read(uniprot.fetch(TARGET_UNIPROT_ID, "fasta", "."))
+)
+query_sequence = fasta.get_sequence(
+    fasta.FastaFile.read(uniprot.fetch(QUERY_UNIPROT_ID, "fasta", "."))
+)
+
+aa_matrix = align.SubstitutionMatrix.std_protein_matrix()
+alignment = align.align_optimal(
+    query_sequence,
+    target_sequence,
+    aa_matrix,
+    gap_penalty=(-10, -1),
+    local=False,
+    max_number=1,
+)[0]
+
+fig, ax = plt.subplots(figsize=(8.0, 1.0))
+graphics.plot_alignment_similarity_based(
+    ax, alignment, matrix=aa_matrix, labels=[TARGET_UNIPROT_ID, QUERY_UNIPROT_ID]
+)
+
+
+########################################################################################
+# https://www.ebi.ac.uk/pdbe/scop/download
+
+
+def get_sequence_family(scop_id):
+    scop_domains = requests.get(
+        f"https://www.ebi.ac.uk/pdbe/scop/api/domains/{scop_id}"
+    ).json()
+    sequences = {}
+    for scop_domain in scop_domains["domains"]:
+        uniprot_id = scop_domain["uniprot_id"]
+        try:
+            sequence = fasta.get_sequence(
+                fasta.FastaFile.read(uniprot.fetch(uniprot_id, "fasta", "."))
+            )
+        except RequestError:
+            # The UniProt ID is not in the database -> skip this domain
+            continue
+        for start, end in scop_domain["protein_regions"]:
+            region = sequence[start : end + 1]
+            sequences[uniprot_id] = region
+            # Most domains have only one region within the sequence
+            # For simplicity we only consider the first region
+            break
+    return sequences
+
+
+def merge_pairwise_alignments(alignments):
+    traces = []
+    sequences = []
+    for alignment in alignments:
+        trace = alignment.trace
+        # Remove gaps in first sequence
+        trace = trace[trace[:, 0] != -1]
+        traces.append(trace[:, 1])
+        sequences.append(alignment.sequences[1])
+    return align.Alignment(sequences, np.stack(traces, axis=-1))
+
+
+sequences = get_sequence_family(TARGET_SCOP_FAMILY_ID)
+# This is not a 'true' MSA, in the sense that it only merges the pairwise alignments
+# with respect to the target sequence
+pseudo_msa = merge_pairwise_alignments(
+    [
+        align.align_optimal(
+            target_sequence,
+            sequence,
+            aa_matrix,
+            gap_penalty=(-10, -1),
+            max_number=1,
+        )[0]
+        for sequence in sequences.values()
+    ]
+)
+
+labels = np.array(list(sequences.keys()))
+
+fig, ax = plt.subplots(figsize=(8.0, 24.0))
+graphics.plot_alignment_type_based(ax, pseudo_msa, labels=labels, show_numbers=True)
+
+########################################################################################
+# :footcite:`Robinson1991`
+
+# Must have the same order as `ProteinSequence.alphabet`
+AA_FREQUENCY = {
+    "A": 35155,
+    "C": 8669,
+    "D": 24161,
+    "E": 28354,
+    "F": 17367,
+    "G": 33229,
+    "H": 9906,
+    "I": 23161,
+    "K": 25872,
+    "L": 40625,
+    "M": 10101,
+    "N": 20212,
+    "P": 23435,
+    "Q": 19208,
+    "R": 23105,
+    "S": 32070,
+    "T": 26311,
+    "V": 29012,
+    "W": 5990,
+    "Y": 14488,
+    # Set ambiguous amino acid count to 1 to avoid division by zero in log odds matrix
+    "B": 1,
+    "Z": 1,
+    "X": 1,
+    "*": 1,
+}
+
+
+profile = seq.SequenceProfile.from_alignment(pseudo_msa)
+background_frequencies = np.array(list(AA_FREQUENCY.values()))
+# Normalize background frequencies
+background_frequencies = background_frequencies / background_frequencies.sum()
+
+score_matrix = profile.log_odds_matrix(background_frequencies, pseudocount=1).T
+score_matrix *= 10
+score_matrix = score_matrix.astype(np.int32)
+
+profile_seq = seq.PositionalSequence(profile.to_consensus())
+positional_matrix = align.SubstitutionMatrix(
+    seq.ProteinSequence.alphabet, profile_seq.alphabet, score_matrix=score_matrix
+)
+profile_alignment = align.align_optimal(
+    query_sequence,
+    profile_seq,
+    positional_matrix,
+    gap_penalty=(-10, -1),
+    local=False,
+    terminal_penalty=False,
+    max_number=1,
+)[0]
+
+
+########################################################################################
+# Map the profile alignment to the original target sequence.
+# As the pseudo-MSA was designed to have the same length as the target sequence,
+# the sequence needs to be exchanged only, the trace remains the same
+refined_alignment = align.Alignment(
+    [query_sequence, target_sequence], profile_alignment.trace
+)
+
+fig, ax = plt.subplots(figsize=(8.0, 4.0))
+graphics.plot_alignment_similarity_based(
+    ax,
+    refined_alignment,
+    matrix=aa_matrix,
+    labels=[TARGET_UNIPROT_ID, QUERY_UNIPROT_ID],
+)
+
+
+plt.show()
+
+########################################################################################
+#
+# .. footbibliography::
diff --git a/doc/tutorial/sequence/index.rst b/doc/tutorial/sequence/index.rst
@@ -43,5 +43,5 @@ For an unambiguous DNA sequence, the alphabet comprises the four nucleobases.
     align_optimal
     align_heuristic
     align_multiple
-    annotations
-    profiles
+    profiles
+    annotations
diff --git a/doc/tutorial/sequence/profiles.rst b/doc/tutorial/sequence/profiles.rst
@@ -0,0 +1,159 @@
+.. include:: /tutorial/preamble.rst
+
+Profiles and position-specific matrices
+=======================================
+Often sequences are not viewed in isolation:
+For example, if you investigate a protein family, you do not handle a single sequence,
+but an arbitrarily large collection of highly similar sequences.
+At some point handling those sequences as a mere collection of individual sequences
+becomes impractical for answering common questions, like which are the prevalent
+amino acids at a certain positions of the sequences.
+
+Sequence profiles
+-----------------
+
+.. currentmodule:: biotite.sequence
+
+This is where *sequence profiles* come into play:
+The condense the a collection of aligned sequences into a matrix that tracks the
+frequency of each symbol at each position of the alignment.
+Hence, asking the questions such as '*How frequent is an alanine at the n-th position?*'
+becomes a trivial indexing operation.
+
+As example for profiles, we will reuse the cyclotide sequence family from the
+:doc:`previous chapter <align_multiple>`.
+
+.. jupyter-execute::
+
+    from tempfile import NamedTemporaryFile
+    import biotite.sequence.io.fasta as fasta
+    import biotite.database.entrez as entrez
+
+    query = (
+        entrez.SimpleQuery("Cyclotide") &
+        entrez.SimpleQuery("cter") &
+        entrez.SimpleQuery("srcdb_swiss-prot", field="Properties") ^
+        entrez.SimpleQuery("Precursor")
+    )
+    uids = entrez.search(query, "protein")
+    temp_file = NamedTemporaryFile(suffix=".fasta", delete=False)
+    fasta_file = fasta.FastaFile.read(
+        entrez.fetch_single_file(uids, temp_file.name, "protein", "fasta")
+    )
+    sequences = {
+        # The cyclotide variant is the last character in the header
+        header[-1]: seq for header, seq in fasta.get_sequences(fasta_file).items()
+    }
+    # Extract cyclotide N as query sequence for later
+    query = sequences.pop("N")
+
+To create a profile, we first need to align the sequences, so corresponding symbols
+appear the same position.
+Then we can create a :class:`SequenceProfile` object from the :class:`.Alignment`, which
+simply counts for each alignment column (i.e. the sequence position) the number of
+occurrences for each symbol.
+
+.. jupyter-execute::
+
+    import biotite.sequence as seq
+    import biotite.sequence.align as align
+
+    alignment, _, _, _ = seq.align.align_multiple(
+        list(sequences.values()),
+        align.SubstitutionMatrix.std_protein_matrix(),
+        gap_penalty=-5,
+    )
+    profile = seq.SequenceProfile.from_alignment(alignment)
+    print(profile)
+
+Each row in the displayed count matrix
+(accessible via :attr:`SequenceProfile.symbols`) refers to a single position, i.e. a
+column in the input MSA, and each column refers to a symbol in the underlying alphabet
+(accessible via :attr:`SequenceProfile.alphabet`).
+For completeness it should be noted that :attr:`SequenceProfile.gaps` also tracks the
+gaps for each position in the alignment, but we will not further use them in this
+tutorial.
+
+Note that the information about the individual sequences is lost in the condensation
+process: There is no way to reconstruct the original sequences from the profile.
+However, we can still extract a consensus sequence from the profile, which is a
+sequence that represents the most frequent symbol at each position.
+
+.. jupyter-execute::
+
+    print(profile.to_consensus())
+
+Profile visualization as sequence logo
+--------------------------------------
+
+.. currentmodule:: biotite.sequence.align
+
+A common way to visualize a sequence profile is a sequence logo.
+It depicts each profile position as a stack of letters:
+The degree of conversation (more precisely the
+`Shannon entropy <https://en.wikipedia.org/wiki/Entropy_(information_theory)>`_)
+is the height of a stack and each letter's height in the stack is proportional to its
+frequency at the respective position.
+
+.. jupyter-execute::
+
+    import matplotlib.pyplot as plt
+    from biotite.sequence.graphics import plot_sequence_logo
+
+    fig, ax = plt.subplots(figsize=(8.0, 2.0), constrained_layout=True)
+    plot_sequence_logo(ax, profile)
+    ax.set_xlabel("Residue position")
+    ax.set_ylabel("Bits")
+
+Position-specific scoring matrices
+----------------------------------
+
+.. currentmodule:: biotite.sequence.align
+
+Sequence profiles can achieve even more:
+The substitution matrices we have seen so far assign a score to a pair of two symbols,
+regardless of their position in a sequence.
+However, as discussed above, the position is crucial information to determine how
+conserved a certain symbol is in a family of sequences.
+
+Hence, we extend the concept of substitution matrices to *position-specific* matrices,
+which assign a score to a symbol and a position (instead of another symbols).
+
+.. jupyter-execute::
+
+    # For a real analysis each amino acid background frequencies should be provided
+    log_odds = profile.log_odds_matrix(pseudocount=1)
+
+Now, we encounter a problem:
+To create a :class:`SubstitutionMatrix` object from the log-odds matrix, we require two
+:class:`.Alphabet` objects:
+One is taken from the query sequence to be aligned to the profile, but which alphabet
+do we use for the positional axis of the matrix?
+Likewise, the alignment functions (e.g. :func:`align_optimal()`) require a two sequences
+to be aligned, but we only have one query sequence.
+
+To solve this problem :mod:`biotite.sequence` provides the :class:`.PositionalSequence`
+which acts as a placeholder for the second sequence in the alignment.
+Its alphabet contains a unique symbol for each position, i.e. the alphabet has the
+sought length.
+
+.. jupyter-execute::
+
+    positional_seq = seq.PositionalSequence(profile.to_consensus())
+    matrix = align.SubstitutionMatrix(
+        positional_seq.alphabet,
+        profile.alphabet,
+        # Substitution matrices require integer scores
+        # Multiply by 10 to increase value range and convert to integer
+        (log_odds * 10).astype(int)
+    )
+    alignment = align.align_optimal(
+        positional_seq, query, matrix, gap_penalty=-5, max_number=1
+    )[0]
+    print(alignment)
+
+Only the length of the input sequence passed to :class:`.PositionalSequence` matters for
+the alignment to work.
+The consensus sequence of the profile was merely chosen for cosmetic reasons, i.e.
+to have a meaningful string representation of the positional sequence and thus the
+alignment.