diff --git a/CHANGELOG.md b/CHANGELOG.md index 773e1262a..b8bab5a84 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,5 +7,27 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Alternative start codons can now be used in the `synthesis/codon` DNA -> protein translation package (#305) +- Added a parser and writer for the `pileup` sequence alignment format (#329) +- Added statistics to the `synthesis/codon` package (keeping track of the observed start codon occurrences in a translation table) (#350) +- Added option to fragmenter to fragment with only certain overhangs (#387) +- Added seqhash v2 (#398) + ### Fixed -- Fixed bug that produced wrong overhang in linear, non-directional, single cut reactions. #408 \ No newline at end of file +- `fastq` parser no longer becomes de-aligned when reading (#325) +- `fastq` now handles optionals correctly (#323) +- No more data race in GoldenGate (#276) +- Fixed bug that produced wrong overhang in linear, non-directional, single cut reactions (#408) + +### Breaking +- CutWithEnzymeByName is now a receiver of EnzymeManager. GoldenGate now takes an Enzyme instead of the name of an enzyme. +This is an effort to remove dependence on some package level global state and build some flexibility managing enzymes +over the lifetime of the program. +- Enzyme.OverhangLen is now named Enzyme.OverhangLength + +## [0.26.0] - 2023-07-22 +Oops, we weren't keeping a changelog before this tag! + +[unreleased]: https://github.com/TimothyStiles/poly/compare/v0.26.0...main +[0.26.0]: https://github.com/TimothyStiles/poly/releases/tag/v0.26.0 diff --git a/seqhash/example_test.go b/seqhash/example_test.go index 02629c857..a19679c98 100644 --- a/seqhash/example_test.go +++ b/seqhash/example_test.go @@ -14,9 +14,9 @@ func Example_basic() { circular := false doubleStranded := true - sequenceSeqhash, _ := seqhash.Hash(sequence, sequenceType, circular, doubleStranded) + sequenceSeqhash, _ := seqhash.EncodeHashV2(seqhash.HashV2(sequence, sequenceType, circular, doubleStranded)) fmt.Println(sequenceSeqhash) - // Output: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1608a615f46350 + // Output: C_JPQCj5PgjFwjy7jaoYmwqQ== } func ExampleHash() { @@ -38,3 +38,14 @@ func ExampleRotateSequence() { fmt.Println(seqhash.RotateSequence(sequence.Sequence) == seqhash.RotateSequence(testSequence)) // output: true } + +func ExampleHashV2() { + sequence := "ATGC" + sequenceType := seqhash.DNA + circular := false + doubleStranded := true + + sequenceSeqhash, _ := seqhash.HashV2(sequence, sequenceType, circular, doubleStranded) + fmt.Println(sequenceSeqhash) + // Output: [36 244 2 143 147 224 140 92 35 203 184 218 161 137 176 169] +} diff --git a/seqhash/seqhash.go b/seqhash/seqhash.go index 335a92545..5613ff0a5 100644 --- a/seqhash/seqhash.go +++ b/seqhash/seqhash.go @@ -3,6 +3,9 @@ Package seqhash contains the seqhash algorithm. This package contains the reference seqhash algorithm. +If you are new to using seqhash, use V2. V1 should only be used in situations +where full 256 rather than 120 bit hashing is needed. + There is a big problem with current sequence databases - they all use different identifiers and accession numbers. This means cross-referencing databases is a complicated exercise, especially as the quantity of databases increases, or if @@ -39,7 +42,9 @@ only ACDEFGHIKLMNPQRSTVWYUO*BXZ characters are allowed in sequences. Selenocyste (Pyl; O) are included in the protein character set - usually U and O don't occur within protein sequences, but for certain organisms they do, and it is certainly a relevant amino acid for those particular proteins. -A Seqhash is separated into 3 different elements divided by underscores. It looks like the following: +# Seqhash version 1 + +A version 1 seqhash is separated into 3 different elements divided by underscores. It looks like the following: v1_DCD_4b0616d1b3fc632e42d78521deb38b44fba95cca9fde159e01cd567fa996ceb9 @@ -50,12 +55,38 @@ not the sequence is circular (C for Circular, L for Linear). The final letter co sequence is double stranded (D for Double stranded, S for Single stranded). The final element is the blake3 hash of the sequence (once rotated and complemented, as stated above). -Seqhash is a simple algorithm that allows for much better indexing of genetic sequences than what is -currently available. +# Seqhash version 2 + +Version 1 seqhashes are rather long, and version 2 seqhashes are built to be +much shorter. The intended use case are for handling sequences with LLM systems +since these system's context window is a value resource, and smaller references +allows the system to be more focused. Seqhash version 2 are approximately 3x +smaller than version 1 seqhashes. Officially, they are [16]byte arrays, but can +be also encoded with base64 to get a hash that can be used as a string across +different systems. Here is a length comparison: + + version 1: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1608a615f46350 + version 2: C_JPQCj5PgjFwjy7jaoYmwqQ== + +The metadata is now encoded in a 1 byte flag rather than a metadata string, +instead of 7 rune like in version 1. Rather than use 256 bits for encoding +the hash, we use 120 bits. Since seqhashes are not meant for security, this +is good enough (50% collision with 1.3x10^18 hashes), while making them +conveniently only 16 btyes long. Additionally, encoded prefixes are added +to the front of the base64 encoded hash as a heuristic device for LLMs while +processing batches of seqhashes. + +In addition, seqhashes can now encode fragments. Fragments are double stranded +DNA that are the result of restriction digestion, with single stranded +overhangs flanking both sides. These fragments can encode genetic parts - and +an important part of any vector containing these parts would be the part +seqhash, rather than the vector seqhash. This enhancement allows you to +identify genetic parts irregardless of their context. */ package seqhash import ( + "encoding/base64" "encoding/hex" "errors" "sort" @@ -69,9 +100,10 @@ import ( type SequenceType string const ( - DNA SequenceType = "DNA" - RNA SequenceType = "RNA" - PROTEIN SequenceType = "PROTEIN" + DNA SequenceType = "DNA" + RNA SequenceType = "RNA" + PROTEIN SequenceType = "PROTEIN" + FRAGMENT SequenceType = "FRAGMENT" ) // boothLeastRotation gets the least rotation of a circular string. @@ -137,8 +169,11 @@ func RotateSequence(sequence string) string { return sequence } -// Hash is a function to create Seqhashes, a specific kind of identifier. -func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) (string, error) { +// prepareDeterministicSequence prepares input data to be hashed by first running +// all of the checks for sequence typing, then by applying sequence +// manipulations to make a consistent hash for circular and double stranded +// sequences. +func prepareDeterministicSequence(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) (string, error) { // By definition, Seqhashes are of uppercase sequences sequence = strings.ToUpper(sequence) // If RNA, convert to a DNA sequence. The hash itself between a DNA and RNA sequence will not @@ -174,7 +209,6 @@ func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStran if sequenceType == PROTEIN && doubleStranded { return "", errors.New("Proteins cannot be double stranded") } - // Gets Deterministic sequence based off of metadata + sequence var deterministicSequence string switch { @@ -191,6 +225,15 @@ func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStran case !circular && !doubleStranded: deterministicSequence = sequence } + return deterministicSequence, nil +} + +// Hash creates a version 1 seqhash. +func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) (string, error) { + deterministicSequence, err := prepareDeterministicSequence(sequence, sequenceType, circular, doubleStranded) + if err != nil { + return "", err + } // Build 3 letter metadata var sequenceTypeLetter string @@ -222,3 +265,198 @@ func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStran seqhash := "v1" + "_" + sequenceTypeLetter + circularLetter + doubleStrandedLetter + "_" + hex.EncodeToString(newhash[:]) return seqhash, nil } + +// The following consts are for seqhash version 2 +const ( + // Define bit masks for each part of the flag + hash2versionMask byte = 0b11110000 // Version occupies the first 4 bits + hash2circularityMask byte = 0b00001000 // Circularity occupies the 5th bit + hash2doubleStrandedMask byte = 0b00000100 // Double-strandedness occupies the 6th bit + hash2typeMask byte = 0b00000011 // DNA/RNA/PROTEIN occupies the last 2 bits + + // Define shift counts for each part + hash2versionShift = 4 + hash2circularityShift = 3 + hash2doubleStrandedShift = 2 +) + +var ( + // sequenceTypeStringToByteFlagMap converts a sequenceType to a byte + sequenceTypeStringToByteFlagMap = map[SequenceType]byte{ + DNA: 0b00, + RNA: 0b01, + PROTEIN: 0b10, + FRAGMENT: 0b11, + } + // sequenceTypeByteToStringFlagMap converts a byte to a sequenceType + sequenceTypeByteToStringFlagMap = map[byte]SequenceType{ + 0b00: DNA, + 0b01: RNA, + 0b10: PROTEIN, + 0b11: FRAGMENT, + } +) + +// EncodeFlag encodes the version, circularity, double-strandedness, and type into a single byte flag. +// Used for seqhash v2 +func EncodeFlag(version int, sequenceType SequenceType, circularity bool, doubleStranded bool) byte { + var flag byte + + // Encode the version (assuming version is in the range 0-15) + flag |= (byte(version) << hash2versionShift) + + // Encode the circularity + if circularity { + flag |= (1 << hash2circularityShift) + } + + // Encode the double-strandedness + if doubleStranded { + flag |= (1 << hash2doubleStrandedShift) + } + + // Encode the DNA/RNA/PROTEIN + dnaRnaProtein := sequenceTypeStringToByteFlagMap[sequenceType] + flag |= (dnaRnaProtein & hash2typeMask) + + return flag +} + +// DecodeFlag decodes the single byte flag into its constituent parts. +// Outputs: version, circularity, doubleStranded, dnaRnaProtein. +// Used for seqhash v2 +func DecodeFlag(flag byte) (int, SequenceType, bool, bool) { + version := int((flag & hash2versionMask) >> hash2versionShift) + circularity := (flag & hash2circularityMask) != 0 + doubleStranded := (flag & hash2doubleStrandedMask) != 0 + dnaRnaProtein := flag & hash2typeMask + sequenceType := sequenceTypeByteToStringFlagMap[dnaRnaProtein] + + return version, sequenceType, circularity, doubleStranded +} + +// HashV2 creates a version 2 seqhash. +func HashV2(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) ([16]byte, error) { + var result [16]byte + + // First, get the determistic sequence of the hash + deterministicSequence, err := prepareDeterministicSequence(sequence, sequenceType, circular, doubleStranded) + if err != nil { + return result, err + } + + // Build our byte flag + flag := EncodeFlag(2, sequenceType, circular, doubleStranded) + result[0] = flag + + // Compute BLAKE3, then copy those to the remaining 15 bytes + newhash := blake3.Sum256([]byte(deterministicSequence)) + copy(result[1:], newhash[:15]) + + return result, nil +} + +// HashV2Fragment creates a version 2 fragment seqhash. Fragment seqhashes are +// a special kind of seqhash that are used to identify fragments, usually +// released by restriction enzyme digestion, rather than complete DNA +// sequences. This is very useful for tracking genetic parts in a database: as +// abstractions away from their container vectors, so that many fragments in +// different vectors can be identified consistently. +// +// fwdOverhangLength and revOverhangLength are the lengths of both overhangs. +// Hashed sequences are hashed with their overhangs attached. Most of the time, +// both of these will equal 4, as they are released by TypeIIS restriction +// enzymes. +// +// In order to make sure fwdOverhangLength and revOverhangLength fit in the +// hash, the hash is truncated at 13 bytes rather than 15, and both int8 are +// inserted. So the bytes would be: +// +// flag + fwdOverhangLength + revOverhangLength + [13]byte(hash) +// +// fwdOverhangLength and revOverhangLength are both int8, and their negatives +// are considered if the the overhang is on the 3prime strand, rather than the +// 5prime strand. +// +// 13 bytes is considered enough, because the number of fragments is limited +// by our ability to physically produce them, while other other sequence types +// can be found in nature. +// +// The fwdOverhang and revOverhang are the lengths of the overhangs of the +// input sequence. The hash, however, contains the forward and reverse overhang +// lengths of the deterministic sequence - ie, the alphabetically less-than +// strand, when comparing the uppercase forward and reverse complement strand. +// This means if the input sequence is not less than its reverse complement (for +// example, GTT is greater than AAC), then the output hash will have the forward +// and reverse overhang lengths of the reverse complement, not the input strand. +func HashV2Fragment(sequence string, fwdOverhangLength int8, revOverhangLength int8) ([16]byte, error) { + var result [16]byte + + // First, run checks and get the determistic sequence of the hash + for _, char := range sequence { + if !strings.Contains("ATUGCYRSWKMBDHVNZ", string(char)) { + return result, errors.New("Only letters ATUGCYRSWKMBDHVNZ are allowed for DNA/RNA. Got letter: " + string(char)) + } + } + sequence = strings.ToUpper(sequence) + var forward, reverse int8 + var deterministicSequence string + reverseComplement := transform.ReverseComplement(sequence) + if sequence > reverseComplement { + // If the reverse complement is smaller, reverse the overhangs forward and reverse + forward = revOverhangLength + reverse = fwdOverhangLength + deterministicSequence = reverseComplement + } else { + forward = fwdOverhangLength + reverse = revOverhangLength + deterministicSequence = sequence + } + + // Build our byte flag and copy length flags + flag := EncodeFlag(2, FRAGMENT, false, false) + result[0] = flag + result[1] = byte(forward) + result[2] = byte(reverse) + + // Compute BLAKE3, then copy those to the remaining 13 bytes + newhash := blake3.Sum256([]byte(deterministicSequence)) + copy(result[3:], newhash[:13]) + + return result, nil +} + +// HashV2MetadataKey is a key for a seqhash v2 single letter metadata tag. +type HashV2MetadataKey struct { + SequenceType SequenceType + Circular bool + DoubleStranded bool +} + +// HashV2Metadata contains the seqhash v2 single letter metadata tags. +var HashV2Metadata = map[HashV2MetadataKey]rune{ + {DNA, true, true}: 'A', + {DNA, true, false}: 'B', + {DNA, false, true}: 'C', + {DNA, false, false}: 'D', + {RNA, true, true}: 'E', + {RNA, true, false}: 'F', + {RNA, false, true}: 'G', + {RNA, false, false}: 'H', + {PROTEIN, false, false}: 'I', + {PROTEIN, true, false}: 'J', + {FRAGMENT, false, false}: 'K', + {FRAGMENT, true, false}: 'L', + {FRAGMENT, false, true}: 'M', + {FRAGMENT, true, true}: 'N', +} + +// EncodeHashV2 encodes HashV2 as a base64 string. It also adds a single +// letter metadata tag that can be used as an easy heuristic for an LLM to +// identify misbehaving code. +func EncodeHashV2(hash [16]byte, err error) (string, error) { + _, sequenceType, circularity, doubleStranded := DecodeFlag(hash[0]) + encoded := base64.StdEncoding.EncodeToString(hash[:]) + + return string(HashV2Metadata[HashV2MetadataKey{sequenceType, circularity, doubleStranded}]) + "_" + encoded, err +} diff --git a/seqhash/seqhash_test.go b/seqhash/seqhash_test.go index 5787cc36b..f4da6648d 100644 --- a/seqhash/seqhash_test.go +++ b/seqhash/seqhash_test.go @@ -89,3 +89,37 @@ func TestLeastRotation(t *testing.T) { } } } + +func TestFlagEncoding(t *testing.T) { + version := 2 + sequenceType := DNA + circularity := true + doubleStranded := true + flag := EncodeFlag(version, sequenceType, circularity, doubleStranded) + decodedVersion, decodedSequenceType, decodedCircularity, decodedDoubleStranded := DecodeFlag(flag) + if (decodedVersion != version) || (decodedSequenceType != sequenceType) || (decodedCircularity != circularity) || (decodedDoubleStranded != doubleStranded) { + t.Errorf("Got different decoded flag.") + } +} + +func TestHashV2(t *testing.T) { + // Test TNA as sequenceType + _, err := HashV2("ATGGGCTAA", "TNA", true, true) + if err == nil { + t.Errorf("TestHashV2() has failed. TNA is not a valid sequenceType.") + } +} + +func TestHashV2Fragment(t *testing.T) { + // Test X failure + _, err := HashV2Fragment("ATGGGCTAX", 4, 4) + if err == nil { + t.Errorf("TestHashV2Fragment() has failed. X is not a valid sequenceType.") + } + // Test actual hash + sqHash, _ := EncodeHashV2(HashV2Fragment("ATGGGCTAA", 4, 4)) + expectedHash := "K_IwQEwsn8RN9yA1CCoVLpSw==" + if sqHash != expectedHash { + t.Errorf("Expected %s, Got: %s", expectedHash, sqHash) + } +}