From 2d5d8c4551121fca4c394835efd89bffb0c3c5ca Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 12 Oct 2023 14:00:10 -0700 Subject: [PATCH] patched genbank parser and builder to handle BASE COUNT. (#386) --- data/sample.gbk | 1 + io/genbank/genbank.go | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/data/sample.gbk b/data/sample.gbk index 5854e5d6..51d0fe4e 100644 --- a/data/sample.gbk +++ b/data/sample.gbk @@ -77,6 +77,7 @@ FEATURES Location/Qualifiers KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK LISGDDKILNGVYSQYEEGESIFGSLF" +BASE COUNT 67070277 a 48055043 c 48111528 g 67244164 t 18475410 n ORIGIN 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct diff --git a/io/genbank/genbank.go b/io/genbank/genbank.go index e023288b..468e17cf 100644 --- a/io/genbank/genbank.go +++ b/io/genbank/genbank.go @@ -57,6 +57,7 @@ type Meta struct { Origin string `json:"origin"` Locus Locus `json:"locus"` References []Reference `json:"references"` + BaseCount []BaseCount `json:"base_count"` Other map[string]string `json:"other"` Name string `json:"name"` SequenceHash string `json:"sequence_hash"` @@ -109,6 +110,12 @@ type Location struct { SubLocations []Location `json:"sub_locations"` } +// BaseCount is a struct that holds the base counts for a sequence. +type BaseCount struct { + Base string + Count int +} + // Precompiled regular expressions: var ( basePairRegex = regexp.MustCompile(` \d* \w{2} `) @@ -315,6 +322,13 @@ func BuildMulti(sequences []Genbank) ([]byte, error) { gbkString.WriteString(BuildFeatureString(feature)) } + if len(sequence.Meta.BaseCount) > 0 { + gbkString.WriteString("BASE COUNT ") + for _, baseCount := range sequence.Meta.BaseCount { + gbkString.WriteString(strconv.Itoa(baseCount.Count) + " " + baseCount.Base + " ") + } + gbkString.WriteString("\n") + } // start writing sequence section. gbkString.WriteString("ORIGIN\n") @@ -378,7 +392,7 @@ type parseLoopParameters struct { emptyAttribute bool sequenceBuilder strings.Builder parseStep string - genbank Genbank // since we are scanning lines we need a Genbank struct to store the data outside the loop.// since we are scanning lines we need a Genbank struct to store the data outside the loop. + genbank Genbank // since we are scanning lines we need a Genbank struct to store the data outside the loop. feature Feature features []Feature metadataTag string @@ -484,6 +498,23 @@ func ParseMultiNth(r io.Reader, count int) ([]Genbank, error) { } case "features": + baseCountFlag := strings.Contains(line, "BASE COUNT") // example string for BASE COUNT: "BASE COUNT 67070277 a 48055043 c 48111528 g 67244164 t 18475410 n" + if baseCountFlag { + fields := strings.Fields(line) + for countIndex := 2; countIndex < len(fields)-1; countIndex += 2 { // starts at two because we don't want to include "BASE COUNT" in our fields + count, err := strconv.Atoi(fields[countIndex]) + if err != nil { + return []Genbank{}, err + } + + baseCount := BaseCount{ + Base: fields[countIndex+1], + Count: count, + } + parameters.genbank.Meta.BaseCount = append(parameters.genbank.Meta.BaseCount, baseCount) + } + break + } // Switch to sequence parsing originFlag := strings.Contains(line, "ORIGIN") // we detect the beginning of the sequence with "ORIGIN" if originFlag {