diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index b0a0318f..6cf55889 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -12,23 +12,25 @@ on: jobs: build: - runs-on: ubuntu-latest + env: + IDT_CREDENTIALS: ${{ secrets.IDT_CREDENTIALS }} + runs-on: ${{ matrix.os }} strategy: matrix: # Default builds are on Ubuntu os: [ubuntu-latest] - python-version: ['3.7', '3.8', '3.9', '3.10'] + python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] include: # Also test on macOS and Windows using the latest Python 3 - os: macos-latest - python-version: 3.x - - os: windows-latest - python-version: 3.x + python-version: '3.11' # Return to 3.x after resolution of https://github.com/RDFLib/pySHACL/issues/212 + - os: windows-2019 + python-version: '3.11' # Return to 3.x after resolution of https://github.com/RDFLib/pySHACL/issues/212 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -37,11 +39,16 @@ jobs: python -m pip install pytest python -m pip install interrogate - name: Setup Graphviz - uses: ts-graphviz/setup-graphviz@v1 + uses: ts-graphviz/setup-graphviz@v2 + with: + # Skip running of sometimes problematic brew update command on macOS. + # Remove after resolution of https://github.com/ts-graphviz/setup-graphviz/issues/593 + macos-skip-brew-update: 'true' # default false - name: Show Node.js version run: | node --version - name: Test with pytest run: | pip install . 
+ echo "$IDT_CREDENTIALS" > test_secret_idt_credentials.json pytest --ignore=test/test_docstr_coverage.py -s diff --git a/.gitignore b/.gitignore index 563f9abf..16ddcf6e 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,9 @@ test/__pycache__/ sbol_utilities/__pycache__/ __pycache__/ + +# test secrets +test_secret* + +.idea/ +*.egg-info/ diff --git a/README.md b/README.md index b32f9dd3..798ad77c 100644 --- a/README.md +++ b/README.md @@ -57,9 +57,9 @@ The `excel-to-sbol` utility reads an Excel file specifying a library of basic an The `sbol-converter` utility converts between any of the SBOL3, SBOL2, GenBank, and FASTA formats. Additional "macro" utilities convert specifically between SBOL3 and one of the other formats: -- `sbol2fasta` and `fasta2sbol` convert from SBOL3 to FASTA and vice versa -- `sbol2genbank` and `genbank2sbol` convert from SBOL3 to GenBank and vice versa -- `sbol3to2` and `sbol2to3` convert to and from SBOL2 +- `sbol-to-fasta` and `fasta-to-sbol` convert from SBOL3 to FASTA and vice versa +- `sbol-to-genbank` and `genbank-to-sbol` convert from SBOL3 to GenBank and vice versa +- `sbol3-to-sbol2` and `sbol2-to-sbol3` convert to and from SBOL2 ### Expand the combinatorial derivations in an SBOL file @@ -69,6 +69,14 @@ The `sbol-expand-derivations` utility searches through an SBOL file for Combinat The `sbol-calculate-sequences` utility attempts to calculate the sequence of any DNA Component that can be fully specified from the sequences of its sub-components. +### Calculate sequence synthesis complexity for DNA sequences in an SBOL file + +The `sbol-calculate-complexity` utility attempts to calculate the synthesis complexity of any DNA sequence in the file, by sending sequences to be evaluated by IDT's sequence calculator service. Sequences whose complexity is known are not re-calculated. + +The system uses the gBlock API, which is intended for sequences from 125 to 3000 bp in length. 
If a sequence is longer than 3000 bp or shorter than 125 bp, the returned score will be 0. A complexity score from 0 up to (but not including) 10 means the sequence is synthesizable; a score greater than or equal to 10 means it is not synthesizable. + +Note that use of this utility requires an account with IDT that is set up to use IDT's online service API (see: https://www.idtdna.com/pages/tools/apidoc) + ### Compute the difference between two SBOL3 documents The `sbol-diff` utility computes the difference between two SBOL3 documents and reports the differences. @@ -77,4 +85,4 @@ and reports the differences. ## Contributing We welcome contributions that patch bugs, improve existing utilities or documentation, or add new utilities! -For guidance on how to contribute effectively to this project, see [CONTRIBUTING.md](CONTRIBUTING.md). \ No newline at end of file +For guidance on how to contribute effectively to this project, see [CONTRIBUTING.md](CONTRIBUTING.md). diff --git a/sbol_utilities/calculate_complexity_scores.py b/sbol_utilities/calculate_complexity_scores.py new file mode 100644 index 00000000..c94749ca --- /dev/null +++ b/sbol_utilities/calculate_complexity_scores.py @@ -0,0 +1,250 @@ +from __future__ import annotations + +import json + +from typing import Optional + +import datetime +import argparse +import logging +import uuid +from requests import post +from requests.auth import HTTPBasicAuth + +import sbol3 +import tyto + +from sbol_utilities.workarounds import type_to_standard_extension + +COMPLEXITY_SCORE_NAMESPACE = 'http://igem.org/IDT_complexity_score' +REPORT_ACTIVITY_TYPE = 'https://github.com/SynBioDex/SBOL-utilities/compute-sequence-complexity' + + +class IDTAccountAccessor: + """Class that wraps access to the IDT API""" + + _TOKEN_URL = 'https://www.idtdna.com/Identityserver/connect/token' + """API URL for obtaining session tokens""" + _SCORE_URL = 'https://www.idtdna.com/api/complexities/screengBlockSequences' + """APR URL for obtaining sequence scores""" 
+ _BLOCK_SIZE = 1 # TODO: determine if it is possible to run multiple sequences in a single query + SCORE_TIMEOUT = 120 + """Number of seconds to wait for score query requests to complete""" + + def __init__(self, username: str, password: str, client_id: str, client_secret: str): + """Initialize with required access information for IDT API (see: https://www.idtdna.com/pages/tools/apidoc) + Automatically logs in and obtains a session token + + :param username: Username of your IDT account + :param password: Password of your IDT account + :param client_id: ClientID key of your IDT account + :param client_secret: ClientSecret key of your IDT account + """ + self.username = username + self.password = password + self.client_id = client_id + self.client_secret = client_secret + self.token = self._get_idt_access_token() + + @staticmethod + def from_json(json_object) -> IDTAccountAccessor: + """Initialize IDT account accessor from a JSON object with field values + + :param json_object: object with account information + :return: Account accessor object + """ + return IDTAccountAccessor(username=json_object['username'], password=json_object['password'], + client_id=json_object['ClientID'], client_secret=json_object['ClientSecret']) + + def _get_idt_access_token(self) -> str: + """Get access token for IDT API (see: https://www.idtdna.com/pages/tools/apidoc) + + :return: access token string + """ + logging.info('Connecting to IDT API') + data = {'grant_type': 'password', 'username': self.username, 'password': self.password, 'scope': 'test'} + auth = HTTPBasicAuth(self.client_id, self.client_secret) + result = post(IDTAccountAccessor._TOKEN_URL, data, auth=auth, timeout=IDTAccountAccessor.SCORE_TIMEOUT) + + if 'access_token' in result.json(): + return result.json()['access_token'] + else: + raise ValueError('Access token for IDT API could not be generated. 
Check your credentials.') + + def get_sequence_scores(self, sequences: list[sbol3.Sequence]) -> list: + """Retrieve synthesis complexity scores of sequences from the IDT API + This system uses the gBlock API, which is intended for sequences from 125 to 3000 bp in length. If it is more + than 3000 bp or less than 125 bp your returned score will be 0. A complexity score in the range from 0 to 10 means + your sequence is synthesizable, if the score is greater or equal than 10 means it is not synthesizable. + + :param sequences: sequences for which we want to calculate the complexity score + :return: dictionary mapping sequences to complexity Scores + :return: List of lists of dictionaries with information about sequence synthesis features + """ + # Set up list of query dictionaries + seq_dict = [{'Name': str(seq.display_name), 'Sequence': str(seq.elements)} for seq in sequences] + # Break into query blocks + partitions_sequences = [seq_dict[x:x + 1] for x in range(0, len(seq_dict), IDTAccountAccessor._BLOCK_SIZE)] + # Send each query to IDT and collect results + results = [] + for idx, partition in enumerate(partitions_sequences): + logging.debug('Sequence score request %i of %i', idx+1, len(partitions_sequences)) + resp = post(IDTAccountAccessor._SCORE_URL, json=partition, timeout=IDTAccountAccessor.SCORE_TIMEOUT, + headers={'Authorization': 'Bearer {}'.format(self.token), + 'Content-Type': 'application/json; charset=utf-8'}) + response_list = resp.json() + if len(response_list) != len(partition): + raise ValueError(f'Unexpected complexity score: expected {len(partition)} scores, ' + f'but got {len(response_list)}') + results.append(resp.json()) + logging.info('Requests to IDT API finished.') + return results + + def get_sequence_complexity(self, sequences: list[sbol3.Sequence]) -> dict[sbol3.Sequence, float]: + """ Extract complexity scores from IDT API for a list of SBOL Sequence objects + This works by computing full sequence evaluations, then compressing down to 
a single score for each sequence. + + :param sequences: list of SBOL Sequences to evaluate + :return: dictionary mapping sequences to complexity Scores + """ + # Retrieve full evaluations for sequences + scores = self.get_sequence_scores(sequences) + # Compute total score for each sequence as the sum all complexity scores for the sequence + score_list = [] + for score_set in scores: + for sequence_scores in score_set: + complexity_score = sum(score.get('Score') for score in sequence_scores) + score_list.append(complexity_score) + # Associate each sequence to its score + return dict(zip(sequences, score_list)) + + +def get_complexity_score(seq: sbol3.Sequence) -> Optional[float]: + """Given a sequence, return its previously computed complexity score, if such exists + + :param seq: SBOL Sequence object to check for score + :return: score if set, None if not + """ + scores = [score for score in seq.measures if tyto.EDAM.sequence_complexity_report in score.types] + if scores: + if len(scores) > 1: + raise ValueError(f'Found multiple complexity scores on Sequence {seq.identity}') + return scores[0].value + else: + return None + + +def get_complexity_scores(sequences: list[sbol3.Sequence], include_missing=False) -> \ + dict[sbol3.Sequence, Optional[float]]: + """Retrieve complexity scores for a list of sequences + + :param sequences: Sequences to get scores for + :param include_missing: if true, Sequences without scores are included, mapping to none + :return: dictionary mapping Sequence to score + """ + # TODO: change to run computations only on DNA sequences + score_map = {seq: get_complexity_score(seq) for seq in sequences} + if not include_missing: + score_map = {k: v for k, v in score_map.items() if v is not None} + return score_map + + +def idt_calculate_sequence_complexity_scores(accessor: IDTAccountAccessor, sequences: list[sbol3.Sequence]) -> \ + dict[sbol3.Sequence, float]: + """Given a list of sequences, compute the complexity scores for any sequences not 
currently scored + by sending the sequences to IDT's online service for calculating sequence synthesis complexity. + Also records the complexity computation with an activity + + :param accessor: IDT API access object + :param sequences: list of SBOL Sequences to evaluate + :return: Dictionary mapping Sequences to complexity scores for newly computed sequences + """ + # Determine which sequences need scores + need_scores = [seq for seq, score in get_complexity_scores(sequences, include_missing=True).items() + if score is None] + if not need_scores: + return dict() + + # Query for the scores of the sequences + score_dictionary = accessor.get_sequence_complexity(need_scores) + + # Create report generation activity + doc = need_scores[0].document + timestamp = datetime.datetime.utcnow().isoformat(timespec='seconds') + 'Z' + report_id = f'{COMPLEXITY_SCORE_NAMESPACE}/Complexity_Report_{timestamp.replace(":", "").replace("-", "")}_' \ + f'{str(uuid.uuid4())[0:8]}' + report_generation = sbol3.Activity(report_id, end_time=timestamp, types=[REPORT_ACTIVITY_TYPE]) + doc.add(report_generation) + + # Mark the sequences with their scores, where each score is a dimensionless measure + for sequence, score in score_dictionary.items(): + measure = sbol3.Measure(score, unit=tyto.OM.number_unit, types=[tyto.EDAM.sequence_complexity_report]) + measure.generated_by.append(report_generation) + sequence.measures.append(measure) + # return the dictionary of newly computed scores + return score_dictionary + + +def idt_calculate_complexity_scores(accessor: IDTAccountAccessor, doc: sbol3.Document) -> dict[sbol3.Sequence, float]: + """Given an SBOL Document, compute the complexity scores for any sequences in the Document not currently scored + by sending the sequences to IDT's online service for calculating sequence synthesis complexity. 
+ Also records the complexity computation with an activity + + :param accessor: IDT API access object + :param doc: SBOL document with sequences of interest in it + :return: Dictionary mapping Sequences to complexity scores + """ + sequences = [obj for obj in doc if isinstance(obj, sbol3.Sequence)] + return idt_calculate_sequence_complexity_scores(accessor, sequences) + + +def main(): + """ + Main wrapper: read from input file, invoke idt_calculate_complexity_scores, then write to output file + """ + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--credentials', + help="""JSON file containing IDT API access credentials. +To obtain access credentials, follow the directions at https://www.idtdna.com/pages/tools/apidoc +The values of the IDT access credentials should be stored in a JSON of the following form: +{ "username": "username", "password": "password", "ClientID": "####", "ClientSecret": "XXXXXXXXXXXXXXXXXXX" }" +""") + parser.add_argument('--username', help="Username of your IDT account (if not using JSON credentials)") + parser.add_argument('--password', help="Password of your IDT account (if not using JSON credentials)") + parser.add_argument('--ClientID', help="ClientID of your IDT account (if not using JSON credentials)") + parser.add_argument('--ClientSecret', help="ClientSecret of your IDT account (if not using JSON credentials)") + parser.add_argument('input_file', help="Absolute path to sbol file with sequences") + parser.add_argument('output_name', help="Name of SBOL file to be written") + parser.add_argument('-t', '--file-type', dest='file_type', default=sbol3.SORTED_NTRIPLES, + help="Name of SBOL file to output to (excluding type)") + parser.add_argument('--verbose', '-v', dest='verbose', action='count', default=0) + args_dict = vars(parser.parse_args()) + + # Extract arguments: + verbosity = args_dict['verbose'] + logging.getLogger().setLevel(level=(logging.WARN if verbosity == 0 else + logging.INFO if verbosity == 1 else 
logging.DEBUG)) + input_file = args_dict['input_file'] + output_name = args_dict['output_name'] + + if args_dict['credentials'] != None: + with open(args_dict['credentials']) as credentials: + idt_accessor = IDTAccountAccessor.from_json(json.load(credentials)) + else: + idt_accessor = IDTAccountAccessor(args_dict['username'], args_dict['password'], args_dict['ClientID'], + args_dict['ClientSecret']) + + extension = type_to_standard_extension[args_dict['file_type']] + outfile_name = output_name if output_name.endswith(extension) else output_name + extension + + # Read file, convert, and write resulting document + logging.info('Reading SBOL file ' + input_file) + doc = sbol3.Document() + doc.read(input_file) + results = idt_calculate_complexity_scores(idt_accessor, doc) + doc.write(outfile_name, args_dict['file_type']) + logging.info('SBOL file written to %s with %i new scores calculated', outfile_name, len(results)) + + +if __name__ == '__main__': + main() diff --git a/sbol_utilities/conversion.py b/sbol_utilities/conversion.py index e19141ce..225c55b7 100644 --- a/sbol_utilities/conversion.py +++ b/sbol_utilities/conversion.py @@ -15,6 +15,8 @@ from sbol_utilities.helper_functions import strip_sbol2_version, GENETIC_DESIGN_FILE_TYPES, \ find_top_level +from sbol_utilities.sbol3_genbank_conversion import GenBankSBOL3Converter +import sbol_utilities.sbol3_sbol2_conversion from sbol_utilities.workarounds import id_sort # sbol javascript executable based on https://github.com/sboltools/sbolgraph @@ -68,13 +70,18 @@ def convert_identities2to3(sbol3_data: str) -> str: return g.serialize(format="xml") -def convert2to3(sbol2_doc: Union[str, sbol2.Document], namespaces=None) -> sbol3.Document: +def convert2to3(sbol2_doc: Union[str, sbol2.Document], namespaces=None, use_native_converter: bool = False) \ + -> sbol3.Document: """Convert an SBOL2 document to an equivalent SBOL3 document :param sbol2_doc: Document to convert :param namespaces: list of URI prefixes to treat as 
namespaces + :param use_native_converter: if true, use experimental Python converter instead of JavaScript call-out :return: equivalent SBOL3 document """ + if use_native_converter: + return sbol_utilities.sbol3_sbol2_conversion.convert2to3(sbol2_doc, namespaces) + # if we've started with a Document in memory, write it to a temp file if namespaces is None: namespaces = [] @@ -166,12 +173,16 @@ def change_orientation(o): return doc -def convert3to2(doc3: sbol3.Document) -> sbol2.Document: +def convert3to2(doc3: sbol3.Document, use_native_converter: bool = False) -> sbol2.Document: """Convert an SBOL3 document to an equivalent SBOL2 document :param doc3: Document to convert + :param use_native_converter: if true, use experimental Python converter instead of JavaScript call-out :return: equivalent SBOL2 document """ + if use_native_converter: + return sbol_utilities.sbol3_sbol2_conversion.convert3to2(doc3) + # TODO: remove workarounds after conversion errors fixed in https://github.com/sboltools/sbolgraph/issues/16 # remap sequence encodings: encoding_remapping = { @@ -293,7 +304,7 @@ def convert_from_fasta(path: str, namespace: str, identity_map: Dict[str, str] = # TODO: Figure out how to support multiple namespaces like we do for FASTA: currently, importing from multiple # namespaces will not work correctly -def convert_from_genbank(path: str, namespace: str, allow_genbank_online: bool = False) -> sbol3.Document: +def convert_from_genbank(path: str, namespace: str, allow_genbank_online: bool = False, force_new_converter: bool = False) -> sbol3.Document: """Convert a GenBank document on disk into an SBOL3 document Specifically, the GenBank document is first imported to SBOL2, then converted from SBOL2 to SBOL3 @@ -302,6 +313,9 @@ def convert_from_genbank(path: str, namespace: str, allow_genbank_online: bool = :param allow_genbank_online: Use the online converter, rather than the local converter :return: SBOL3 document containing converted materials """ + if 
force_new_converter: + converter = GenBankSBOL3Converter() + return converter.convert_genbank_to_sbol3(gb_file=path, namespace=namespace, write=False) doc2 = sbol2.Document() sbol2.setHomespace(namespace) # Convert document offline @@ -316,7 +330,7 @@ def convert_from_genbank(path: str, namespace: str, allow_genbank_online: bool = return doc -def convert_to_genbank(doc3: sbol3.Document, path: str, allow_genbank_online: bool = False) \ +def convert_to_genbank(doc3: sbol3.Document, path: str, allow_genbank_online: bool = False, force_new_converter: bool = False) \ -> List[SeqRecord.SeqRecord]: """Convert an SBOL3 document to a GenBank file, which is written to disk Note that for compatibility with version control software, if no prov:modified term is available on each Component, @@ -327,6 +341,10 @@ def convert_to_genbank(doc3: sbol3.Document, path: str, allow_genbank_online: bo :param allow_genbank_online: use the online converter rather than the local converter :return: BioPython SeqRecord of the GenBank that was written """ + if force_new_converter: + converter = GenBankSBOL3Converter() + result_dict = converter.convert_sbol3_to_genbank(sbol3_file=None, doc=doc3, gb_file=path, write=True) + return result_dict["seqrecords"] # first convert to SBOL2, then export to a temp GenBank file doc2 = convert3to2(doc3) @@ -395,7 +413,7 @@ def command_line_converter(args_dict: Dict[str, Any]): if input_file_type == 'FASTA': doc3 = convert_from_fasta(input_file, namespace) elif input_file_type == 'GenBank': - doc3 = convert_from_genbank(input_file, namespace, args_dict['allow_genbank_online']) + doc3 = convert_from_genbank(input_file, namespace, args_dict['allow_genbank_online'], args_dict['force_new_converter']) elif input_file_type == 'SBOL2': doc2 = sbol2.Document() doc2.read(input_file) @@ -411,7 +429,7 @@ def command_line_converter(args_dict: Dict[str, Any]): if output_file_type == 'FASTA': convert_to_fasta(doc3, output_file) elif output_file_type == 'GenBank': - 
convert_to_genbank(doc3, output_file, args_dict['allow_genbank_online']) + convert_to_genbank(doc3, output_file, args_dict['allow_genbank_online'], args_dict['force_new_converter']) elif output_file_type == 'SBOL2': doc2 = convert3to2(doc3) validate_online = sbol2.Config.getOption(sbol2.ConfigOptions.VALIDATE_ONLINE) @@ -440,6 +458,8 @@ def main(): help="Print running explanation of conversion process") parser.add_argument('--allow-genbank-online', dest='allow_genbank_online', action='store_true', default=False, help='Perform GenBank conversion using online converter') + parser.add_argument('--force-new-converter', dest='force_new_converter', action='store_true', default=False, + help='Force the usage of new (offline) converter instead of legacy (online) converter.') args_dict = vars(parser.parse_args()) # Call the shared command-line conversion routine command_line_converter(args_dict) @@ -474,6 +494,8 @@ def genbank2sbol(): help='Print running explanation of conversion process') parser.add_argument('--allow-genbank-online', dest='allow_genbank_online', action='store_true', default=False, help='Perform GenBank conversion using online converter') + parser.add_argument('--force-new-converter', dest='force_new_converter', action='store_true', default=False, + help='Force the usage of new (offline) converter instead of legacy (online) converter.') args_dict = vars(parser.parse_args()) args_dict['input_file_type'] = 'GenBank' args_dict['output_file_type'] = 'SBOL3' @@ -524,6 +546,8 @@ def sbol2genbank(): help="Print running explanation of conversion process") parser.add_argument('--allow-genbank-online', dest='allow_genbank_online', action='store_true', default=False, help='Perform GenBank conversion using online converter') + parser.add_argument('--force-new-converter', dest='force_new_converter', action='store_true', default=False, + help='Force the usage of new (offline) converter instead of legacy (online) converter.') args_dict = vars(parser.parse_args()) 
args_dict['input_file_type'] = 'SBOL3' args_dict['output_file_type'] = 'GenBank' diff --git a/sbol_utilities/gb2so.csv b/sbol_utilities/gb2so.csv new file mode 100644 index 00000000..764feb03 --- /dev/null +++ b/sbol_utilities/gb2so.csv @@ -0,0 +1,81 @@ +GenBank_Ontology,SO_Ontology +allele,SO:0001023 +attenuator,SO:0000140 +C_region,SO:0001834 +CAAT_signal,SO:0000172 +CDS,SO:0000316 +D-loop,SO:0000297 +D_segment,SO:0000458 +enhancer,SO:0000165 +exon,SO:0000147 +gene,SO:0000704 +GC_signal,SO:0000173 +iDNA,SO:0000723 +intron,SO:0000188 +J_region,SO:0000470 +LTR,SO:0000286 +mat_peptide,SO:0000419 +misc_binding,SO:0000409 +misc_difference,SO:0000413 +misc_feature,SO:0000001 +misc_marker,SO:0001645 +misc_recom, +misc_RNA,SO:0000233 +misc_signal,SO:0001411 +misc_structure,SO:0000002 +modified_base,SO:0000305 +mRNA,SO:0000234 +N_region,SO:0001835 +polyA_signal,SO:0000551 +polyA_site,SO:0000553 +precursor_RNA,SO:0000185 +prim_transcript,SO:0000185 +primer,SO:0000112 +primer_bind,SO:0005850 +promoter,SO:0000167 +protein_bind,SO:0000410 +RBS,SO:0000139 +rep_origin,SO:0000296 +repeat_region,SO:0000657 +repeat_unit,SO:0000726 +rRNA,SO:0000252 +S_region,SO:0001836 +satellite,SO:0000005 +scRNA,SO:0000013 +sig_peptide,SO:0000418 +snRNA,SO:0000274 +source,SO:0000149 +stem_loop,SO:0000313 +STS,SO:0000331 +TATA_signal,SO:0000174 +terminator,SO:0000141 +transit_peptide,SO:0000725 +transposon,SO:0001054 +tRNA,SO:0000253 +V_region,SO:0001833 +variation,SO:0001060 +-10_signal,SO:0000175 +-35_signal,SO:0000176 +3'clip,SO:0000557 +3'UTR,SO:0000205 +5'clip,SO:0000555 +5'UTR,SO:0000204 +regulatory,SO:0005836 +snoRNA,SO:0000275 +assembly_gap,SO:0000730 +gap,SO:0000730 +centromere,SO:0000577 +J_segment,SO:0000470 +J_gene_segment,SO:0000470 +mobile_element,SO:0001037 +mobile_genetic_element,SO:0001037 +ncRNA,SO:0000655 +operon,SO:0000178 +oriT,SO:0000724 +propeptide,SO:0001062 +telomere,SO:0000624 +tmRNA,SO:0000584 +unsure,SO:0001086 +sequence_uncertainty,SO:0001086 +V_segment,SO:0000466 
+V_gene_segment,SO:0000466 diff --git a/sbol_utilities/sbol3_genbank_conversion.py b/sbol_utilities/sbol3_genbank_conversion.py new file mode 100644 index 00000000..c7182755 --- /dev/null +++ b/sbol_utilities/sbol3_genbank_conversion.py @@ -0,0 +1,858 @@ +import os +import csv +import math +import sbol3 +import logging +from collections import OrderedDict + +import tyto +from typing import Dict, List, Sequence, Union, Optional +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Bio.SeqFeature import SeqFeature, FeatureLocation, Reference, \ + CompoundLocation, BeforePosition, ExactPosition, AfterPosition + +from sbol_utilities.workarounds import tyto_normalize_term + + +class GenBankSBOL3Converter: + """Main Converter class handling offline, direction conversion of files SBOL3 files to and from GenBank files""" + # dictionaries to store feature lookups for terms in GenBank and SO ontologies + gb2so_map = {} + so2gb_map = {} + # Conversion Constants : + # TODO: Temporarily assuming only dna components to be dealt with in genbank files + COMP_TYPES = [sbol3.SBO_DNA] + # TODO: Temporarily assuming components to only have the engineered_region role + COMP_ROLES = [sbol3.SO_ENGINEERED_REGION] + # TODO: Temporarily encoding sequence objects in IUPAC mode only + SEQUENCE_ENCODING = sbol3.IUPAC_DNA_ENCODING + # BIO_STRAND constants, which server as the GenBank counterparts to SBOL3's inline and reverse orientations + BIO_STRAND_FORWARD = 1 + BIO_STRAND_REVERSE = -1 + # Mapping int types to the types of locationPositions in GenBank (Before/After/Exact) + SBOL_LOCATION_POSITION = {BeforePosition: 0, ExactPosition: 1, AfterPosition: 2} + GENBANK_LOCATION_POSITION = {0: BeforePosition, 1: ExactPosition, 2: AfterPosition} + # Default value for the "sequence_version" annotation in GenBank files + DEFAULT_GB_SEQ_VERSION = 1 + # Default terms for SBOL3 and GenBank in case the feature lookup from + # respective dictionaries does not yield any 
ontology term + DEFAULT_SO_TERM = "SO:0000110" + DEFAULT_GB_TERM = "misc_feature" + # Namespace to be used be default if not provided, and also for all unit tests related to this converter + TEST_NAMESPACE = "https://test.sbol3.genbank/" + # File locations for required CSV data files which store the ontology term + # translations between GenBank and SO ontologies + GB2SO_MAPPINGS_CSV = os.path.join(os.path.dirname(os.path.realpath(__file__)), "gb2so.csv") + SO2GB_MAPPINGS_CSV = os.path.join(os.path.dirname(os.path.realpath(__file__)), "so2gb.csv") + + def __init__(self) -> None: + """While instantiating an instance of the converter, required builders + must be registered in order to accurately parse modified or new SBOL3 class objects + """ + def build_component_genbank_extension(*, identity, type_uri) -> GenBankSBOL3Converter.ComponentGenBankExtension: + """A builder function to be called by the SBOL3 parser + when it encounters a Component in an SBOL file. + :param identity: identity for new component class instance to have + :param type_uri: type_uri for new component class instance to have + """ + # `types` is required and not known at build time. + # Supply a missing value to the constructor, then clear + # the missing value before returning the built object. + obj = self.ComponentGenBankExtension(identity=identity, types=[sbol3.PYSBOL3_MISSING], type_uri=type_uri) + # Remove the placeholder value + obj.clear_property(sbol3.SBOL_TYPE) + return obj + + def build_feature_qualifiers_extension(*, identity, type_uri) -> GenBankSBOL3Converter.FeatureGenBankExtension: + """A builder function to be called by the SBOL3 parser + when it encounters a SequenceFeature in an SBOL file. + :param identity: identity for new feature qualifier class instance to have + :param type_uri: type_uri for new feature qualifier class instance to have + """ + # `types` is required and not known at build time. 
+ # Supply a missing value to the constructor, then clear + # the missing value before returning the built object. + obj = self.FeatureGenBankExtension(identity=identity, type_uri=type_uri) + # Remove the placeholder value + obj.clear_property(sbol3.SBOL_TYPE) + return obj + + def build_location_extension(*, identity, type_uri) -> GenBankSBOL3Converter.LocationGenBankExtension: + """A builder function to be called by the SBOL3 parser + when it encounters a Custom location in an SBOL file. + :param identity: identity for new Location class instance to have + :param type_uri: type_uri for new Location class instance to have + """ + # `types` is required and not known at build time. + # Supply a missing value to the constructor, then clear + # the missing value before returning the built object. + obj = self.LocationGenBankExtension(identity=identity, type_uri=type_uri) + # Remove the placeholder value + # obj.clear_property(sbol3.SBOL_TYPE) + return obj + + def build_custom_reference_property(*, identity, type_uri) -> GenBankSBOL3Converter.CustomReferenceProperty: + """A builder function to be called by the SBOL3 parser + when it encounters a CustomReferenceProperty Toplevel object in an SBOL file. + :param identity: identity for custom reference property instance to have + :param type_uri: type_uri for custom reference property instance to have + """ + obj = self.CustomReferenceProperty(identity=identity, type_uri=type_uri) + return obj + + def build_custom_structured_comment_property( + *, identity, type_uri) -> GenBankSBOL3Converter.CustomStructuredCommentProperty: + """A builder function to be called by the SBOL3 parser + when it encounters a CustomStructuredCommentProperty Toplevel object in an SBOL file. 
class CustomReferenceProperty(sbol3.CustomIdentified):
    """Custom SBOL3 object holding the annotations of one GenBank 'Reference'.

    The values are recorded while a GenBank file is converted to SBOL3 so that
    they can be written back out when the document is converted to GenBank again.
    :extends: sbol3.CustomIdentified class
    """
    CUSTOM_REFERENCE_NS = "http://www.ncbi.nlm.nih.gov/genbank#GenbankReference"

    def __init__(self, type_uri=CUSTOM_REFERENCE_NS, identity=None):
        """Create the reference object and declare its properties.

        :param type_uri: RDF type of the object, defaults to CUSTOM_REFERENCE_NS
        :param identity: identity for the new object, or None
        """
        super().__init__(identity=identity, type_uri=type_uri)
        namespace = self.CUSTOM_REFERENCE_NS

        def optional_text(field):
            # every scalar field of a reference is an optional (0..1) text value
            return sbol3.TextProperty(self, f"{namespace}#{field}", 0, 1)

        self.authors = optional_text("authors")
        self.comment = optional_text("comment")
        self.journal = optional_text("journal")
        self.consrtm = optional_text("consrtm")
        self.title = optional_text("title")
        self.medline_id = optional_text("medline_id")
        self.pubmed_id = optional_text("pubmed_id")
        # display id of the component this reference belongs to
        self.component = optional_text("component")
        # TODO: support cut locations?
        # A reference may describe several locations, so the upper bound is
        # unbounded, which makes this a list-valued property.
        self.location = sbol3.OwnedObject(
            self, f"{namespace}#location", 0, math.inf, type_constraint=sbol3.Range)
class FeatureGenBankExtension(sbol3.SequenceFeature):
    """SequenceFeature subclass that also carries GenBank feature qualifiers.

    Qualifiers have no home in a plain SBOL3 field, so their keys and values
    are kept in two list-valued text properties declared here.
    :extends: sbol3.SequenceFeature class
    """
    GENBANK_FEATURE_QUALIFIER_NS = "http://www.ncbi.nlm.nih.gov/genbank#featureQualifier"

    def __init__(self, locations: List[sbol3.Location] = None, **kwargs) -> None:
        """Build the underlying SequenceFeature and declare the qualifier properties.

        :param locations: locations of the feature; None is treated as an empty list
        :param kwargs: forwarded unchanged to sbol3.SequenceFeature
        """
        # a fresh list is created per call when no locations are supplied
        super().__init__(locations=[] if locations is None else locations, **kwargs)
        namespace = self.GENBANK_FEATURE_QUALIFIER_NS
        # qualifier keys and values are stored as two unbounded text lists
        self.qualifier_key = sbol3.TextProperty(self, f"{namespace}#key", 0, math.inf)
        self.qualifier_value = sbol3.TextProperty(self, f"{namespace}#value", 0, math.inf)
class ComponentGenBankExtension(sbol3.Component):
    """Component subclass with extra properties for GenBank record metadata.

    GenBank records carry header information that has no equivalent SBOL3
    field; the properties declared here give that information a place to live.
    :extends: sbol3.Component class
    """
    GENBANK_EXTRA_PROPERTY_NS = "http://www.ncbi.nlm.nih.gov/genbank"

    def __init__(self, identity: str, types: Optional[Union[str, Sequence[str]]], **kwargs) -> None:
        """Build the underlying Component and declare the GenBank properties.

        :param identity: identity of the component
        :param types: component type URI(s), passed through to sbol3.Component
        :param kwargs: forwarded unchanged to sbol3.Component
        """
        super().__init__(identity=identity, types=types, **kwargs)
        namespace = self.GENBANK_EXTRA_PROPERTY_NS

        def optional_text(field):
            # optional (0..1) text property under the GenBank namespace
            return sbol3.TextProperty(self, f"{namespace}#{field}", 0, 1)

        def owned_list(field, constraint):
            # unbounded list of child objects restricted to the given type
            return sbol3.OwnedObject(self, f"{namespace}#{field}", 0, math.inf,
                                     type_constraint=constraint)

        # Single-valued header fields of the GenBank record
        self.genbank_seq_version = sbol3.IntProperty(self, f"{namespace}#seq_version", 0, 1)
        self.genbank_name = optional_text("name")
        self.genbank_date = optional_text("date")
        self.genbank_division = optional_text("division")
        self.genbank_locus = optional_text("locus")
        self.genbank_molecule_type = optional_text("molecule")
        self.genbank_organism = optional_text("organism")
        self.genbank_source = optional_text("source")
        self.genbank_topology = optional_text("topology")
        self.genbank_gi = optional_text("gi")
        self.genbank_comment = optional_text("comment")
        self.genbank_dblink = optional_text("dbxrefs")
        self.genbank_record_id = optional_text("id")
        # TODO : add note linking issue here
        self.genbank_taxonomy = optional_text("taxonomy")
        self.genbank_keywords = optional_text("keywords")
        # A record may list several accessions, hence the unbounded upper limit
        self.genbank_accessions = sbol3.TextProperty(self, f"{namespace}#accession", 0, math.inf)
        # List-valued child objects
        self.fuzzy_features = owned_list("fuzzyFeature", sbol3.SequenceFeature)
        self.genbank_references = owned_list(
            "reference", GenBankSBOL3Converter.CustomReferenceProperty)
        self.genbank_structured_comments = owned_list(
            "structuredComment", GenBankSBOL3Converter.CustomStructuredCommentProperty)
def convert_genbank_to_sbol3(self, gb_file: str, sbol3_file: str = "sbol3.nt", namespace: str = TEST_NAMESPACE,
                             write: bool = False) -> sbol3.Document:
    """Convert a GenBank document on disk into an SBOL3 document.

    The GenBank document is parsed using BioPython, and for every record a Component
    (of the extended GenBank class) and a Sequence are created in a new SBOL3 document.

    :param gb_file: path to read GenBank file from
    :param sbol3_file: path to write SBOL3 file to, if write set to true
    :param namespace: URIs of Components will be set to {namespace}/{genbank_id},
                      defaults to "https://test.sbol3.genbank/"
    :param write: writes the generated sbol3 document in SORTED_NTRIPLES
                  format to provided sbol3_file path
    :return: SBOL3 document containing converted materials
    :raises ValueError: if the GenBank to SO role mapping CSV could not be loaded
    """
    # create sbol3 document, and record parser handler for gb file
    sbol3.set_namespace(namespace)
    doc = sbol3.Document()
    # create updated py dict to store mappings between gb and so ontologies
    logging.debug("Creating GenBank and SO ontologies mappings for sequence feature roles")
    map_created = self.create_gb2so_role_mappings(gb2so_csv=self.GB2SO_MAPPINGS_CSV, convert_so2gb=False)
    if not map_created:
        # TODO: Need better SBOL3-GenBank specific error classes in future
        raise ValueError("Required CSV data files are not present in your package.\n "
                         "Please reinstall the sbol_utilities package.\n Stopping current conversion process.\n "
                         "Reverting to legacy converter if new Conversion process is not forced.")
    # access records by parsing gb file using SeqIO class
    logging.debug("Parsing Genbank records using SeqIO class, using GenBank file %s", gb_file)
    # Iterate the object returned by SeqIO.parse directly: that is its documented
    # interface, whereas the `.records` attribute is an implementation detail.
    for record in SeqIO.parse(gb_file, "genbank"):
        # TODO: Currently we assume only linear or circular topology is possible
        logging.debug("Parsing record - `%s` in genbank file.", record.id)
        topology = "linear"
        if "topology" in record.annotations:
            topology = record.annotations["topology"]
        # sometimes topology is specified in the 'data_file_division' field;
        # .get() keeps records that have neither annotation on the linear default
        elif record.annotations.get('data_file_division') in ['circular', 'linear']:
            topology = record.annotations['data_file_division']
        if topology == "linear":
            extra_comp_types = [sbol3.SO_LINEAR]
        else:
            extra_comp_types = [sbol3.SO_CIRCULAR]
        # creating component extended Component class to include GenBank extraneous properties
        comp = self.ComponentGenBankExtension(identity=sbol3.string_to_display_id(record.name),
                                              types=self.COMP_TYPES + extra_comp_types, roles=self.COMP_ROLES,
                                              description=record.description)
        # since SBOL3 requires display_id to have only alphanumeric characters and start not with a number;
        # and these constraints are not present in GenBank, we pass the GenBank locus name through a filter
        # helper method ('string_to_display_id'), which conforms it to SBOL's standard, and also store the
        # original name in an extraneous property field 'genbank_name' which is reset later on during round trips.
        comp.genbank_name = record.name
        doc.add(comp)

        # TODO: Currently we use a fixed method of encoding (IUPAC)
        seq = sbol3.Sequence(identity=str(comp.display_id) + "_sequence", elements=str(record.seq.lower()),
                             encoding=self.SEQUENCE_ENCODING)
        doc.add(seq)
        comp.sequences = [seq]

        # Setting properties for GenBank's extraneous properties not settable in any SBOL3 field.
        self._store_extra_properties_in_sbol3(comp, seq, record)

        # create all sequence features, and tag all encountered feature qualifiers
        # via extended Feature_GenBank_Extension class
        self._handle_features_gb_to_sbol(record, comp, seq)

    if write:
        logging.debug("Writing created sbol3 document to disk in sorted ntriples format: %s", sbol3_file)
        doc.write(fpath=sbol3_file, file_format=sbol3.SORTED_NTRIPLES)
    return doc
Document: %s", doc) + for obj in doc.objects: + if isinstance(obj, sbol3.TopLevel): + # create a key for the top level object if it is not already parsed + if obj not in logs: + logs[obj] = False + if isinstance(obj, sbol3.Component): + logging.debug("Parsing component - `%s` in sbol3 document.", obj.display_id) + # NOTE: A single component/record cannot have multiple sequences + seq = None # If no sequence is found for a component + if obj.sequences and len(obj.sequences) == 1: + if doc.find(obj.sequences[0]): + obj_seq = doc.find(obj.sequences[0]) + seq = Seq(obj_seq.elements.upper()) + # mark the status of this top level sequence object as parsed and converted + if isinstance(obj_seq, sbol3.TopLevel): + logs[obj_seq] = True + elif len(obj.sequences) > 1: + raise ValueError(f"Component `{obj.display_id}` of given SBOL3 document has more than 1 sequence\n \ + (`{len(obj.sequences)}`). This is invalid; a component may only have 1 or 0 sequences.") + # Locus name for the sequence record is just the display id if SBOL3 component was not extended + # to include extraneous properties (in which case, we use the directly stored 'genbank_name' field) + locus_name = obj.display_id + if isinstance(obj, self.ComponentGenBankExtension) and obj.genbank_name: + locus_name = obj.genbank_name + seq_rec = SeqRecord(seq=seq, id=obj.display_id, description=obj.description or '', name=locus_name) + # Resetting extraneous genbank properties from extended component-genbank class + self._reset_extra_properties_in_genbank(obj, seq_rec) + + # recreate all sequence features, and tag all encountered feature + # qualifiers via extended Feature_GenBank_Extension class + self._handle_features_sbol_to_gb(seq_rec, obj) + + # mark the top level component object as parsed and converter + logs[obj] = True + seq_records.append(seq_rec) + # writing generated genbank document to disk at path provided + if write: + logging.debug("Writing created genbank file to disk: %s", gb_file) + 
SeqIO.write(seq_records, gb_file, "genbank") + return {"status": logs, "seqrecords": seq_records} + + def _store_extra_properties_in_sbol3(self, comp: ComponentGenBankExtension, + seq: sbol3.Sequence, record: SeqRecord) -> None: + """Helper function for setting properties for GenBank's extraneous properties not directly settable in any + SBOL3 field, using a modified, extended SBOL3 Component class, and a new CustomReferenceProperty TopLevel class. + :param comp: Instance of the extended SBOL3 Component class (Component_GenBank_Extension) + :param seq: The Sequence used in the GenBank record corresponding to sbol3 comp + :param record: GenBank SeqRecord instance for the record which contains extra properties + """ + comp.genbank_record_id = record.id + # set dblinks from the dbxrefs property of biopython + if record.dbxrefs: + # dbxrefs are parsed in a list by biopython from `record.dbxrefs`; we are storing them as a flat string + # to maintain order. Thus, we are creating a custom delimiter of `::`, by which we shall separate + # individual dbxrefs in the string and later split them to a list while resetting them in genbank + comp.genbank_dblink = "::".join(record.dbxrefs) + for annotation in record.annotations: + # Sending out warnings for genbank info not storable in sbol3 + logging.warning("Extraneous information not directly storable in SBOL3 - %s: %s", annotation, + record.annotations[annotation]) + # 1. GenBank Record Date + if annotation == 'date': + comp.genbank_date = record.annotations['date'] + # 2. GenBank Record Division + elif annotation == 'data_file_division': + # FIX for iGEM files not having data file division but topology stored in its key + if record.annotations['data_file_division'] in ['circular', 'linear']: + comp.genbank_topology = record.annotations['data_file_division'] + else: + comp.genbank_division = record.annotations['data_file_division'] + # 3. 
GenBank Record Keywords + elif annotation == 'keywords': + comp.genbank_keywords = ",".join(record.annotations['keywords']) + # 4. GenBank Record Molecule Type + elif annotation == 'molecule_type': + comp.genbank_molecule_type = record.annotations['molecule_type'] + # 5. GenBank Record Organism + elif annotation == 'organism': + comp.genbank_organism = record.annotations['organism'] + # 6. GenBank Record Source + elif annotation == 'source': + comp.genbank_source = record.annotations['source'] + # 7. GenBank Record Taxonomy + elif annotation == 'taxonomy': + comp.genbank_taxonomy = ",".join(record.annotations['taxonomy']) + # 8. GenBank Record Topology + elif annotation == 'topology': + comp.genbank_topology = record.annotations['topology'] + # 9. GenBank Record GI Property + elif annotation == 'gi': + comp.genbank_gi = record.annotations['gi'] + # 10. GenBank Record Accessions + elif annotation == 'accessions': + comp.genbank_accessions = sorted(record.annotations['accessions']) + # 11. GenBank Sequence Version + elif annotation == 'sequence_version': + comp.genbank_seq_version = record.annotations['sequence_version'] + # 12. 
GenBank Record References + elif annotation == 'references': + references = [] + for ind, reference in enumerate(record.annotations['references']): + # create a custom reference property instance for each reference + custom_reference = self.CustomReferenceProperty() + custom_reference.authors = reference.authors + custom_reference.comment = reference.comment + custom_reference.journal = reference.journal + custom_reference.title = reference.title + custom_reference.consrtm = reference.consrtm + custom_reference.medline_id = reference.medline_id + custom_reference.pubmed_id = reference.pubmed_id + for gb_loc in reference.location: + feat_loc_orientation = sbol3.SO_FORWARD + if gb_loc.strand == -1: + feat_loc_orientation = sbol3.SO_REVERSE + if gb_loc.start == gb_loc.end: + locs = sbol3.Cut(sequence=seq, at=int(gb_loc.start), orientation=feat_loc_orientation) + else: + locs = sbol3.Range(sequence=seq, start=int(gb_loc.start), + end=int(gb_loc.end), orientation=feat_loc_orientation) + custom_reference.location.append(locs) + # link the parent component for each custom reference property objects + if comp.display_id: + custom_reference.component = comp.display_id + # TODO: Raise error, no name for component + # else: + references.append(custom_reference) + comp.genbank_references = references + # 14. GenBank Record Comment + elif annotation == 'comment': + comp.genbank_comment = record.annotations['comment'] + # 15. 
def _reset_extra_properties_in_genbank(self, obj: sbol3.Component, seq_rec: SeqRecord) -> None:
    """Helper function for resetting GenBank's extraneous properties on a SeqRecord from an SBOL3 component.

    When `obj` is an instance of the extended ComponentGenBankExtension class, every GenBank property
    stored on it is copied back into `seq_rec`. Afterwards, for any component, the molecule type,
    topology and sequence version annotations are filled in with defaults if they are still missing.

    :param obj: SBOL3 component, extra properties are stored within if an instance of the extended class
    :param seq_rec: GenBank SeqRecord instance for the record in which to reset extra properties
    :raises ValueError: if the molecule type is not annotated and `obj.types` contains none of
        SBO_DNA, SBO_RNA or SBO_PROTEIN
    """
    if isinstance(obj, self.ComponentGenBankExtension):
        if obj.genbank_record_id:
            seq_rec.id = obj.genbank_record_id
        # set db links using dbxrefs property of biopython
        if obj.genbank_dblink:
            # NOTE: see comment on `_store_extra_properties_in_sbol3`'s dbxrefs section, where we describe how '::'
            # is used as a delimiter to store the dbxrefs list as a string to maintain order. Here, we split the
            # string by the same delimiter to restore the list in resetting GenBank properties.
            seq_rec.dbxrefs = str(obj.genbank_dblink).split("::")
        # 1. GenBank Record Date
        if obj.genbank_date:
            seq_rec.annotations['date'] = obj.genbank_date
        # 2. GenBank Record Division
        if obj.genbank_division:
            seq_rec.annotations['data_file_division'] = obj.genbank_division
        # 3. GenBank Record Keywords (stored as one comma-joined string)
        if obj.genbank_keywords:
            seq_rec.annotations['keywords'] = str(obj.genbank_keywords).split(",")
        # 4. GenBank Record Molecule Type
        if obj.genbank_molecule_type:
            seq_rec.annotations['molecule_type'] = obj.genbank_molecule_type
        # 5. GenBank Record Organism
        if obj.genbank_organism:
            seq_rec.annotations['organism'] = obj.genbank_organism
        # 6. GenBank Record Source
        # FIXME: Apparently, if a default source was used during in the GenBank file
        #        during conversion of GenBank -> SBOL, component.genbank_source is "",
        #        and while plugging it back in during conversion of SBOL -> GenBank, it
        #        simply prints "", whereas the default "." should have been printed
        if obj.genbank_source:
            seq_rec.annotations['source'] = obj.genbank_source
        # 7. GenBank Record taxonomy (stored as one comma-joined string)
        # TODO : link gh issue for note below
        # FIXME: Even though component.genbank_taxonomy is stored in sorted order, it
        #        becomes unsorted while retrieving from the sbol file
        if obj.genbank_taxonomy:
            seq_rec.annotations['taxonomy'] = str(obj.genbank_taxonomy).split(",")
        # 8. GenBank Record Topology
        if obj.genbank_topology:
            seq_rec.annotations['topology'] = obj.genbank_topology
        # 9. GenBank Record GI Property
        if obj.genbank_gi:
            seq_rec.annotations['gi'] = obj.genbank_gi
        # 10. GenBank Record Accessions
        if obj.genbank_accessions:
            seq_rec.annotations['accessions'] = sorted(list(obj.genbank_accessions))
        # 11. GenBank Sequence Version
        if obj.genbank_seq_version:
            seq_rec.annotations['sequence_version'] = obj.genbank_seq_version
        # 12. GenBank Record References
        if obj.genbank_references:
            record_references = []
            for reference in obj.genbank_references:
                reference_object = Reference()
                reference_object.title = reference.title
                reference_object.authors = reference.authors
                reference_object.comment = reference.comment
                reference_object.journal = reference.journal
                reference_object.consrtm = reference.consrtm
                reference_object.pubmed_id = reference.pubmed_id
                reference_object.medline_id = reference.medline_id
                for obj_feat_loc in reference.location:
                    # strand is forward unless the SBOL3 location is explicitly SO_REVERSE
                    # TODO: Raise custom converter class ERROR for any other orientation
                    feat_strand = self.BIO_STRAND_FORWARD
                    if obj_feat_loc.orientation == sbol3.SO_REVERSE:
                        feat_strand = self.BIO_STRAND_REVERSE
                    feat_loc_object = FeatureLocation(
                        start=obj_feat_loc.start,
                        end=obj_feat_loc.end,
                        strand=feat_strand,
                    )
                    reference_object.location.append(feat_loc_object)
                record_references.append(reference_object)
            seq_rec.annotations['references'] = record_references
        # 13. GenBank Record Locus
        # TODO: No explicit way to set locus via BioPython?
        # 14. GenBank Record Comments
        if obj.genbank_comment:
            seq_rec.annotations['comment'] = obj.genbank_comment
        # 15. GenBank Record Structured Comments
        if obj.genbank_structured_comments:
            comment_annotation = OrderedDict()
            for structured_comment in obj.genbank_structured_comments:
                structured_comment_object = OrderedDict()
                total_keys = len(structured_comment.structured_keys)
                # keys and values were stored as "<index>::<text>"; sorting on the numeric
                # index restores the original order and re-aligns each key with its value
                structured_keys = sorted(list(structured_comment.structured_keys),
                                         key=lambda t: int(t.split("::", 1)[0]))
                structured_values = sorted(list(structured_comment.structured_values),
                                           key=lambda t: int(t.split("::", 1)[0]))
                for ind in range(total_keys):
                    key = structured_keys[ind].split("::", 1)[1]
                    value = structured_values[ind].split("::", 1)[1]
                    structured_comment_object[key] = value
                comment_annotation[structured_comment.heading] = structured_comment_object
            seq_rec.annotations['structured_comment'] = comment_annotation
    # 4. GenBank Record Molecule Type: Set molecule type if not already annotated
    if 'molecule_type' not in seq_rec.annotations:
        if sbol3.SBO_DNA in obj.types:
            seq_rec.annotations['molecule_type'] = 'DNA'
        elif sbol3.SBO_RNA in obj.types:
            seq_rec.annotations['molecule_type'] = 'RNA'
        elif sbol3.SBO_PROTEIN in obj.types:
            seq_rec.annotations['molecule_type'] = 'protein'
        else:
            # Exceptions do not apply printf-style formatting to their arguments,
            # so the identity has to be interpolated into the message here.
            raise ValueError(f'Cannot determine molecule type for object {obj.identity}')
    # 8. GenBank Record Topology: Set topology if not already annotated
    if 'topology' not in seq_rec.annotations:
        if sbol3.SO_CIRCULAR in obj.types:
            seq_rec.annotations['topology'] = 'circular'
        else:  # either linear or not set
            seq_rec.annotations['topology'] = 'linear'
    # 11. GenBank Sequence Version: default to 1 if not already annotated, and also add version to ID
    if 'sequence_version' not in seq_rec.annotations:
        seq_rec.annotations['sequence_version'] = self.DEFAULT_GB_SEQ_VERSION
        seq_rec.id = f'{seq_rec.id}.{self.DEFAULT_GB_SEQ_VERSION}'
locs = sbol3.Cut(sequence=seq, at=int(gb_loc.start), orientation=feat_loc_orientation) + else: + # find int mappings for positions of start and end locations, + # as defined in the static class variable 'SBOL_LOCATION_POSITION' + # 0->BeforePosition, 1->ExactPosition, 2->AfterPosition + end_position = self.SBOL_LOCATION_POSITION[type(gb_loc.end)] + start_position = self.SBOL_LOCATION_POSITION[type(gb_loc.start)] + # If both start and end positions are exact positions, the + # feature location can be created simply as a range object + # Kludge truncation of fuzzy ranges (https://github.com/SynBioDex/SBOL-utilities/issues/200) + if start_position == 1 and end_position == 1 or True: + locs = sbol3.Range(sequence=seq, orientation=feat_loc_orientation, end=int(gb_loc.end), + # add 1 to start, as BioPython parses GenBank start locations as 0-indexed + start=int(gb_loc.start) + 1) + # If either or both of start and end locations are fuzzy, then + # the location object needs to be of the custom class 'Location_GenBank_Extension' + else: + locs = self.LocationGenBankExtension(sequence=seq, orientation=feat_loc_orientation) + # start and end int positions specified + locs.end = int(gb_loc.end) + # add 1, as BioPython parses GenBank start locations as 0-indexed instead of 1-indexed + locs.start = int(gb_loc.start) + 1 + # storing location types in IntProperties of SBOL3 + locs.end_position = end_position + locs.start_position = start_position + # if any of the location endpoints of a feature (start/end) has a fuzzy end + # (i.e., not Exact position) like BeforePosition/AfterPosition, we mark the + # feature as a 'fuzzy_feature' which decides whether to store the feature or not + if not fuzzy_feature and locs.end_position != 1 or locs.start_position != 1: + fuzzy_feature = True + feat_locations.append(locs) + # Obtain sequence feature role from Genbank to SO type mappings + feat_role = sbol3.SO_NS[:-3] + if self.gb2so_map.get(gb_feat.type): + feat_role += 
def _handle_features_sbol_to_gb(self, seq_rec: SeqRecord, obj: ComponentGenBankExtension) -> None:
    """Helper function for resetting sequence features and their qualifiers to GenBank,
    by using a modified, extended SBOL3 Sequence Feature class - Feature_GenBank_Extension.

    Every `sbol3.SequenceFeature` of the component (plus its `fuzzy_features`, when the component is a
    `ComponentGenBankExtension`) is turned into a BioPython `SeqFeature`. The converted features are
    sorted and stored in `seq_rec.features`. Features of any other kind are skipped.

    :param seq_rec: GenBank SeqRecord instance for the record which contains sequence features
    :param obj: Instance of the SBOL3 Component
    :raises ValueError: if a feature location has an orientation other than forward/inline or
        reverse/reverse-complement
    """
    # for round trip conversion, consider all features - exact and fuzzy ones too
    all_features = list(obj.features)
    if isinstance(obj, self.ComponentGenBankExtension):
        all_features += list(obj.fuzzy_features)
    # nothing to convert: leave seq_rec.features untouched
    # (checked after collecting fuzzy features, so a component holding only fuzzy features is still converted)
    if not all_features:
        return
    seq_rec_features = []
    # converting all sequence features
    for obj_feat in all_features:
        # TODO: Also add ability to parse subcomponent feature type
        # Note: Currently we only parse sequence features from sbol3 to genbank
        if not isinstance(obj_feat, sbol3.SequenceFeature):
            continue
        logging.debug("Parsing feature `%s` for component `%s`", obj_feat.name, obj.display_id)
        # TODO: There may be multiple locations for a feature from sbol3;
        #       add ability to parse them into a single genbank feature
        feat_loc_parts = []
        for obj_feat_loc in obj_feat.locations:
            # feature strand value which denotes orientation of the location of the feature
            # By default its 1 for SO_FORWARD orientation of sbol3 feature location, and -1 for SO_REVERSE
            feat_strand = self.BIO_STRAND_FORWARD
            if obj_feat_loc.orientation in {sbol3.SO_REVERSE, sbol3.SBOL_REVERSE_COMPLEMENT}:
                feat_strand = self.BIO_STRAND_REVERSE
            elif obj_feat_loc.orientation not in {sbol3.SO_FORWARD, sbol3.SBOL_INLINE}:
                # TODO: Raise custom converter class ERROR
                # Built with implicit concatenation so that no source indentation leaks into the message
                raise ValueError(
                    f"Location orientation: `{obj_feat_loc.orientation}` for feature: "
                    f"`{obj_feat.name}` of component: `{obj.display_id}` is not a valid orientation.\n"
                    f"Valid orientations are `{sbol3.SO_FORWARD}`, `{sbol3.SO_REVERSE}`, "
                    f"`{sbol3.SBOL_INLINE}`, `{sbol3.SBOL_REVERSE_COMPLEMENT}`")
            # creating start and end Positions
            end_position = ExactPosition(obj_feat_loc.end)
            # subtract 1, as BioPython parses GenBank start locations as 0-indexed instead of 1-indexed
            start_position = ExactPosition(int(obj_feat_loc.start) - 1)
            # if custom range object, check for position being Before / After Positions
            if isinstance(obj_feat_loc, self.LocationGenBankExtension):
                # change end and start Positions only if user has made integer entries into them
                if obj_feat_loc.end_position is not None:
                    position_class = self.GENBANK_LOCATION_POSITION[obj_feat_loc.end_position]
                    end_position = position_class(obj_feat_loc.end)
                if obj_feat_loc.start_position is not None:
                    position_class = self.GENBANK_LOCATION_POSITION[obj_feat_loc.start_position]
                    # subtract 1, as BioPython parses GenBank start locations as 0-indexed
                    start_position = position_class(int(obj_feat_loc.start) - 1)
            feat_loc_parts.append(FeatureLocation(start=start_position, end=end_position, strand=feat_strand))
        # sort feature locations lexicographically internally first
        # NOTE: If the feature location has an outer "complement" location
        #       operator, the sort needs to be in reverse order
        feat_loc_parts.sort(key=lambda loc: (loc.start, loc.end, loc.strand),
                            reverse=(obj_feat.orientation == sbol3.SO_REVERSE))
        # flat [start, end, start, end, ...] list, used below as the primary sort key between features
        feat_loc_positions = []
        for location in feat_loc_parts:
            feat_loc_positions += [location.start, location.end]
        # TODO: decide what to do when a feature has no location at all; for now the location stays None
        feat_loc_object = None
        if len(feat_loc_parts) > 1:
            feat_loc_object = CompoundLocation(parts=feat_loc_parts, operator="join")
        elif len(feat_loc_parts) == 1:
            feat_loc_object = feat_loc_parts[0]

        # FIXME: order of features not same as original genbank doc?
        # Obtain sequence feature role from Sequence Ontology to GenBank role mappings
        so_roles = list(filter(None, (tyto_normalize_term(tyto.SO, role) for role in obj_feat.roles)))
        feat_role = self.DEFAULT_GB_TERM
        if so_roles:
            if len(so_roles) > 1:
                logging.warning('Found multiple SequenceOntology roles %s for feature %s, using first '
                                'for mapping to GenBank term', str(so_roles), obj_feat.identity)
            if self.so2gb_map.get(so_roles[0]):
                feat_role = self.so2gb_map[so_roles[0]]
            else:
                logging.warning('Feature role %s (%s) for feature %s, has no corresponding ontology term for '
                                'GenBank, using the default GenBank term, %s', so_roles[0],
                                tyto.SO.get_term_by_uri(so_roles[0]), obj_feat.identity, self.DEFAULT_GB_TERM)
        else:
            logging.warning('No SequenceOntology roles found for feature %s, using the default GenBank term, %s',
                            obj_feat.identity, self.DEFAULT_GB_TERM)
        # create sequence feature object with label qualifier
        # TODO: create issue for presence of genbank file with features without the "label" qualifier
        # TODO: feat_strand value ambiguous in case of multiple locations?
        feature = SeqFeature(location=feat_loc_object, type=feat_role)
        feature.loc_positions = feat_loc_positions
        if isinstance(obj_feat, self.FeatureGenBankExtension):
            # qualifier keys and values are stored as "<index>:<text>"; sorting both lists by index
            # restores the original pairing and order
            keys = sorted(obj_feat.qualifier_key, key=lambda x: int(x.split(":", 1)[0]))
            values = sorted(obj_feat.qualifier_value, key=lambda x: int(x.split(":", 1)[0]))
            for key, value in zip(keys, values):
                feature.qualifiers[key.split(":", 1)[1]] = value.split(":", 1)[1]
        if obj_feat.name:
            feature.qualifiers['label'] = obj_feat.name
        seq_rec_features.append(feature)

    # Sort features based on feature location start/end, lexicographically, and then by
    # strand / number of qualifiers / type of feature string comparison
    # NOTE(review): strand is read from the location; the SeqFeature.strand shortcut appears to be
    # deprecated in recent Biopython releases - confirm against the pinned version
    seq_rec_features.sort(key=lambda feat: (feat.loc_positions, feat.location.strand,
                                            len(feat.qualifiers), feat.type))
    seq_rec.features = seq_rec_features
def _convert(self, doc3: sbol3.Document):
    """Run the SBOL3 -> SBOL2 conversion of `doc3` into `self.doc2`.

    Binds the extra namespace prefixes on the target document, temporarily switches off pySBOL2's
    compliant-URI option and clears its homespace, visits `doc3`, and finally restores both settings
    (also when the visit raises).

    :param doc3: SBOL3 document to convert
    """
    # Bind standard namespaces that aren't bound by default in pySBOL2
    prefix_bindings = (
        (BACKPORT_NAMESPACE, 'backport'),
        (sbol3.PROV_NS, 'prov'),
        (sbol3.OM_NS, 'om'),
        ('http://purl.org/dc/terms/', 'dcterms'),
    )
    for namespace_uri, prefix in prefix_bindings:
        self.doc2.addNamespace(namespace_uri, prefix)

    # Override parameters that will otherwise interfere in conversion, saving old values
    compliance_option = sbol2.ConfigOptions.SBOL_COMPLIANT_URIS.value
    previous_compliance = sbol2.Config.getOption(compliance_option)
    sbol2.Config.setOption(compliance_option, False)
    previous_homespace = sbol2.getHomespace()
    sbol2.setHomespace('')

    # Try conversion, resetting saved parameter values afterward
    try:
        doc3.accept(self)
        # TODO: make sure that complex extension objects (e.g., from SBOLFactory) are properly converted
        # TODO: make sure that unhandled SBOL child objects / properties will throw errors
        # TODO: check if we need to add post-creation fix-up of links, to ensure they point to objects
    finally:
        sbol2.Config.setOption(compliance_option, previous_compliance)
        sbol2.setHomespace(previous_homespace)
def _convert_toplevel(self, obj3: sbol3.TopLevel, obj2: sbol2.TopLevel):
    """Copy the TopLevel-specific properties of `obj3` onto `obj2`.

    Delegates the Identified properties to `_convert_identified`, then copies the attachment
    identities and records the SBOL3 namespace under the backport property so it can be restored
    when converting back.

    :param obj3: SBOL3 source object
    :param obj2: SBOL2 target object
    """
    self._convert_identified(obj3, obj2)
    attachment_ids = []
    for attachment in obj3.attachments:
        attachment_ids.append(attachment.identity)
    obj2.attachments = attachment_ids
    # stored as a single-element list holding the namespace as a URIRef
    namespace_ref = URIRef(obj3.namespace)
    obj2.properties[BACKPORT3_NAMESPACE] = [namespace_ref]
def visit_activity(self, act3: sbol3.Activity):
    """Convert an SBOL3 Activity into an SBOL2 Activity and add it to the target document.

    :param act3: SBOL3 Activity to convert
    :raises NotImplementedError: if the activity has more than one type, or has any usage or
        association children (neither can be converted yet)
    """
    # Reject what cannot be converted before touching the target document, so that a failed
    # conversion does not leave a half-built Activity behind in doc2
    if act3.types and len(act3.types) > 1:
        raise NotImplementedError('Conversion of multi-type Activities to SBOL2 not yet implemented: '
                                  'pySBOL2 currently supports a maximum of one type per activity. '
                                  'Bug: https://github.com/SynBioDex/pySBOL2/issues/428')
    if act3.usage or act3.association:
        raise NotImplementedError('Conversion of Activity usage and association properties to SBOL2 '
                                  'not yet implemented, due to visitors failing to return values. '
                                  'Bug: https://github.com/SynBioDex/pySBOL3/issues/437')
    # Make the Activity object and add it to the document
    act2 = sbol2.Activity(act3.identity, version=self._sbol2_version(act3))
    self.doc2.activities.add(act2)
    if act3.types:
        act2.types = act3.types[0]  # Take first type from list of length 1
    act2.startedAtTime = act3.start_time
    act2.endedAtTime = act3.end_time
    # Both lists are empty here (checked above); kept so the mapping is in place once visitors return values
    act2.usages = [usage.accept(self) for usage in act3.usage]
    act2.associations = [assoc.accept(self) for assoc in act3.association]
    # TODO: pySBOL3 is currently missing wasInformedBy (https://github.com/SynBioDex/pySBOL3/issues/436
    # act2.wasInformedBy = act3.informed_by
    # Map over all other TopLevel properties and extensions not covered by the constructor
    self._convert_toplevel(act3, act2)
def visit_component(self, cp3: sbol3.Component):
    """Convert an SBOL3 Component into an SBOL2 ComponentDefinition and add it to the target document.

    :param cp3: SBOL3 Component to convert
    :raises NotImplementedError: if the component has features, interactions, constraints,
        an interface, or models (none of these can be converted yet)
    """
    # Remap type if it's one of the ones that needs remapping; otherwise pass through unchanged
    sbo_to_biopax = {
        sbol3.SBO_DNA: sbol2.BIOPAX_DNA,  # TODO: distinguish BioPAX Dna from DnaRegion
        sbol3.SBO_RNA: sbol2.BIOPAX_RNA,  # TODO: distinguish BioPAX Rna from RnaRegion
        sbol3.SBO_PROTEIN: sbol2.BIOPAX_PROTEIN,
        sbol3.SBO_SIMPLE_CHEMICAL: sbol2.BIOPAX_SMALL_MOLECULE,
        sbol3.SBO_NON_COVALENT_COMPLEX: sbol2.BIOPAX_COMPLEX,
    }
    types2 = []
    for sbol3_type in cp3.types:
        types2.append(sbo_to_biopax.get(sbol3_type, sbol3_type))
    # Make the Component object and add it to the document
    version2 = self._sbol2_version(cp3)
    cp2 = sbol2.ComponentDefinition(cp3.identity, types2, version=version2)
    self.doc2.addComponentDefinition(cp2)
    # Convert the Component properties not covered by the constructor
    cp2.roles = cp3.roles
    cp2.sequences = cp3.sequences
    # Child-object properties that have no conversion yet, checked in this order
    for unsupported in ('features', 'interactions', 'constraints', 'interface', 'models'):
        if getattr(cp3, unsupported):
            raise NotImplementedError(
                f'Conversion of Component {unsupported} from SBOL3 to SBOL2 not yet implemented')
    # Map over all other TopLevel properties and extensions not covered by the constructor
    self._convert_toplevel(cp3, cp2)
def visit_range(self, a: sbol3.Range):
    """Placeholder visitor for SBOL3 Range objects; conversion to SBOL2 is not implemented.

    :param a: SBOL3 Range that was visited (unused)
    :raises NotImplementedError: always
    """
    # Priority: 2
    raise NotImplementedError('Conversion of Range from SBOL3 to SBOL2 not yet implemented')
class SBOL2To3ConversionVisitor:
    """This class is used to map every object in an SBOL2 document into an empty SBOL3 document

    The conversion runs immediately on construction; the result is available as `doc3`.
    Dispatch is explicit: `visit_document` calls the matching `visit_*` method for each of the
    top-level collections of the SBOL2 document. Object kinds without a conversion yet raise
    `NotImplementedError`.
    """

    # Target SBOL3 document, filled in by the conversion
    doc3: sbol3.Document
    # URI prefixes that may be used as the namespace of converted TopLevel objects
    namespaces: list

    def __init__(self, doc2: sbol2.Document, namespaces: list):
        """Create the target document and convert `doc2` into it.

        :param doc2: SBOL2 document to convert
        :param namespaces: list of URI prefixes to treat as namespaces; None is treated as an empty list
        """
        # Create the target document
        self.doc3 = sbol3.Document()
        # convert2to3 passes None by default; normalize so the namespace lookup can always iterate
        self.namespaces = namespaces if namespaces is not None else []
        # # Immediately run the conversion
        self._convert(doc2)

    def _convert(self, doc2: sbol2.Document):
        """Convert all objects of `doc2` into `self.doc3`."""
        # Note: namespaces don't need to be bound for SBOL3 documents, which don't usually use XML
        # We can skip all the preliminaries and just go to conversion
        self.visit_document(doc2)
        # TODO: check if there is additional work needed for Annotation & GenericTopLevel conversion

    @staticmethod
    def _convert_extension_properties(obj2: sbol2.Identified, obj3: sbol3.Identified):
        """Copy over extension properties"""
        extension_properties = (p for p in obj2.properties
                                if not any(p.startswith(prefix) for prefix in SBOL2_NON_EXTENSION_PROPERTY_PREFIXES))
        for p in extension_properties:
            # copy the value list so the two documents do not share one mutable list
            obj3._properties[p] = list(obj2.properties[p])

    def _convert_identified(self, obj2: sbol2.Identified, obj3: sbol3.Identified):
        """Map over the other properties of an Identified object"""
        self._convert_extension_properties(obj2, obj3)
        # Map over equivalent properties
        # display_id and namespace are handled during creation
        if obj2.version:  # Save version for unpacking later if needed
            obj3.sbol2_version = sbol3.TextProperty(obj3, BACKPORT2_VERSION, 0, 1)
            obj3.sbol2_version = obj2.version
        obj3.name = obj2.name
        obj3.description = obj2.description
        obj3.derived_from = obj2.wasDerivedFrom
        obj3.generated_by = obj2.wasGeneratedBy
        # TODO: unpack measures from extension properties

    def _convert_toplevel(self, obj2: sbol2.TopLevel, obj3: sbol3.TopLevel):
        """Map over the other properties of a TopLevel object"""
        self._convert_identified(obj2, obj3)
        obj3.attachments = [a.identity for a in obj2.attachments]

    def _sbol3_namespace(self, obj2: sbol2.TopLevel):
        """Determine the SBOL3 namespace to use for a converted TopLevel object.

        :param obj2: SBOL2 object being converted
        :returns: the backport namespace stored on the object if present, else the first provided
            namespace that prefixes the object's identity, else None (use default behavior)
        :raises ValueError: if the backport namespace property does not hold exactly one value
        """
        # If a namespace is explicitly set, that takes priority
        if BACKPORT3_NAMESPACE in obj2.properties:
            namespaces = obj2.properties[BACKPORT3_NAMESPACE]
            if len(namespaces) != 1:
                raise ValueError(f'Object {obj2.identity} backport namespace property should have precisely one value, '
                                 f'but was {namespaces}')
            return namespaces[0]
        # Check if the object starts with any of the provided namespaces
        # (`or []` keeps this safe even if the attribute was reset to None after construction)
        for namespace in self.namespaces or []:
            if obj2.identity.startswith(namespace):
                return namespace
        # Otherwise, use default behavior
        return None

    def visit_activity(self, act2: sbol2.Activity):
        """Convert an SBOL2 Activity into an SBOL3 Activity and add it to the target document."""
        # Make the Activity object and add it to the document
        act3 = sbol3.Activity(act2.identity, namespace=self._sbol3_namespace(act2),
                              start_time=act2.startedAtTime, end_time=act2.endedAtTime)
        self.doc3.add(act3)
        # Convert child objects after adding to document
        if act2.types:  # TODO: wrapping not needed after resolution of https://github.com/SynBioDex/pySBOL2/issues/428
            act3.types = [act2.types]
        # Children are dispatched through this visitor, like every other object kind in this class
        act3.usage = [self.visit_usage(usage) for usage in act2.usages]
        act3.association = [self.visit_association(assoc) for assoc in act2.associations]
        # TODO: pySBOL3 is currently missing wasInformedBy (https://github.com/SynBioDex/pySBOL3/issues/436
        # act3.informed_by = act2.wasInformedBy
        # Map over all other TopLevel properties and extensions not covered by the constructor
        self._convert_toplevel(act2, act3)

    def visit_agent(self, a: sbol2.Agent):
        # Priority: 3
        raise NotImplementedError('Conversion of Agent from SBOL2 to SBOL3 not yet implemented')

    def visit_association(self, a: sbol2.Association):
        # Priority: 3
        raise NotImplementedError('Conversion of Association from SBOL2 to SBOL3 not yet implemented')

    def visit_attachment(self, a: sbol2.Attachment):
        # Priority: 2
        raise NotImplementedError('Conversion of Attachment from SBOL2 to SBOL3 not yet implemented')

    def visit_collection(self, coll2: sbol2.Collection):
        """Convert an SBOL2 Collection into an SBOL3 Collection and add it to the target document."""
        # Priority: 1
        # Make the Collection object and add it to the document
        # The namespace is resolved the same way as for the other TopLevel conversions in this class
        coll3 = sbol3.Collection(coll2.identity, namespace=self._sbol3_namespace(coll2),
                                 members=coll2.members)
        self.doc3.add(coll3)
        # Map over all other TopLevel properties and extensions not covered by the constructor
        self._convert_toplevel(coll2, coll3)

    def visit_combinatorial_derivation(self, a: sbol2.CombinatorialDerivation):
        # Priority: 2
        raise NotImplementedError('Conversion of CombinatorialDerivation from SBOL2 to SBOL3 not yet implemented')

    def visit_component_definition(self, cd2: sbol2.ComponentDefinition):
        """Convert an SBOL2 ComponentDefinition into an SBOL3 Component and add it to the target document.

        :raises NotImplementedError: if the definition has components, sequenceAnnotations,
            or sequenceConstraints (none of these can be converted yet)
        """
        # Remap type if it's one of the ones that needs remapping; otherwise pass through unchanged
        type_map = {sbol2.BIOPAX_DNA: sbol3.SBO_DNA,
                    'http://www.biopax.org/release/biopax-level3.owl#Dna': sbol3.SBO_DNA,  # TODO: make reversible
                    sbol2.BIOPAX_RNA: sbol3.SBO_RNA,
                    'http://www.biopax.org/release/biopax-level3.owl#Rna': sbol3.SBO_RNA,  # TODO: make reversible
                    sbol2.BIOPAX_PROTEIN: sbol3.SBO_PROTEIN,
                    sbol2.BIOPAX_SMALL_MOLECULE: sbol3.SBO_SIMPLE_CHEMICAL,
                    sbol2.BIOPAX_COMPLEX: sbol3.SBO_NON_COVALENT_COMPLEX}
        types3 = [type_map.get(t, t) for t in cd2.types]
        # Make the Component object and add it to the document
        cp3 = sbol3.Component(cd2.identity, types3, namespace=self._sbol3_namespace(cd2),
                              roles=cd2.roles, sequences=cd2.sequences)
        self.doc3.add(cp3)
        # Convert the Component properties not covered by the constructor
        if cd2.components:
            raise NotImplementedError('Conversion of ComponentDefinition components '
                                      'from SBOL2 to SBOL3 not yet implemented')
        if cd2.sequenceAnnotations:
            raise NotImplementedError('Conversion of ComponentDefinition sequenceAnnotations '
                                      'from SBOL2 to SBOL3 not yet implemented')
        if cd2.sequenceConstraints:
            raise NotImplementedError('Conversion of ComponentDefinition sequenceConstraints '
                                      'from SBOL2 to SBOL3 not yet implemented')
        # Map over all other TopLevel properties and extensions not covered by the constructor
        self._convert_toplevel(cd2, cp3)

    def visit_component(self, a: sbol2.Component):
        # Priority: 2
        raise NotImplementedError('Conversion of Component from SBOL2 to SBOL3 not yet implemented')

    def visit_cut(self, a: sbol2.Cut):
        # Priority: 2
        raise NotImplementedError('Conversion of Cut from SBOL2 to SBOL3 not yet implemented')

    def visit_document(self, doc2: sbol2.Document):
        """Visit every top-level object of `doc2`, one collection of the document at a time."""
        for obj in doc2.componentDefinitions:
            self.visit_component_definition(obj)
        for obj in doc2.moduleDefinitions:
            self.visit_module_definition(obj)
        for obj in doc2.models:
            self.visit_model(obj)
        for obj in doc2.sequences:
            self.visit_sequence(obj)
        for obj in doc2.collections:
            self.visit_collection(obj)
        for obj in doc2.activities:
            self.visit_activity(obj)
        for obj in doc2.plans:
            self.visit_plan(obj)
        for obj in doc2.agents:
            self.visit_agent(obj)
        for obj in doc2.attachments:
            self.visit_attachment(obj)
        for obj in doc2.combinatorialderivations:
            self.visit_combinatorial_derivation(obj)
        for obj in doc2.implementations:
            self.visit_implementation(obj)
        for obj in doc2.experiments:
            self.visit_experiment(obj)
        for obj in doc2.experimentalData:
            self.visit_experimental_data(obj)
        # TODO: handle "standard extensions" in pySBOL2:
        #   designs, builds, tests, analyses, sampleRosters, citations, keywords

    def visit_experiment(self, a: sbol2.Experiment):
        # Priority: 3
        raise NotImplementedError('Conversion of Experiment from SBOL2 to SBOL3 not yet implemented')

    def visit_experimental_data(self, a: sbol2.ExperimentalData):
        # Priority: 3
        raise NotImplementedError('Conversion of ExperimentalData from SBOL2 to SBOL3 not yet implemented')

    def visit_functional_component(self, a: sbol2.FunctionalComponent):
        # Priority: 3
        raise NotImplementedError('Conversion of FunctionalComponent from SBOL2 to SBOL3 not yet implemented')

    def visit_generic_location(self, a: sbol2.GenericLocation):
        # Priority: 3
        raise NotImplementedError('Conversion of GenericLocation from SBOL2 to SBOL3 not yet implemented')

    def visit_implementation(self, imp2: sbol2.Implementation):
        """Convert an SBOL2 Implementation into an SBOL3 Implementation and add it to the target document."""
        # Priority: 1
        # Make the Implementation object and add it to the document
        imp3 = sbol3.Implementation(imp2.identity, namespace=self._sbol3_namespace(imp2), built=imp2.built)
        self.doc3.add(imp3)
        # Map over all other TopLevel properties and extensions not covered by the constructor
        self._convert_toplevel(imp2, imp3)

    def visit_interaction(self, a: sbol2.Interaction):
        # Priority: 2
        raise NotImplementedError('Conversion of Interaction from SBOL2 to SBOL3 not yet implemented')

    def visit_maps_to(self, a: sbol2.mapsto.MapsTo):
        # Priority: 3
        raise NotImplementedError('Conversion of MapsTo from SBOL2 to SBOL3 not yet implemented')

    def visit_measure(self, a: sbol2.measurement.Measurement):
        # Priority: 3
        raise NotImplementedError('Conversion of Measure from SBOL2 to SBOL3 not yet implemented')

    def visit_model(self, a: sbol2.model.Model):
        # Priority: 3
        raise NotImplementedError('Conversion of Model from SBOL2 to SBOL3 not yet implemented')

    def visit_module(self, a: sbol2.Module):
        # Priority: 3
        raise NotImplementedError('Conversion of Module from SBOL2 to SBOL3 not yet implemented')

    def visit_module_definition(self, a: sbol2.ModuleDefinition):
        # Priority: 3
        raise NotImplementedError('Conversion of ModuleDefinition from SBOL2 to SBOL3 not yet implemented')

    def visit_participation(self, a: sbol2.Participation):
        # Priority: 2
        raise NotImplementedError('Conversion of Participation from SBOL2 to SBOL3 not yet implemented')

    def visit_plan(self, a: sbol2.Plan):
        # Priority: 3
        raise NotImplementedError('Conversion of Plan from SBOL2 to SBOL3 not yet implemented')

    def visit_range(self, a: sbol2.Range):
        # Priority: 2
        raise NotImplementedError('Conversion of Range from SBOL2 to SBOL3 not yet implemented')

    def visit_sequence(self, seq2: sbol2.Sequence):
        """Convert an SBOL2 Sequence into an SBOL3 Sequence and add it to the target document."""
        # Remap encoding if it's one of the ones that needs remapping; otherwise pass through unchanged
        encoding_map = {sbol2.SBOL_ENCODING_IUPAC: sbol3.IUPAC_DNA_ENCODING,
                        sbol2.SBOL_ENCODING_IUPAC_PROTEIN: sbol3.IUPAC_PROTEIN_ENCODING,
                        sbol2.SBOL_ENCODING_SMILES: sbol3.SMILES_ENCODING}
        encoding3 = encoding_map.get(seq2.encoding, seq2.encoding)
        # Make the Sequence object and add it to the document
        seq3 = sbol3.Sequence(seq2.identity, namespace=self._sbol3_namespace(seq2),
                              elements=seq2.elements, encoding=encoding3)
        self.doc3.add(seq3)
        # Map over all other TopLevel properties and extensions not covered by the constructor
        self._convert_toplevel(seq2, seq3)

    def visit_sequence_annotation(self, seq2: sbol2.SequenceAnnotation):
        # Priority: 1
        raise NotImplementedError('Conversion of SequenceAnnotation from SBOL2 to SBOL3 not yet implemented')

    def visit_sequence_constraint(self, seq2: sbol2.sequenceconstraint.SequenceConstraint):
        # Priority: 2
        raise NotImplementedError('Conversion of SequenceConstraint from SBOL2 to SBOL3 not yet implemented')

    def visit_usage(self, a: sbol2.Usage):
        # Priority: 3
        raise NotImplementedError('Conversion of Usage from SBOL2 to SBOL3 not yet implemented')

    def visit_variable_component(self, a: sbol2.VariableComponent):
        # Priority: 2
        raise NotImplementedError('Conversion of VariableComponent from SBOL2 to SBOL3 not yet implemented')
+https://identifiers.org/SO:0000140,attenuator +https://identifiers.org/SO:0001834,C_region +https://identifiers.org/SO:0000172,CAAT_signal +https://identifiers.org/SO:0000316,CDS +https://identifiers.org/SO:0000577,centromere +https://identifiers.org/SO:0000297,D-loop +https://identifiers.org/SO:0000458,D_segment +https://identifiers.org/SO:0000165,enhancer +https://identifiers.org/SO:0000147,exon +https://identifiers.org/SO:0000704,gene +https://identifiers.org/SO:0000173,GC_signal +https://identifiers.org/SO:0000723,iDNA +https://identifiers.org/SO:0000188,intron +https://identifiers.org/SO:0000470,J_region +https://identifiers.org/SO:0000286,LTR +https://identifiers.org/SO:0000419,mat_peptide +https://identifiers.org/SO:0000409,misc_binding +https://identifiers.org/SO:0000413,misc_difference +https://identifiers.org/SO:0000001,misc_feature +https://identifiers.org/SO:0001411,misc_feature +https://identifiers.org/SO:0001645,misc_marker +https://identifiers.org/SO:0000298,misc_recomb +https://identifiers.org/SO:0000233,misc_RNA +https://identifiers.org/SO:0000673,misc_RNA +https://identifiers.org/SO:0005836,regulatory +https://identifiers.org/SO:0000002,misc_structure +https://identifiers.org/SO:0001037,mobile_element +https://identifiers.org/SO:0000305,modified_base +https://identifiers.org/SO:0000234,mRNA +https://identifiers.org/SO:0001835,N_region +https://identifiers.org/SO:0000655,ncRNA +https://identifiers.org/SO:0000178,operon +https://identifiers.org/SO:0000724,oriT +https://identifiers.org/SO:0000551,polyA_signal +https://identifiers.org/SO:0000553,polyA_site +https://identifiers.org/SO:0000185,precursor_RNA +https://identifiers.org/SO:0000112,primer +https://identifiers.org/SO:0005850,primer_bind +https://identifiers.org/SO:0000167,promoter +https://identifiers.org/SO:0001062,propeptide +https://identifiers.org/SO:0000410,protein_bind +https://identifiers.org/SO:0000139,RBS +https://identifiers.org/SO:0000552,RBS 
+https://identifiers.org/SO:0000296,rep_origin +https://identifiers.org/SO:0000657,repeat_region +https://identifiers.org/SO:0000726,repeat_unit +https://identifiers.org/SO:0000252,rRNA +https://identifiers.org/SO:0001836,S_region +https://identifiers.org/SO:0000005,satellite +https://identifiers.org/SO:0000013,scRNA +https://identifiers.org/SO:0000418,sig_peptide +https://identifiers.org/SO:0000274,snRNA +https://identifiers.org/SO:0000149,source +https://identifiers.org/SO:0002206,source +https://identifiers.org/SO:0000019,stem_loop +https://identifiers.org/SO:0000313,stem_loop +https://identifiers.org/SO:0000331,STS +https://identifiers.org/SO:0000174,TATA_signal +https://identifiers.org/SO:0000624,telomere +https://identifiers.org/SO:0000141,terminator +https://identifiers.org/SO:0000584,tmRNA +https://identifiers.org/SO:0000725,transit_peptide +https://identifiers.org/SO:0001054,transposon +https://identifiers.org/SO:0000253,tRNA +https://identifiers.org/SO:0001086,unsure +https://identifiers.org/SO:0001833,V_region +https://identifiers.org/SO:0000109,variation +https://identifiers.org/SO:0001060,variation +https://identifiers.org/SO:0000466,V_segment +https://identifiers.org/SO:0000175,-10_signal +https://identifiers.org/SO:0000176,-35_signal +https://identifiers.org/SO:0000557,3'clip +https://identifiers.org/SO:0000205,3'UTR +https://identifiers.org/SO:0000555,5'clip +https://identifiers.org/SO:0000204,5'UTR diff --git a/sbol_utilities/workarounds.py b/sbol_utilities/workarounds.py index 5cdc8757..e0e083f0 100644 --- a/sbol_utilities/workarounds.py +++ b/sbol_utilities/workarounds.py @@ -21,6 +21,21 @@ def tyto_lookup_with_caching(term: str) -> str: return tyto.SO.get_uri_by_term(term) +# TODO: remove kludge after resolution of https://github.com/SynBioDex/tyto/issues/75 +def tyto_normalize_term(ontology: tyto.Ontology, uri: str) -> Optional[str]: + """Change an ontology term into the "standard" form returned by tyto.ontology.get_uri_by_term + Workaround for 
tyto issue https://github.com/SynBioDex/tyto/issues/75, which will be removed after that + issue is addressed. + + :param ontology: Ontology containing term + :param uri: URI to be normalized + :return: normalized URI (or None if term is not in the ontology) + """ + try: + return ontology.get_uri_by_term(ontology.get_term_by_uri(uri)) + except LookupError: + return None + ######################### # This file contains workarounds for known issues in pySBOL3 # They will be removed when pySBOL3 upgrades fix the associated issues diff --git a/setup.py b/setup.py index 00a04cc5..7c355ed3 100644 --- a/setup.py +++ b/setup.py @@ -28,19 +28,21 @@ 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10' + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11' ], # What does your project relate to? keywords='synthetic biology', install_requires=[ - 'sbol3>=1.0b11', + 'sbol3>=1.1', 'sbol2>=1.4', 'rdflib>=6.2', 'biopython', 'graphviz', - 'tyto>=1.2.1', + 'tyto>=1.4', 'openpyxl', - 'sbol_factory>=1.0a11', + 'requests', + 'sbol_factory>=1.1', 'pydna' ], extras_require={ # requirements for development @@ -51,16 +53,23 @@ 'graph-sbol=sbol_utilities.graph_sbol:main', 'sbol-expand-derivations=sbol_utilities.expand_combinatorial_derivations:main', 'sbol-calculate-sequences=sbol_utilities.calculate_sequences:main', + 'sbol-calculate-complexity=sbol_utilities.calculate_complexity_scores:main', 'sbol-converter=sbol_utilities.conversion:main', - 'sbol2to3=sbol_utilities.conversion:sbol2to3', - 'sbol3to2=sbol_utilities.conversion:sbol3to2', - 'sbol2genbank=sbol_utilities.conversion:sbol2genbank', - 'sbol2fasta=sbol_utilities.conversion:sbol2fasta', - 'genbank2sbol=sbol_utilities.conversion:genbank2sbol', - 'fasta2sbol=sbol_utilities.conversion:fasta2sbol', + 'sbol2-to-sbol3=sbol_utilities.conversion:sbol2to3', + 
'sbol3-to-sbol2=sbol_utilities.conversion:sbol3to2', + 'sbol-to-genbank=sbol_utilities.conversion:sbol2genbank', + 'sbol-to-fasta=sbol_utilities.conversion:sbol2fasta', + 'genbank-to-sbol=sbol_utilities.conversion:genbank2sbol', + 'fasta-to-sbol=sbol_utilities.conversion:fasta2sbol', 'sbol-diff=sbol_utilities.sbol_diff:main'] }, packages=['sbol_utilities'], - package_data={'sbol_utilities': ['sbolgraph-standalone.js']}, + package_data={ + 'sbol_utilities': [ + 'gb2so.csv', + 'sbolgraph-standalone.js', + 'so2gb.csv', + ], + }, include_package_data=True ) diff --git a/test/helpers.py b/test/helpers.py index 32779306..c08c43d6 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -2,7 +2,7 @@ import tempfile import os from shutil import copy -from typing import List, Dict +from typing import List, Dict, Union def copy_to_tmp(package: List[str] = None, renames: Dict[str, str] = None) -> str: @@ -29,7 +29,7 @@ def copy_to_tmp(package: List[str] = None, renames: Dict[str, str] = None) -> st return tmp_sub -def assert_files_identical(file1: os.PathLike, file2: os.PathLike) -> None: +def assert_files_identical(file1: Union[os.PathLike, str], file2: Union[os.PathLike, str]) -> None: """check if two files are identical; if not, report their diff :param file1: path of first file to compare :param file2: path of second file to compare diff --git a/test/test_calculate_complexity_scores.py b/test/test_calculate_complexity_scores.py new file mode 100644 index 00000000..70bddee8 --- /dev/null +++ b/test/test_calculate_complexity_scores.py @@ -0,0 +1,90 @@ +"""Tests for calculating sequence synthesis complexity scores via the IDT interface + +To run these tests, you will need IDT access credentials (see: https://www.idtdna.com/pages/tools/apidoc) +The values of the IDT access credentials should be stored in a file in the top level directory called +'test_secret_idt_credentials.json', with the contents of the form: +{ "username": "username", "password": "password", "ClientID": 
"####", "ClientSecret": "XXXXXXXXXXXXXXXXXXX" } +""" +from pathlib import Path + +import json + +import unittest +import sys +import tempfile +import sbol3 +from unittest.mock import patch +from sbol_utilities.calculate_complexity_scores import IDTAccountAccessor, idt_calculate_complexity_scores, \ + idt_calculate_sequence_complexity_scores, get_complexity_scores +import sbol_utilities.sbol_diff + +# TODO: add to readme + +def same_except_timestamps(doc1: sbol3.Document, doc2: sbol3.Document) -> bool: + """Check that the only triple-level difference between two SBOL documents is their time-stamps + + :param doc1: first document to compare + :param doc2: second document to compare + :returns: True if the documents differ only in time-stamps and report name, False otherwise + """ + _, first_graph, second_graph = sbol_utilities.sbol_diff._diff_graphs(doc1.graph(), doc2.graph()) + replaced_subject = 'http://igem.org/IDT_complexity_score/Complexity_Report_20230516T194547Z_a2efceb0' + # Return true only if the differences pair up one-to-one and are all time-stamps or the activity name + ignored_predicates = {sbol3.PROV_ENDED_AT_TIME, sbol3.SBOL_DISPLAY_ID} + return len(first_graph) == len(second_graph) and all(p1 == p2 and (str(p1) in ignored_predicates or + (str(s1) == replaced_subject and o1 == o2) or + (s1 == s2 and str(o1) == replaced_subject)) + for (s1, p1, o1), (s2, p2, o2) in zip(sorted(first_graph), sorted(second_graph))) + + +class TestIDTCalculateComplexityScore(unittest.TestCase): + + @unittest.skipIf(sys.platform == 'win32', reason='Not working on Windows https://github.com/SynBioDex/SBOL-utilities/issues/221') + def test_IDT_calculate_complexity_score(self): + """Test that a library-call invocation of complexity scoring works""" + test_dir = Path(__file__).parent + with open(test_dir.parent / 'test_secret_idt_credentials.json') as test_credentials: + idt_accessor = IDTAccountAccessor.from_json(json.load(test_credentials)) + + doc = sbol3.Document() + doc.read(test_dir / 'test_files' / 'BBa_J23101.nt') + + # Check the scores - they should initially be all missing + sequences 
= [obj for obj in doc if isinstance(obj, sbol3.Sequence)] + scores = get_complexity_scores(sequences) + self.assertEqual(scores, dict()) + # Compute complexity scores for the sequences + results = idt_calculate_sequence_complexity_scores(idt_accessor, sequences) + self.assertEqual(len(results), 1) + self.assertEqual(results[sequences[0]], 0) # score is zero because the sequence is both short and easy + scores = get_complexity_scores(sequences) + self.assertEqual(scores, results) + + # Compute results again: results should be blank, because the calculation is already made + results = idt_calculate_complexity_scores(idt_accessor, doc) + self.assertEqual(len(results), 0) + self.assertEqual(results, dict()) + scores = get_complexity_scores(sequences) + self.assertEqual(scores, {sequences[0]: 0}) + + @unittest.skipIf(sys.platform == 'win32', reason='Not working on Windows https://github.com/SynBioDex/SBOL-utilities/issues/221') + def test_commandline(self): + """Test that a command-line invocation of complexity scoring works""" + test_dir = Path(__file__).parent + temp_name = tempfile.mkstemp(suffix='.nt')[1] + test_args = ['calculate_complexity_scores.py', + '--credentials', str(test_dir.parent / 'test_secret_idt_credentials.json'), + str(test_dir / 'test_files' / 'Test_file_Complexity_Scores.nt'), temp_name] + with patch.object(sys, 'argv', test_args): + sbol_utilities.calculate_complexity_scores.main() + + # Compare expected results to actual output file + expected = sbol3.Document() + expected.read(test_dir / 'test_files' / 'Comparison_file_Complexity_Scores.nt') + generated = sbol3.Document() + generated.read(temp_name) + self.assertTrue(same_except_timestamps(expected, generated)) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_conversion.py b/test/test_conversion.py index d16db9a7..2411eaab 100644 --- a/test/test_conversion.py +++ b/test/test_conversion.py @@ -1,3 +1,5 @@ +from pathlib import Path + import sys import tempfile import unittest @@ -12,17 +14,19 @@ 
from sbol_utilities.conversion import convert2to3, convert3to2, convert_to_genbank, convert_to_fasta, \ convert_from_fasta, convert_from_genbank, \ main, sbol2fasta, sbol2genbank, sbol2to3, sbol3to2, fasta2sbol, genbank2sbol -from helpers import copy_to_tmp +from sbol_utilities.sbol3_genbank_conversion import GenBankSBOL3Converter +from helpers import copy_to_tmp, assert_files_identical from sbol_utilities.sbol_diff import doc_diff # TODO: Add command-line utilities and test them too +TEST_FILES = Path(__file__).parent / 'test_files' + class Test2To3Conversion(unittest.TestCase): def test_convert_identities(self): """Test conversion of a complex file""" test_dir = os.path.dirname(os.path.realpath(__file__)) - input_path = os.path.join(test_dir, 'test_files', 'sbol3-small-molecule.rdf') - doc = convert2to3(input_path) + doc = convert2to3(str(TEST_FILES / 'sbol3-small-molecule.rdf')) # check for issues in converted document report = doc.validate() assert len(report) == 0, "\n".join(str(issue) for issue in report) @@ -59,7 +63,7 @@ def test_3to2_conversion(self): sbol2.Config.setOption(sbol2.ConfigOptions.VALIDATE_ONLINE, validate_online) assert len(doc2.componentDefinitions) == 1, f'Expected 1 CD, but found {len(doc2.componentDefinitions)}' # TODO: bring this back after resolution of https://github.com/sboltools/sbolgraph/issues/15 - #assert len(doc2.activities) == 1, f'Expected 1 Activity, but found {len(doc2.activities)}' + # assert len(doc2.activities) == 1, f'Expected 1 Activity, but found {len(doc2.activities)}' assert len(doc2.sequences) == 1, f'Expected 1 Sequence, but found {len(doc2.sequences)}' assert doc2.componentDefinitions[0].identity == 'https://synbiohub.org/public/igem/BBa_J23101' assert doc2.componentDefinitions[0].sequences[0] == 'https://synbiohub.org/public/igem/BBa_J23101_sequence' @@ -148,16 +152,14 @@ def test_genbank_conversion(self): # Convert to GenBank and check contents outfile = os.path.join(tmp_sub, 'BBa_J23101.gb') 
convert_to_genbank(doc3, outfile) - - test_dir = os.path.dirname(os.path.realpath(__file__)) - comparison_file = os.path.join(test_dir, 'test_files', 'BBa_J23101.gb') - assert filecmp.cmp(outfile, comparison_file), f'Converted GenBank file {comparison_file} is not identical' + assert_files_identical(outfile, TEST_FILES / 'BBa_J23101.gb') def test_conversion_from_genbank(self): """Test ability to convert from GenBank to SBOL3""" # Get the GenBank test document and convert tmp_sub = copy_to_tmp(package=['BBa_J23101.gb']) - doc3 = convert_from_genbank(os.path.join(tmp_sub, 'BBa_J23101.gb'), 'https://synbiohub.org/public/igem') + doc3 = convert_from_genbank(os.path.join(tmp_sub, 'BBa_J23101.gb'), 'https://synbiohub.org/public/igem', + force_new_converter=False) # Note: cannot directly round-trip because converter is a) lossy, and b) inserts extra materials test_dir = os.path.dirname(os.path.realpath(__file__)) @@ -166,6 +168,35 @@ def test_conversion_from_genbank(self): comparison_doc.read(comparison_file) assert not doc_diff(doc3, comparison_doc), f'Converted GenBank file not identical to {comparison_file}' + def test_genbank_conversion_new_converter(self): + """Test ability to convert from SBOL3 to GenBank using new converter + by specifying the `--force-new-converter` flag """ + # Get the SBOL3 test document + tmp_sub = copy_to_tmp(package=['sbol3_genbank_conversion/BBa_J23101_from_genbank_to_sbol3_direct.nt']) + doc3 = sbol3.Document() + doc3.read(os.path.join(tmp_sub, 'BBa_J23101_from_genbank_to_sbol3_direct.nt')) + # Convert to GenBank and check contents + outfile = os.path.join(tmp_sub, 'BBa_J23101.gb') + convert_to_genbank(doc3=doc3, path=outfile, allow_genbank_online=False, force_new_converter=True) + assert_files_identical(outfile, TEST_FILES / 'sbol3_genbank_conversion' / 'BBa_J23101_from_sbol3_direct.gb') + + def test_conversion_from_genbank_new_converter(self): + """Test ability to convert from GenBank to SBOL3 using new converter + by specifying the 
`--force-new-converter` flag """ + # Get the GenBank test document and convert + tmp_sub = copy_to_tmp(package=['BBa_J23101.gb']) + doc3 = convert_from_genbank(path=os.path.join(tmp_sub, 'BBa_J23101.gb'), + namespace=GenBankSBOL3Converter.TEST_NAMESPACE, + allow_genbank_online=False, + force_new_converter=True) + + # Note: cannot directly round-trip because converter is a) lossy, and b) inserts extra materials + test_dir = os.path.dirname(os.path.realpath(__file__)) + comparison_file = os.path.join(test_dir, 'test_files', 'sbol3_genbank_conversion', 'BBa_J23101_from_genbank_to_sbol3_direct.nt') + comparison_doc = sbol3.Document() + comparison_doc.read(comparison_file) + assert not doc_diff(doc3, comparison_doc), f'Converted SBOL3 file not identical to {comparison_file}' + def test_genbank_multi_conversion(self): """Test ability to convert from SBOL3 to GenBank""" # Get the SBOL3 test document @@ -175,11 +206,8 @@ def test_genbank_multi_conversion(self): # Convert to GenBank and check contents outfile = os.path.join(tmp_sub, 'iGEM_SBOL2_imports.gb') - convert_to_genbank(doc3, outfile) - - test_dir = os.path.dirname(os.path.realpath(__file__)) - comparison_file = os.path.join(test_dir, 'test_files', 'iGEM_SBOL2_imports.gb') - assert filecmp.cmp(outfile, comparison_file), f'Converted GenBank file {comparison_file} is not identical' + convert_to_genbank(doc3, outfile, force_new_converter=False) + assert_files_identical(outfile, TEST_FILES / 'iGEM_SBOL2_imports.gb') def test_fasta_conversion(self): """Test ability to convert from SBOL3 to FASTA""" @@ -250,33 +278,33 @@ def test_commandline(self): assert filecmp.cmp(temp_name, test_file['sbol3']), f'Converted file {temp_name} is not identical' # Run the other six tests - test_args = ['fasta2sbol', '-o', temp_name, '-n', 'https://synbiohub.org/public/igem', test_file['fasta']] + test_args = ['fasta-to-sbol', '-o', temp_name, '-n', 'https://synbiohub.org/public/igem', test_file['fasta']] with patch.object(sys, 'argv', 
test_args): fasta2sbol() assert filecmp.cmp(temp_name, test_file['from_fasta']), f'Converted file {temp_name} is not identical' # genbank conversion should succeed the same way when not online if not given an online argument - test_args = ['genbank2sbol', '-o', temp_name, '-n', 'https://synbiohub.org/public/igem', test_file['genbank']] + test_args = ['genbank-to-sbol', '-o', temp_name, '-n', 'https://synbiohub.org/public/igem', test_file['genbank']] with patch.object(sys, 'argv', test_args): genbank2sbol() assert filecmp.cmp(temp_name, test_file['from_genbank']), f'Converted file {temp_name} is not identical' - test_args = ['sbol2fasta', '-o', temp_name, test_file['sbol3']] + test_args = ['sbol-to-fasta', '-o', temp_name, test_file['sbol3']] with patch.object(sys, 'argv', test_args): sbol2fasta() assert filecmp.cmp(temp_name, test_file['fasta']), f'Converted file {temp_name} is not identical' - test_args = ['sbol2genbank', '-o', temp_name, test_file['sbol3']] + test_args = ['sbol-to-genbank', '-o', temp_name, test_file['sbol3']] with patch.object(sys, 'argv', test_args): sbol2genbank() assert filecmp.cmp(temp_name, test_file['genbank']), f'Converted file {temp_name} is not identical' # SBOL2 serialization is not stable, so test via round-trip instead - test_args = ['sbol3to2', '-o', temp_name, test_file['sbol3']] + test_args = ['sbol3-to-sbol2', '-o', temp_name, test_file['sbol3']] with patch.object(sys, 'argv', test_args): sbol3to2() temp_name_2 = tempfile.mkstemp()[1] - test_args = ['sbol2to3', '-o', temp_name_2, temp_name] + test_args = ['sbol2-to-sbol3', '-o', temp_name_2, temp_name] with patch.object(sys, 'argv', test_args): sbol2to3() assert filecmp.cmp(temp_name_2, test_file['sbol323']), f'Converted file {temp_name} is not identical' @@ -290,7 +318,7 @@ def test_online_conversion(self): 'from_genbank': os.path.join(test_files, 'BBa_J23101_from_genbank.nt'), } - test_args = ['genbank2sbol', '-o', temp_name, '-n', 'https://synbiohub.org/public/igem', 
test_file['genbank'], + test_args = ['genbank-to-sbol', '-o', temp_name, '-n', 'https://synbiohub.org/public/igem', test_file['genbank'], '--allow-genbank-online'] with patch.object(sys, 'argv', test_args): genbank2sbol() diff --git a/test/test_files/BBa_J23101.xml b/test/test_files/BBa_J23101.xml new file mode 100644 index 00000000..62c8c727 --- /dev/null +++ b/test/test_files/BBa_J23101.xml @@ -0,0 +1,64 @@ + + + later + + + 1 + N/A + In stock + BBa_J23101 + Released HQ 2013 + true + later + + true + + true + + + + false + 2015-08-31T04:08:40Z + + 2006-08-03T11:00:00Z + + BBa_J23101 + constitutive promoter family member + + + 483 + 95 + _52_ + + John Anderson + 0 + + + + BBa_J23101_sequence + + + + + + + + tttacagctagctcagtcctaggtattatgctagc + + 1 + + + Chris J. Myers + + 2017-03-06T15:00:00+00:00 + 1 + + + James Alastair McLaughlin + + + igem2sbol + iGEM to SBOL conversion + Conversion of the iGEM parts registry to SBOL2.1 + + diff --git a/test/test_files/BBa_J23101_patched.nt b/test/test_files/BBa_J23101_patched.nt new file mode 100644 index 00000000..08763480 --- /dev/null +++ b/test/test_files/BBa_J23101_patched.nt @@ -0,0 +1,57 @@ + "John Anderson" . + "2006-08-03T11:00:00Z" . + "2015-08-31T04:08:40Z" . + "constitutive promoter family member" . + "BBa_J23101" . + . + . + "BBa_J23101" . + . + . + . + "1" . + "false" . + "true" . + . + "_52_" . + "0" . + "483" . + "95" . + "Released HQ 2013" . + "In stock" . + . + "true" . + "later" . + "N/A" . + "later" . + . + . + "true" . + . + . + . + . + "BBa_J23101_sequence" . + "tttacagctagctcagtcctaggtattatgctagc" . + . + . + "1" . + . + . + . + . + . + . + "Chris J. Myers" . + "James Alastair McLaughlin" . + "Conversion of the iGEM parts registry to SBOL2.1" . + "iGEM to SBOL conversion" . + "igem2sbol" . + . + "1" . + . + . + . + . + . + "2017-03-06T15:00:00+00:00"^^ . 
diff --git a/test/test_files/BBa_J23101_v2.gb b/test/test_files/BBa_J23101_v2.gb new file mode 100644 index 00000000..9fc3c5a4 --- /dev/null +++ b/test/test_files/BBa_J23101_v2.gb @@ -0,0 +1,12 @@ +LOCUS BBa_J23101 35 bp DNA linear UNK 01-JAN-1980 +DEFINITION constitutive promoter family member. +ACCESSION BBa_J23101 +VERSION BBa_J23101.2 +KEYWORDS . +SOURCE . + ORGANISM . + . +FEATURES Location/Qualifiers +ORIGIN + 1 tttacagcta gctcagtcct aggtattatg ctagc +// diff --git a/test/test_files/Comparison_file_Complexity_Scores.nt b/test/test_files/Comparison_file_Complexity_Scores.nt new file mode 100644 index 00000000..5e8acc14 --- /dev/null +++ b/test/test_files/Comparison_file_Complexity_Scores.nt @@ -0,0 +1,58 @@ + "Complexity_Report_20230516T194547Z_a2efceb0" . + . + . + . + . + "2023-05-16T19:45:47+00:00"^^ . + "Measure1" . + . + "13.2"^^ . + . + . + . + . + "X2018_Interlab_Devices_BBa_I20270" . + "ttgatggctagctcagtcctaggtacaatgctagctactagagtcacacaggaaagtactagatgcgtaaaggagaagaacttttcactggagttgtcccaattcttgttgaattagatggtgatgttaatgggcacaaattttctgtcagtggagagggtgaaggtgatgcaacatacggaaaacttacccttaaatttatttgcactactggaaaactacctgttccatggccaacacttgtcactactttcggttatggtgttcaatgctttgcgagatacccagatcatatgaaacagcatgactttttcaagagtgccatgcccgaaggttatgtacaggaaagaactatatttttcaaagatgacgggaactacaagacacgtgctgaagtcaagtttgaaggtgatacccttgttaatagaatcgagttaaaaggtattgattttaaagaagatggaaacattcttggacacaaattggaatacaactataactcacacaatgtatacatcatggcagacaaacaaaagaatggaatcaaagttaacttcaaaattagacacaacattgaagatggaagcgttcaactagcagaccattatcaacaaaatactccaattggcgatggccctgtccttttaccagacaaccattacctgtccacacaatctgccctttcgaaagatcccaacgaaaagagagaccacatggtccttcttgagtttgtaacagctgctgggattacacatggcatggatgaactatacaaataataatactagagccaggcatcaaataaaacgaaaggctcagtcgaaagactgggcctttcgttttatctgttgtttgtcggtgaacgctctctactagagtcacactggctcaccttcgggtgggcctttctgcgtttata" . + . + . + . + . + "Measure1" . + . + "54.0"^^ . + . + . + . + . + "X2018_Interlab_Devices_BBa_R0040" . + "tccctatcagtgatagagattgacatccctatcagtgatagagatactgagcac" . 
+ . + . + . + . + "Measure1" . + . + "9.9"^^ . + . + . + . + . + "X2018_Interlab_Devices_J364000" . + "tttacagctagctcagtcctaggtattatgctagctactagagaaagaggagaaatactagatgcgtaaaggagaagaacttttcactggagttgtcccaattcttgttgaattagatggtgatgttaatgggcacaaattttctgtcagtggagagggtgaaggtgatgcaacatacggaaaacttacccttaaatttatttgcactactggaaaactacctgttccatggccaacacttgtcactactttcggttatggtgttcaatgctttgcgagatacccagatcatatgaaacagcatgactttttcaagagtgccatgcccgaaggttatgtacaggaaagaactatatttttcaaagatgacgggaactacaagacacgtgctgaagtcaagtttgaaggtgatacccttgttaatagaatcgagttaaaaggtattgattttaaagaagatggaaacattcttggacacaaattggaatacaactataactcacacaatgtatacatcatggcagacaaacaaaagaatggaatcaaagttaacttcaaaattagacacaacattgaagatggaagcgttcaactagcagaccattatcaacaaaatactccaattggcgatggccctgtccttttaccagacaaccattacctgtccacacaatctgccctttcgaaagatcccaacgaaaagagagaccacatggtccttcttgagtttgtaacagctgctgggattacacatggcatggatgaactatacaaataataatactagagccaggcatcaaataaaacgaaaggctcagtcgaaagactgggcctttcgttttatctgttgtttgtcggtgaacgctctctactagagtcacactggctcaccttcgggtgggcctttctgcgtttata" . + . + . + . + . + "Measure1" . + . + "23.299999999999997"^^ . + . + . + . + . + "XBHETase_library_K3039006" . 
+ "ggtctcgaatgggcagcagccatcatcatcatcatcacagcagcggcctggtgccgcgcggcagccatatgatgggcggtggctctacccccctgcccctgccgcagcagcaaccgccacaacaagaacccccaccgcctccggtcccactggcgtcacgcgctgcatgcgaggccctgaaagatgggaacggcgatatggtttggccgaacgccgcgaccgtagtggaagttgctgcttggcgcgacgcagcgccggctaccgcaagcgcagcggccttaccggaacattgtgaagtatcaggcgcgattgccaaacgcactggcattgatggctatccctatgaaatcaaattccgtctgcgcatgccagcagagtggaatggccgcttcttcatggagggaggctcaggcacaaatgggtccttgagcgccgcaaccggctcaattgggggtggacaaattgcgtccgcccttagccggaactttgcaacgattgctaccgatggcggtcatgataacgccgtcaatgacaacccggatgccttgggcaccgtcgcatttggtctggatccgcaggctcgcctggacatgggctacaacagctacgaccaggtgacgcaggcagggaaagcagcggtcgcccggttttacggccgtgctgcagataaatcatactttattggatgttccgaaggcgggcgcgaaggtatgatgctgagtcagcgtttcccgtcgcattatgacggtattgtggcgggtgccccggggtatcagctgccaaaggcgggcatttcgggagcctggaccacacagtcactggctccagcggctgtgggcttggatgcccagggcgttcctctgattaacaaatcatttagtgatgccgatctgcacctgttatcgcaggcaatccttggtacctgtgatgccctggacggcctggcggatggtattgttgataactaccgggcctgtcaggcggcgttcgatcctgccactgcggcgaacccggctaatggccaagccttacaatgtgtgggcgcaaaaacggccgattgtctctccccagttcaggtgactgcgattaagcgtgccatggcgggccctgtgaattcggcgggtactccgctgtataaccggtgggcgtgggatgcgggtatgtcgggcctgagcggcactacatacaaccaaggttggcgttcttggtggctgggctcattcaactcttcagccaacaacgcacaggcagtcagcggctttggcgcacgcagctggctggtcgacaacgcaacccctcctgaacccatgcccatgacccaggtagcggcccgtatgatgaaattcgatttcgatatcgatccgcttaaaatttgggcgacgagtggacagttcacgcagtcttccatggattggcatggggcaacgagcacggatctggcggcgtttcgtgatcgcgggggtaagatgattttatatcacggaatgtcagatgccgccttctccgccctggatacagccgattactatgagcgtttaggggcggcaatgccgggcgcggctgggttcgcccgccttttcctggtaccgggtatgaatcactgctctggcggcccaggcaccgatcgctttgatatgcttacgccgctggttgcctgggtggaacgcggtgaagccccggatcagattagtgcctggagcggcacaccaggctatttcggtgtagccgctcgtacccgcccgctgtgcccgtatcctcaaatcgcacgctataagggctccggtgatatcaacacggaagcgaattttgcgtgtgccgcccctccgtaagctttgagacc" . + . + . + . + . 
diff --git a/test/test_files/Test_file_Complexity_Scores.nt b/test/test_files/Test_file_Complexity_Scores.nt new file mode 100644 index 00000000..b4eef3fa --- /dev/null +++ b/test/test_files/Test_file_Complexity_Scores.nt @@ -0,0 +1,20 @@ + "X2018_Interlab_Devices_BBa_I20270" . + "ttgatggctagctcagtcctaggtacaatgctagctactagagtcacacaggaaagtactagatgcgtaaaggagaagaacttttcactggagttgtcccaattcttgttgaattagatggtgatgttaatgggcacaaattttctgtcagtggagagggtgaaggtgatgcaacatacggaaaacttacccttaaatttatttgcactactggaaaactacctgttccatggccaacacttgtcactactttcggttatggtgttcaatgctttgcgagatacccagatcatatgaaacagcatgactttttcaagagtgccatgcccgaaggttatgtacaggaaagaactatatttttcaaagatgacgggaactacaagacacgtgctgaagtcaagtttgaaggtgatacccttgttaatagaatcgagttaaaaggtattgattttaaagaagatggaaacattcttggacacaaattggaatacaactataactcacacaatgtatacatcatggcagacaaacaaaagaatggaatcaaagttaacttcaaaattagacacaacattgaagatggaagcgttcaactagcagaccattatcaacaaaatactccaattggcgatggccctgtccttttaccagacaaccattacctgtccacacaatctgccctttcgaaagatcccaacgaaaagagagaccacatggtccttcttgagtttgtaacagctgctgggattacacatggcatggatgaactatacaaataataatactagagccaggcatcaaataaaacgaaaggctcagtcgaaagactgggcctttcgttttatctgttgtttgtcggtgaacgctctctactagagtcacactggctcaccttcgggtgggcctttctgcgtttata" . + . + . + . + "X2018_Interlab_Devices_BBa_R0040" . + "tccctatcagtgatagagattgacatccctatcagtgatagagatactgagcac" . + . + . + . + "X2018_Interlab_Devices_J364000" . 
+ "tttacagctagctcagtcctaggtattatgctagctactagagaaagaggagaaatactagatgcgtaaaggagaagaacttttcactggagttgtcccaattcttgttgaattagatggtgatgttaatgggcacaaattttctgtcagtggagagggtgaaggtgatgcaacatacggaaaacttacccttaaatttatttgcactactggaaaactacctgttccatggccaacacttgtcactactttcggttatggtgttcaatgctttgcgagatacccagatcatatgaaacagcatgactttttcaagagtgccatgcccgaaggttatgtacaggaaagaactatatttttcaaagatgacgggaactacaagacacgtgctgaagtcaagtttgaaggtgatacccttgttaatagaatcgagttaaaaggtattgattttaaagaagatggaaacattcttggacacaaattggaatacaactataactcacacaatgtatacatcatggcagacaaacaaaagaatggaatcaaagttaacttcaaaattagacacaacattgaagatggaagcgttcaactagcagaccattatcaacaaaatactccaattggcgatggccctgtccttttaccagacaaccattacctgtccacacaatctgccctttcgaaagatcccaacgaaaagagagaccacatggtccttcttgagtttgtaacagctgctgggattacacatggcatggatgaactatacaaataataatactagagccaggcatcaaataaaacgaaaggctcagtcgaaagactgggcctttcgttttatctgttgtttgtcggtgaacgctctctactagagtcacactggctcaccttcgggtgggcctttctgcgtttata" . + . + . + . + "XBHETase_library_K3039006" . + "ggtctcgaatgggcagcagccatcatcatcatcatcacagcagcggcctggtgccgcgcggcagccatatgatgggcggtggctctacccccctgcccctgccgcagcagcaaccgccacaacaagaacccccaccgcctccggtcccactggcgtcacgcgctgcatgcgaggccctgaaagatgggaacggcgatatggtttggccgaacgccgcgaccgtagtggaagttgctgcttggcgcgacgcagcgccggctaccgcaagcgcagcggccttaccggaacattgtgaagtatcaggcgcgattgccaaacgcactggcattgatggctatccctatgaaatcaaattccgtctgcgcatgccagcagagtggaatggccgcttcttcatggagggaggctcaggcacaaatgggtccttgagcgccgcaaccggctcaattgggggtggacaaattgcgtccgcccttagccggaactttgcaacgattgctaccgatggcggtcatgataacgccgtcaatgacaacccggatgccttgggcaccgtcgcatttggtctggatccgcaggctcgcctggacatgggctacaacagctacgaccaggtgacgcaggcagggaaagcagcggtcgcccggttttacggccgtgctgcagataaatcatactttattggatgttccgaaggcgggcgcgaaggtatgatgctgagtcagcgtttcccgtcgcattatgacggtattgtggcgggtgccccggggtatcagctgccaaaggcgggcatttcgggagcctggaccacacagtcactggctccagcggctgtgggcttggatgcccagggcgttcctctgattaacaaatcatttagtgatgccgatctgcacctgttatcgcaggcaatccttggtacctgtgatgccctggacggcctggcggatggtattgttgataactaccgggcctgtcaggcggcgttcgatcctgccactgcggcgaacccggctaatggccaagcc
ttacaatgtgtgggcgcaaaaacggccgattgtctctccccagttcaggtgactgcgattaagcgtgccatggcgggccctgtgaattcggcgggtactccgctgtataaccggtgggcgtgggatgcgggtatgtcgggcctgagcggcactacatacaaccaaggttggcgttcttggtggctgggctcattcaactcttcagccaacaacgcacaggcagtcagcggctttggcgcacgcagctggctggtcgacaacgcaacccctcctgaacccatgcccatgacccaggtagcggcccgtatgatgaaattcgatttcgatatcgatccgcttaaaatttgggcgacgagtggacagttcacgcagtcttccatggattggcatggggcaacgagcacggatctggcggcgtttcgtgatcgcgggggtaagatgattttatatcacggaatgtcagatgccgccttctccgccctggatacagccgattactatgagcgtttaggggcggcaatgccgggcgcggctgggttcgcccgccttttcctggtaccgggtatgaatcactgctctggcggcccaggcaccgatcgctttgatatgcttacgccgctggttgcctgggtggaacgcggtgaagccccggatcagattagtgcctggagcggcacaccaggctatttcggtgtagccgctcgtacccgcccgctgtgcccgtatcctcaaatcgcacgctataagggctccggtgatatcaacacggaagcgaattttgcgtgtgccgcccctccgtaagctttgagacc" . + . + . + . \ No newline at end of file diff --git a/test/test_files/sbol3_collection.nt b/test/test_files/sbol3_collection.nt new file mode 100644 index 00000000..cc65b569 --- /dev/null +++ b/test/test_files/sbol3_collection.nt @@ -0,0 +1,22 @@ + . + . + . + "col1" . + . + . + "LacI protein" . + "LacI" . + . + . + "LacI_protein" . + . + . + "TetR protein" . + "TetR" . + . + . + "TetR_protein" . + . + "1" . + "1" . + "1" . diff --git a/test/test_files/sbol3_genbank_conversion/BBa_J23101_from_genbank_to_sbol3_direct.nt b/test/test_files/sbol3_genbank_conversion/BBa_J23101_from_genbank_to_sbol3_direct.nt new file mode 100644 index 00000000..11cebba2 --- /dev/null +++ b/test/test_files/sbol3_genbank_conversion/BBa_J23101_from_genbank_to_sbol3_direct.nt @@ -0,0 +1,27 @@ + + "constitutive promoter family member" . + "BBa_J23101" . + . + . + . + . + . + "BBa_J23101" . + "01-JAN-1980" . + "UNK" . + "BBa_J23101.1" . + "" . + "BBa_J23101" . + "DNA" . + "BBa_J23101" . + "." . + "1"^^ . + "" . + "" . + "linear" . + . + "BBa_J23101_sequence" . + "tttacagctagctcagtcctaggtattatgctagc" . + . + . + . 
diff --git a/test/test_files/sbol3_genbank_conversion/BBa_J23101_from_sbol3_direct.gb b/test/test_files/sbol3_genbank_conversion/BBa_J23101_from_sbol3_direct.gb new file mode 100644 index 00000000..99d07a87 --- /dev/null +++ b/test/test_files/sbol3_genbank_conversion/BBa_J23101_from_sbol3_direct.gb @@ -0,0 +1,12 @@ +LOCUS BBa_J23101 35 bp DNA linear UNK 01-JAN-1980 +DEFINITION constitutive promoter family member. +ACCESSION BBa_J23101 +VERSION BBa_J23101.1 +KEYWORDS . +SOURCE . + ORGANISM . + . +FEATURES Location/Qualifiers +ORIGIN + 1 tttacagcta gctcagtcct aggtattatg ctagc +// diff --git a/test/test_files/sbol3_genbank_conversion/feature_qualifier_storage.gb b/test/test_files/sbol3_genbank_conversion/feature_qualifier_storage.gb new file mode 100644 index 00000000..430753a7 --- /dev/null +++ b/test/test_files/sbol3_genbank_conversion/feature_qualifier_storage.gb @@ -0,0 +1,147 @@ +LOCUS BBF10K_000034 2828 bp DNA circular UNK 25-DEC-2019 +DEFINITION (NOTE: This is a Modified version of the BBF10K_000034.gb GenBank + File) Yeast promoter Adh in pOpen-v3. +ACCESSION BBF10K_000034 +VERSION BBF10K_000034.1 +KEYWORDS BBFD_0017; FreeGenes; Golden Gate Assembly; Open Yeast Collection + Part; OpenMTA. +SOURCE synthetic DNA construct + ORGANISM recombinant plasmid + . 
+REFERENCE 1 (bases 1 to 2001) + AUTHORS Keoni Gandall + TITLE Direct Submission +REFERENCE 2 (bases 2002 to 2828) + AUTHORS Scott Pownall, Open Science Network + TITLE Direct Submission + JOURNAL https://biobricks.org/freegenes/ +FEATURES Location/Qualifiers + source 1..2001 + /label="first_feature" + /organism="recombinant plasmid" + /mol_type="other DNA" + /original_record="pOpen_v3.1" + primer_bind complement(24..42) + /label="T7 Reverse" + /note="/ApEinfo_fwdcolor=#b7e6d7 /ApEinfo_revcolor=#b7e6d7" + /original_record="pOpen_v3.1" + terminator 38..85 + /label="T7 terminator" + /note="/ApEinfo_fwdcolor=#d6b295 /ApEinfo_revcolor=#d6b295" + /original_record="pOpen_v3.1" + primer_bind complement(86..106) + /label="M13 rev" + /note="/ApEinfo_fwdcolor=#ff9ccd /ApEinfo_revcolor=#ff9ccd" + /original_record="pOpen_v3.1" + misc_feature 107..121 + /label="AarI" + /note="/ApEinfo_fwdcolor=#b1ff67 /ApEinfo_revcolor=#b1ff67" + /original_record="pOpen_v3.1" + misc_feature 122..172 + /label="spacer" + /note="/ApEinfo_fwdcolor=#b1ff67 /ApEinfo_revcolor=#b1ff67" + /original_record="pOpen_v3.1" + rep_origin complement(173..761) + /direction=LEFT + /label="ori" + /note="/ApEinfo_fwdcolor=#84b0dc /ApEinfo_revcolor=#84b0dc" + /original_record="pOpen_v3.1" + CDS complement(952..1812) + /label="AmpR" + /note="/ApEinfo_fwdcolor=#ff9ccd /ApEinfo_revcolor=#ff9ccd" + /translation="MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYI + ELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRIDAGQEQLGRRIHYSQNDLVEYS + PVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRW + EPELNEAIPNDERDTTMPVAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSA + LPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGAS + LIKHW*" + /original_record="pOpen_v3.1" + misc_feature 1918..1932 + /label="AarI" + /note="/ApEinfo_fwdcolor=#d6b295 /ApEinfo_revcolor=#d6b295" + /original_record="pOpen_v3.1" + primer_bind 1933..1949 + /label="M13 fwd" + /note="/ApEinfo_fwdcolor=#84b0dc /ApEinfo_revcolor=#84b0dc" + /original_record="pOpen_v3.1" + 
terminator 1950..1981 + /label="tonB terminator" + /note="/ApEinfo_fwdcolor=#b4abac /ApEinfo_revcolor=#b4abac" + /original_record="pOpen_v3.1" + misc_feature 2002..2021 + /label="FreeGenes dialout primer" + /ApEinfo_fwdcolor="#0048BA" + /ApEinfo_revcolor="#0048BA" + protein_bind join(2022..2027,2029..2032) + /label="BsaI" + /ApEinfo_fwdcolor="#00FF2C" + /ApEinfo_revcolor="#00FF2C" + promoter 2033..2737 + /label="Adh1 Promoter" + /original_record="part" + protein_bind complement(join(2738..2741,2743..2748)) + /label="BsaI" + /ApEinfo_fwdcolor="#00FF2C" + /ApEinfo_revcolor="#00FF2C" + misc_feature complement(2749..2768) + /label="FreeGenes dialout primer" + /ApEinfo_fwdcolor="#0048BA" + /ApEinfo_revcolor="#0048BA" + misc_feature 2769..2828 + /label="FreeGenes barcode" + /ApEinfo_fwdcolor="#FF7E00" + /ApEinfo_revcolor="#FF7E00" + protein_bind 2790..2807 + /label="I-SceI" + /ApEinfo_fwdcolor="#4F0080" + /ApEinfo_revcolor="#4F0080" +ORIGIN + 1 cgctgaggtg tcaatcgtcg gagccgctga gcaataacta gcataacccc ttggggcctc + 61 taaacgggtc ttgaggggtt ttttgcatgg tcatagctgt ttcctgagag cttggcaggt + 121 gatgacacac attaacaaat ttcgtgagga gtctccagaa gaatgccatt aatttccata + 181 ggctccgccc ccctgacgag catcacaaaa atcgacgctc aagtcagagg tggcgaaacc + 241 cgacaggact ataaagatac caggcgtttc cccctggaag ctccctcgtg cgctctcctg + 301 ttccgaccct gccgcttacc ggatacctgt ccgcctttct cccttcggga agcgtggcgc + 361 tttctcatag ctcacgctgt aggtatctca gttcggtgta ggtcgttcgc tccaagctgg + 421 gctgtgtgca cgaacccccc gttcagcccg accgctgcgc cttatccggt aactatcgtc + 481 ttgagtccaa cccggtaaga cacgacttat cgccactggc agcagccact ggtaacagga + 541 ttagcagagc gaggtatgta ggcggtgcta cagagttctt gaagtggtgg cctaactacg + 601 gctacactag aagaacagta tttggtatct gcgctctgct gaagccagtt accttcggaa + 661 aaagagttgg tagctcttga tccggcaaac aaaccaccgc tggtagcggt ggtttttttg + 721 tttgcaagca gcagattacg cgcagaaaaa aaggatctca agaaggccta ctattagcaa + 781 caacgatcct ttgatctttt ctacggggtc tgacgctcag tggaacgaaa actcacgtta + 841 agggattttg gtcatgagat tatcaaaaag 
gatcttcacc tagatccttt taaattaaaa + 901 atgaagtttt aaatcaatct aaagtatata tgagtaaact tggtctgaca gttaccaatg + 961 cttaatcagt gaggcaccta tctcagcgat ctgtctattt cgttcatcca tagttgcctg + 1021 actccccgtc gtgtagataa ctacgatacg ggagggctta ccatctggcc ccagtgctgc + 1081 aatgataccg cgagaaccac gctcaccggc tccagattta tcagcaataa accagccagc + 1141 cggaagggcc gagcgcagaa gtggtcctgc aactttatcc gcctccatcc agtctattaa + 1201 ttgttgccgg gaagctagag taagtagttc gccagttaat agtttgcgca acgttgttgc + 1261 cattgctaca ggcatcgtgg tgtcacgctc gtcgtttggt atggcttcat tcagctccgg + 1321 ttcccaacga tcaaggcgag ttacatgatc ccccatgttg tgcaaaaaag cggttagctc + 1381 cttcggtcct ccgatcgttg tcagaagtaa gttggccgca gtgttatcac tcatggttat + 1441 ggcagcactg cataattctc ttactgtcat gccatccgta agatgctttt ctgtgactgg + 1501 tgagtactca accaagtcat tctgagaata gtgtatgcgg cgaccgagtt gctcttgccc + 1561 ggcgtcaata cgggataata ccgcgccaca tagcagaact ttaaaagtgc tcatcattgg + 1621 aaaacgttct tcggggcgaa aactctcaag gatcttaccg ctgttgagat ccagttcgat + 1681 gtaacccact cgtgcaccca actgatcttc agcatctttt actttcacca gcgtttctgg + 1741 gtgagcaaaa acaggaaggc aaaatgccgc aaaaaaggga ataagggcga cacggaaatg + 1801 ttgaatactc atactcttcc tttttcaata ttattgaagc atttatcagg gttattgtct + 1861 catgagcgga tacatatttg aatgtattta gaaaaataaa caaatagggg ttccgcgcac + 1921 ctgcaccagt cagtaaaacg acggccagta gtcaaaagcc tccgaccgga ggcttttgac + 1981 ttggttcagg tggagtggga gaacgatgat gctcactctc gggtctctgg agatcctttt + 2041 gttgtttccg ggtgtacaat atggacttcc tcttttctgg caaccaaacc catacatcgg + 2101 gattcctata ataccttcgt tggactccct aacatgtagg tggcggaggg gagatataca + 2161 atagaacaga taccagacaa gacataatgg gctaaacaag actacaccaa ttacactgcc + 2221 tcattgatgg tggtacataa cgaactaata ctgtagccct agacttgata gccatcatca + 2281 tatcgaagtt tcactaccct ttttccattt gccatctatt gaagtaataa taggcgcatg + 2341 caacttcttt tctttttttt tcttttctct ctcccccgtt gttgtctcac catatccgca + 2401 atgacaaaaa aatgatggaa gtcactaaag gaaaaaatta acgacaaaga cagcaccaac + 2461 agatgtcgtt gttccagagc tgatgagggg tatctcgaag cacacgaaac 
tttttccttc + 2521 cttcattcac gcacactact ctctaatgag caacggtata cggccttcct tccagttact + 2581 tgaatttgaa ataaaaaaaa gtttgctgtc ttgctatcaa gtataaatag acctgcaatt + 2641 attaatcttt tgtttcctcg tcattgttct cgttcccttt cttccttgtt tctttttctg + 2701 cacaatattt caagctatac caagcataca atcaactaat gtgagaccac ataagcgatc + 2761 ccaaggtcag ggcatggcgc aaatacagat agggataaca gggtaatctg cttcttatgg + 2821 accaaagt +// diff --git a/test/test_files/sbol3_genbank_conversion/iGEM_BBF10K_000475_modified.gb b/test/test_files/sbol3_genbank_conversion/iGEM_BBF10K_000475_modified.gb new file mode 100644 index 00000000..da1d9e22 --- /dev/null +++ b/test/test_files/sbol3_genbank_conversion/iGEM_BBF10K_000475_modified.gb @@ -0,0 +1,170 @@ +LOCUS BBF10K_000475 3448 bp DNA circular UNK 09-JUN-2018 +DEFINITION (NOTE: This is a Modified version of the BBF10K_000475 GenBank File) + Yeast marker cassette Zeo in pOpen-v3. +ACCESSION BBF10K_000475 +VERSION BBF10K_000475.1 +KEYWORDS BBFD_0017; FreeGenes; Golden Gate Assembly; Open Yeast Collection + Part; OpenMTA. +SOURCE synthetic DNA construct + ORGANISM recombinant plasmid + . 
+REFERENCE 1 (bases 1 to 2001) + AUTHORS Keoni Gandall + TITLE Direct Submission +REFERENCE 2 (bases 2002 to 3448) + AUTHORS Scott Pownall, Open Science Network + TITLE Direct Submission + JOURNAL https://biobricks.org/freegenes/ +FEATURES Location/Qualifiers + source 1..2001 + /organism="recombinant plasmid" + /mol_type="other DNA" + /original_record="pOpen_v3.1" + primer_bind complement(24..42) + /label="T7 Reverse" + /note="/ApEinfo_fwdcolor=#b7e6d7 /ApEinfo_revcolor=#b7e6d7" + /original_record="pOpen_v3.1" + terminator 38..85 + /label="T7 terminator" + /note="/ApEinfo_fwdcolor=#d6b295 /ApEinfo_revcolor=#d6b295" + /original_record="pOpen_v3.1" + primer_bind complement(86..106) + /label="M13 rev" + /note="/ApEinfo_fwdcolor=#ff9ccd /ApEinfo_revcolor=#ff9ccd" + /original_record="pOpen_v3.1" + misc_feature 107..121 + /label="AarI" + /note="/ApEinfo_fwdcolor=#b1ff67 /ApEinfo_revcolor=#b1ff67" + /original_record="pOpen_v3.1" + misc_feature 122..172 + /label="spacer" + /note="/ApEinfo_fwdcolor=#b1ff67 /ApEinfo_revcolor=#b1ff67" + /original_record="pOpen_v3.1" + rep_origin complement(173..761) + /direction=LEFT + /label="ori" + /note="/ApEinfo_fwdcolor=#84b0dc /ApEinfo_revcolor=#84b0dc" + /original_record="pOpen_v3.1" + CDS complement(952..1812) + /label="AmpR" + /note="/ApEinfo_fwdcolor=#ff9ccd /ApEinfo_revcolor=#ff9ccd" + /translation="MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYI + ELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRIDAGQEQLGRRIHYSQNDLVEYS + PVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRW + EPELNEAIPNDERDTTMPVAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSA + LPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGAS + LIKHW*" + /original_record="pOpen_v3.1" + misc_feature 1918..1932 + /label="AarI" + /note="/ApEinfo_fwdcolor=#d6b295 /ApEinfo_revcolor=#d6b295" + /original_record="pOpen_v3.1" + primer_bind 1933..1949 + /label="M13 fwd" + /note="/ApEinfo_fwdcolor=#84b0dc /ApEinfo_revcolor=#84b0dc" + /original_record="pOpen_v3.1" + terminator 1950..1981 + 
/label="tonB terminator" + /note="/ApEinfo_fwdcolor=#b4abac /ApEinfo_revcolor=#b4abac" + /original_record="pOpen_v3.1" + misc_feature 2002..2021 + /label="FreeGenes dialout primer" + /ApEinfo_fwdcolor="#0048BA" + /ApEinfo_revcolor="#0048BA" + source 2002..3448 + /organism="synthetic DNA construct" + /lab_host="Saccharomyces cerevisiae" + /mol_type="other DNA" + /original_record="part" + protein_bind join(2022..2027,2029..2032) + /label="BsaI" + /ApEinfo_fwdcolor="#00FF2C" + /ApEinfo_revcolor="#00FF2C" + promoter 2033..2732 + /label="ScPGK1 Promoter" + /original_record="part" + CDS 2733..3107 + /label="ZeocinR" + /translation="MAKLTSAVPVLTARDVAGAVEFWTDRLGFSRDFVEDDFAGVVRDD + VTLFISAVQDQVVPDNTLAWVWVRGLDELYAEWSEVVSTNFRDASGPAMTEIGEQPWGR + EFALRDPAGNCVHFVAEEQD" + /original_record="part" + terminator 3108..3357 + /label="ScPRM9 Terminator" + /original_record="part" + protein_bind complement(join(3358..3361,3363..3368)) + /label="BsaI" + /ApEinfo_fwdcolor="#00FF2C" + /ApEinfo_revcolor="#00FF2C" + misc_feature complement(3369..3388) + /label="FreeGenes dialout primer" + /ApEinfo_fwdcolor="#0048BA" + /ApEinfo_revcolor="#0048BA" + misc_feature 3389..3448 + /label="FreeGenes barcode" + /ApEinfo_fwdcolor="#FF7E00" + /ApEinfo_revcolor="#FF7E00" + protein_bind 3410..3427 + /label="I-SceI" + /ApEinfo_fwdcolor="#4F0080" + /ApEinfo_revcolor="#4F0080" +ORIGIN + 1 cgctgaggtg tcaatcgtcg gagccgctga gcaataacta gcataacccc ttggggcctc + 61 taaacgggtc ttgaggggtt ttttgcatgg tcatagctgt ttcctgagag cttggcaggt + 121 gatgacacac attaacaaat ttcgtgagga gtctccagaa gaatgccatt aatttccata + 181 ggctccgccc ccctgacgag catcacaaaa atcgacgctc aagtcagagg tggcgaaacc + 241 cgacaggact ataaagatac caggcgtttc cccctggaag ctccctcgtg cgctctcctg + 301 ttccgaccct gccgcttacc ggatacctgt ccgcctttct cccttcggga agcgtggcgc + 361 tttctcatag ctcacgctgt aggtatctca gttcggtgta ggtcgttcgc tccaagctgg + 421 gctgtgtgca cgaacccccc gttcagcccg accgctgcgc cttatccggt aactatcgtc + 481 ttgagtccaa cccggtaaga cacgacttat cgccactggc agcagccact 
ggtaacagga + 541 ttagcagagc gaggtatgta ggcggtgcta cagagttctt gaagtggtgg cctaactacg + 601 gctacactag aagaacagta tttggtatct gcgctctgct gaagccagtt accttcggaa + 661 aaagagttgg tagctcttga tccggcaaac aaaccaccgc tggtagcggt ggtttttttg + 721 tttgcaagca gcagattacg cgcagaaaaa aaggatctca agaaggccta ctattagcaa + 781 caacgatcct ttgatctttt ctacggggtc tgacgctcag tggaacgaaa actcacgtta + 841 agggattttg gtcatgagat tatcaaaaag gatcttcacc tagatccttt taaattaaaa + 901 atgaagtttt aaatcaatct aaagtatata tgagtaaact tggtctgaca gttaccaatg + 961 cttaatcagt gaggcaccta tctcagcgat ctgtctattt cgttcatcca tagttgcctg + 1021 actccccgtc gtgtagataa ctacgatacg ggagggctta ccatctggcc ccagtgctgc + 1081 aatgataccg cgagaaccac gctcaccggc tccagattta tcagcaataa accagccagc + 1141 cggaagggcc gagcgcagaa gtggtcctgc aactttatcc gcctccatcc agtctattaa + 1201 ttgttgccgg gaagctagag taagtagttc gccagttaat agtttgcgca acgttgttgc + 1261 cattgctaca ggcatcgtgg tgtcacgctc gtcgtttggt atggcttcat tcagctccgg + 1321 ttcccaacga tcaaggcgag ttacatgatc ccccatgttg tgcaaaaaag cggttagctc + 1381 cttcggtcct ccgatcgttg tcagaagtaa gttggccgca gtgttatcac tcatggttat + 1441 ggcagcactg cataattctc ttactgtcat gccatccgta agatgctttt ctgtgactgg + 1501 tgagtactca accaagtcat tctgagaata gtgtatgcgg cgaccgagtt gctcttgccc + 1561 ggcgtcaata cgggataata ccgcgccaca tagcagaact ttaaaagtgc tcatcattgg + 1621 aaaacgttct tcggggcgaa aactctcaag gatcttaccg ctgttgagat ccagttcgat + 1681 gtaacccact cgtgcaccca actgatcttc agcatctttt actttcacca gcgtttctgg + 1741 gtgagcaaaa acaggaaggc aaaatgccgc aaaaaaggga ataagggcga cacggaaatg + 1801 ttgaatactc atactcttcc tttttcaata ttattgaagc atttatcagg gttattgtct + 1861 catgagcgga tacatatttg aatgtattta gaaaaataaa caaatagggg ttccgcgcac + 1921 ctgcaccagt cagtaaaacg acggccagta gtcaaaagcc tccgaccgga ggcttttgac + 1981 ttggttcagg tggagtggga gaagaattac tgacccctcg gggtctcaaa gggtgagtaa + 2041 ggaaagagtg aggaactatc gcatacctgc atttaaagat gccgatttgg gcgcgaatcc + 2101 tttattttgg cttcaccctc atactattat cagggccaga aaaaggaagt gtttccctcc + 2161 ttcttgaatt 
gatgttaccc tcataaagca cgtggcctct tatcgagaaa gaaattaccg + 2221 tcgctcgtga tttgtttgca aaaagaacaa aactgaaaaa acccagacac gctcgacttc + 2281 ctgtcatcct attgattgca gcttccaatt tcgtcacaca acaaggtcct agcgacggct + 2341 cacaggtttt gtaacaagca atcgaaggtt ctggaatggc gggaaagggt ttagtaccac + 2401 atgctatgat gcccactgtg atctccagag caaagttcgt tcgatcgtac tgttactctc + 2461 tctctttcaa acagaattgt ccgaatcgtg tgacaacaac agcctgttct cacacactct + 2521 tttcttctaa ccaagggggt ggtttagttt agtagaacct cgtgaaactt acatttacat + 2581 atatataaac ttgcataaat tggtcaatgc aagaaataca tatttggtct tttctaattc + 2641 gtagtttttc aagttcttag atgctttctt tttctctttt ttacagatca tcaaggaagt + 2701 aattatctac tttttacaac aaatataaaa caatggctaa attaacatct gccgttcctg + 2761 ttttaacagc tagggatgtt gcaggagctg tagagttttg gacagatagg ttaggattct + 2821 caagagactt tgttgaggac gattttgctg gtgttgtcag ggatgacgtt actttattta + 2881 tctcagcagt ccaagatcaa gttgtccctg ataatacatt ggcttgggtc tgggtcaggg + 2941 gtttagatga attatatgct gaatggtcag aagttgtatc tacaaacttc agagatgctt + 3001 ctggtccagc tatgaccgag attggtgaac agccatgggg tagagaattt gctttgagag + 3061 atccagctgg aaattgtgtt cattttgttg ctgaagaaca agattaaaca gatgacggga + 3121 gacactagca cacaacttta ccaggcaagg tatttgacgc tagcatgtgt ccaattcagt + 3181 gtcatttatg attttttgta gtaggatata aatatataca gcgctccaaa tagtgcggtt + 3241 gccccaaaaa caccacggaa cctcatctgt tctcgtactt tgttgtgaca aagtagctca + 3301 ctgccttatt atcacatttt cattatgcaa cgcttcggaa aatacgatgt tgaaaatatg + 3361 atgagaccac aggaagcaag gtatacgccg ctgtgtaatt ttcctgcatt agggataaca + 3421 gggtaatttc aggcaaggca gcatcaga +// diff --git a/test/test_files/sbol3_genbank_conversion/iGEM_SBOL2_imports_from_genbank_to_sbol3_direct.nt b/test/test_files/sbol3_genbank_conversion/iGEM_SBOL2_imports_from_genbank_to_sbol3_direct.nt new file mode 100644 index 00000000..5ee6b651 --- /dev/null +++ b/test/test_files/sbol3_genbank_conversion/iGEM_SBOL2_imports_from_genbank_to_sbol3_direct.nt @@ -0,0 +1,284 @@ + "constitutive promoter family member" . + "BBa_J23100" . + . + . 
+ . + . + . + "BBa_J23100" . + "01-JAN-1980" . + "UNK" . + "BBa_J23100.1" . + "" . + "BBa_J23100" . + "DNA" . + "BBa_J23100" . + "." . + "1"^^ . + "" . + "" . + "linear" . + . + "BBa_J23100_sequence" . + "ttgacggctagctcagtcctaggtacagtgctagc" . + . + . + . + "constitutive promoter family member" . + "BBa_J23102" . + . + . + . + . + . + "BBa_J23102" . + "01-JAN-1980" . + "UNK" . + "BBa_J23102.1" . + "" . + "BBa_J23102" . + "DNA" . + "BBa_J23102" . + "." . + "1"^^ . + "" . + "" . + "linear" . + . + "BBa_J23102_sequence" . + "ttgacagctagctcagtcctaggtactgtgctagc" . + . + . + . + "" . + "LmrA" . + . + . + . + . + . + "LmrA" . + "01-JAN-1980" . + "UNK" . + "LmrA.1" . + "" . + "LmrA" . + "DNA" . + "LmrA" . + "." . + "1"^^ . + "" . + "" . + "linear" . + . + "LmrA_sequence" . + "atgagctatggtgatagccgtgaaaaaattctgagcgcagcaacccgtctgtttcagctgcagggttattatggcaccggtctgaatcagattatcaaagaaagcggtgcaccgaaaggtagcctgtattatcattttccgggtggtaaagaacagctggcaattgaagcagtgaacgaaatgaaagaatatatccgccagaaaatcgccgattgtatggaagcatgtaccgatccggcagaaggtattcaggcatttctgaaagaactgagctgtcagtttagctgtaccgaagatattgaaggtctgccggttggtctgctggcagcagaaaccagcctgaaaagcgaaccgctgcgtgaagcatgtcatgaagcatataaagaatgggccagcgtgtatgaagaaaaactgcgtcagaccggttgtagcgaaagccgtgcaaaagaagcaagcaccgttgttaatgcaatgattgaaggtggtattctgctgagcctgaccgcaaaaaatagcacaccgctgctgcatattagcagctgtattccggatctgctgaaacgttaa" . + . + . + . + "Range1" . + "21"^^ . + . + . + "1"^^ . + . + "Range1" . + "2012"^^ . + . + . + "2009"^^ . + . + "SequenceFeature10" . + . + "end of terminator" . + . + . + "0:label" . + "0:end of terminator" . + . + "Range1" . + "2041"^^ . + . + . + "2022"^^ . + . + "SequenceFeature11" . + . + "Stem loop" . + . + . + "0:label" . + "0:Stem loop" . + . + "Range1" . + "2072"^^ . + . + . + "2051"^^ . + . + "SequenceFeature12" . + . + "BioBrick prefix" . + . + . + "0:label" . + "0:BioBrick prefix" . + . + "SequenceFeature1" . + . + "BioBrick suffix" . + . + . + "0:label" . + "0:BioBrick suffix" . + . + "Range1" . + "93"^^ . + . + . + "22"^^ . + . 
+ "SequenceFeature2" . + . + "E. coli his operon terminator" . + . + . + "0:label" . + "0:E. coli his operon terminator" . + . + "Range1" . + "64"^^ . + . + . + "30"^^ . + . + "SequenceFeature3" . + . + "Stem loop" . + . + . + "0:label" . + "0:Stem loop" . + . + "Range1" . + "176"^^ . + . + . + "157"^^ . + . + "SequenceFeature4" . + . + "Verification reverse (VR) primer binding site" . + . + . + "0:label" . + "0:Verification reverse (VR) primer binding site" . + . + "Range1" . + "875"^^ . + . + . + "261"^^ . + . + "SequenceFeature5" . + . + "rep (pMB1)" . + . + . + "0:label" . + "0:rep (pMB1)" . + . + "Range1" . + "276"^^ . + . + . + "276"^^ . + . + "SequenceFeature6" . + . + "ORI" . + . + . + "0:label" . + "0:ORI" . + . + "Range1" . + "1147"^^ . + . + . + "1042"^^ . + . + "SequenceFeature7" . + . + "T0 terminator" . + . + . + "0:label" . + "0:T0 terminator" . + . + "Range1" . + "1819"^^ . + . + . + "1160"^^ . + . + "SequenceFeature8" . + . + "Chloramphenicol resistance" . + . + . + "0:label" . + "0:Chloramphenicol resistance" . + . + "Range1" . + "1952"^^ . + . + . + "1933"^^ . + . + "SequenceFeature9" . + . + "Verification forward (VF2) primer binding site" . + . + . + "0:label" . + "0:Verification forward (VF2) primer binding site" . + . + "High copy BioBrick assembly plasmid" . + "pSB1C3" . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + "pSB1C3" . + "01-JAN-1980" . + "UNK" . + "pSB1C3.1" . + "" . + "pSB1C3" . + "DNA" . + "pSB1C3" . + "." . + "1"^^ . + "" . + "" . + "linear" . + . + "pSB1C3_sequence" . 
+ "tactagtagcggccgctgcagtccggcaaaaaagggcaaggtgtcaccaccctgccctttttctttaaaaccgaaaagattacttcgcgttatgcaggcttcctcgctcactgactcgctgcgctcggtcgttcggctgcggcgagcggtatcagctcactcaaaggcggtaatacggttatccacagaatcaggggataacgcaggaaagaacatgtgagcaaaaggccagcaaaaggccaggaaccgtaaaaaggccgcgttgctggcgtttttccacaggctccgcccccctgacgagcatcacaaaaatcgacgctcaagtcagaggtggcgaaacccgacaggactataaagataccaggcgtttccccctggaagctccctcgtgcgctctcctgttccgaccctgccgcttaccggatacctgtccgcctttctcccttcgggaagcgtggcgctttctcatagctcacgctgtaggtatctcagttcggtgtaggtcgttcgctccaagctgggctgtgtgcacgaaccccccgttcagcccgaccgctgcgccttatccggtaactatcgtcttgagtccaacccggtaagacacgacttatcgccactggcagcagccactggtaacaggattagcagagcgaggtatgtaggcggtgctacagagttcttgaagtggtggcctaactacggctacactagaagaacagtatttggtatctgcgctctgctgaagccagttaccttcggaaaaagagttggtagctcttgatccggcaaacaaaccaccgctggtagcggtggtttttttgtttgcaagcagcagattacgcgcagaaaaaaaggatctcaagaagatcctttgatcttttctacggggtctgacgctcagtggaacgaaaactcacgttaagggattttggtcatgagattatcaaaaaggatcttcacctagatccttttaaattaaaaatgaagttttaaatcaatctaaagtatatatgagtaaacttggtctgacagctcgaggcttggattctcaccaataaaaaacgcccggcggcaaccgagcgttctgaacaaatccagatggagttctgaggtcattactggatctatcaacaggagtccaagcgagctcgatatcaaattacgccccgccctgccactcatcgcagtactgttgtaattcattaagcattctgccgacatggaagccatcacaaacggcatgatgaacctgaatcgccagcggcatcagcaccttgtcgccttgcgtataatatttgcccatggtgaaaacgggggcgaagaagttgtccatattggccacgtttaaatcaaaactggtgaaactcacccagggattggctgagacgaaaaacatattctcaataaaccctttagggaaataggccaggttttcaccgtaacacgccacatcttgcgaatatatgtgtagaaactgccggaaatcgtcgtggtattcactccagagcgatgaaaacgtttcagtttgctcatggaaaacggtgtaacaagggtgaacactatcccatatcaccagctcaccgtctttcattgccatacgaaattccggatgagcattcatcaggcgggcaagaatgtgaataaaggccggataaaacttgtgcttatttttctttacggtctttaaaaaggccgtaatatccagctgaacggtctggttataggtacattgagcaactgactgaaatgcctcaaaatgttctttacgatgccattgggatatatcaacggtggtatatccagtgatttttttctccattttagcttccttagctcctgaaaatctcgataactcaaaaaatacgcccggtagtgatcttatttcattatggtgaaagttggaacctcttacgtgcccgatcaactcgagtgccacctgacgtctaagaaaccattattatcatgacattaacctataaaaataggcgtatcacgag
gcagaatttcagataaaaaaaatccttagctttcgctaaggatgatttctggaattcgcggccgcttctagag" . + . + . + . diff --git a/test/test_files/sbol3_genbank_conversion/iGEM_SBOL2_imports_from_sbol3_direct.gb b/test/test_files/sbol3_genbank_conversion/iGEM_SBOL2_imports_from_sbol3_direct.gb new file mode 100644 index 00000000..60988f32 --- /dev/null +++ b/test/test_files/sbol3_genbank_conversion/iGEM_SBOL2_imports_from_sbol3_direct.gb @@ -0,0 +1,115 @@ +LOCUS BBa_J23100 35 bp DNA linear UNK 01-JAN-1980 +DEFINITION constitutive promoter family member. +ACCESSION BBa_J23100 +VERSION BBa_J23100.1 +KEYWORDS . +SOURCE . + ORGANISM . + . +FEATURES Location/Qualifiers +ORIGIN + 1 ttgacggcta gctcagtcct aggtacagtg ctagc +// +LOCUS BBa_J23102 35 bp DNA linear UNK 01-JAN-1980 +DEFINITION constitutive promoter family member. +ACCESSION BBa_J23102 +VERSION BBa_J23102.1 +KEYWORDS . +SOURCE . + ORGANISM . + . +FEATURES Location/Qualifiers +ORIGIN + 1 ttgacagcta gctcagtcct aggtactgtg ctagc +// +LOCUS LmrA 567 bp DNA linear UNK 01-JAN-1980 +DEFINITION . +ACCESSION LmrA +VERSION LmrA.1 +KEYWORDS . +SOURCE . + ORGANISM . + . +FEATURES Location/Qualifiers +ORIGIN + 1 atgagctatg gtgatagccg tgaaaaaatt ctgagcgcag caacccgtct gtttcagctg + 61 cagggttatt atggcaccgg tctgaatcag attatcaaag aaagcggtgc accgaaaggt + 121 agcctgtatt atcattttcc gggtggtaaa gaacagctgg caattgaagc agtgaacgaa + 181 atgaaagaat atatccgcca gaaaatcgcc gattgtatgg aagcatgtac cgatccggca + 241 gaaggtattc aggcatttct gaaagaactg agctgtcagt ttagctgtac cgaagatatt + 301 gaaggtctgc cggttggtct gctggcagca gaaaccagcc tgaaaagcga accgctgcgt + 361 gaagcatgtc atgaagcata taaagaatgg gccagcgtgt atgaagaaaa actgcgtcag + 421 accggttgta gcgaaagccg tgcaaaagaa gcaagcaccg ttgttaatgc aatgattgaa + 481 ggtggtattc tgctgagcct gaccgcaaaa aatagcacac cgctgctgca tattagcagc + 541 tgtattccgg atctgctgaa acgttaa +// +LOCUS pSB1C3 2070 bp DNA linear UNK 01-JAN-1980 +DEFINITION High copy BioBrick assembly plasmid. +ACCESSION pSB1C3 +VERSION pSB1C3.1 +KEYWORDS . +SOURCE . + ORGANISM . + . 
+FEATURES Location/Qualifiers + misc_feature 1..21 + /label="BioBrick suffix" + stem_loop 22..93 + /label="E. coli his operon terminator" + stem_loop 30..64 + /label="Stem loop" + primer_bind 157..176 + /label="Verification reverse (VR) primer binding site" + misc_feature 261..875 + /label="rep (pMB1)" + misc_feature 276 + /label="ORI" + stem_loop 1042..1147 + /label="T0 terminator" + CDS 1160..1819 + /label="Chloramphenicol resistance" + primer_bind 1933..1952 + /label="Verification forward (VF2) primer binding site" + stem_loop 2009..2012 + /label="end of terminator" + stem_loop 2022..2041 + /label="Stem loop" + misc_feature 2051..2072 + /label="BioBrick prefix" +ORIGIN + 1 tactagtagc ggccgctgca gtccggcaaa aaagggcaag gtgtcaccac cctgcccttt + 61 ttctttaaaa ccgaaaagat tacttcgcgt tatgcaggct tcctcgctca ctgactcgct + 121 gcgctcggtc gttcggctgc ggcgagcggt atcagctcac tcaaaggcgg taatacggtt + 181 atccacagaa tcaggggata acgcaggaaa gaacatgtga gcaaaaggcc agcaaaaggc + 241 caggaaccgt aaaaaggccg cgttgctggc gtttttccac aggctccgcc cccctgacga + 301 gcatcacaaa aatcgacgct caagtcagag gtggcgaaac ccgacaggac tataaagata + 361 ccaggcgttt ccccctggaa gctccctcgt gcgctctcct gttccgaccc tgccgcttac + 421 cggatacctg tccgcctttc tcccttcggg aagcgtggcg ctttctcata gctcacgctg + 481 taggtatctc agttcggtgt aggtcgttcg ctccaagctg ggctgtgtgc acgaaccccc + 541 cgttcagccc gaccgctgcg ccttatccgg taactatcgt cttgagtcca acccggtaag + 601 acacgactta tcgccactgg cagcagccac tggtaacagg attagcagag cgaggtatgt + 661 aggcggtgct acagagttct tgaagtggtg gcctaactac ggctacacta gaagaacagt + 721 atttggtatc tgcgctctgc tgaagccagt taccttcgga aaaagagttg gtagctcttg + 781 atccggcaaa caaaccaccg ctggtagcgg tggttttttt gtttgcaagc agcagattac + 841 gcgcagaaaa aaaggatctc aagaagatcc tttgatcttt tctacggggt ctgacgctca + 901 gtggaacgaa aactcacgtt aagggatttt ggtcatgaga ttatcaaaaa ggatcttcac + 961 ctagatcctt ttaaattaaa aatgaagttt taaatcaatc taaagtatat atgagtaaac + 1021 ttggtctgac agctcgaggc ttggattctc accaataaaa aacgcccggc ggcaaccgag + 1081 cgttctgaac 
aaatccagat ggagttctga ggtcattact ggatctatca acaggagtcc + 1141 aagcgagctc gatatcaaat tacgccccgc cctgccactc atcgcagtac tgttgtaatt + 1201 cattaagcat tctgccgaca tggaagccat cacaaacggc atgatgaacc tgaatcgcca + 1261 gcggcatcag caccttgtcg ccttgcgtat aatatttgcc catggtgaaa acgggggcga + 1321 agaagttgtc catattggcc acgtttaaat caaaactggt gaaactcacc cagggattgg + 1381 ctgagacgaa aaacatattc tcaataaacc ctttagggaa ataggccagg ttttcaccgt + 1441 aacacgccac atcttgcgaa tatatgtgta gaaactgccg gaaatcgtcg tggtattcac + 1501 tccagagcga tgaaaacgtt tcagtttgct catggaaaac ggtgtaacaa gggtgaacac + 1561 tatcccatat caccagctca ccgtctttca ttgccatacg aaattccgga tgagcattca + 1621 tcaggcgggc aagaatgtga ataaaggccg gataaaactt gtgcttattt ttctttacgg + 1681 tctttaaaaa ggccgtaata tccagctgaa cggtctggtt ataggtacat tgagcaactg + 1741 actgaaatgc ctcaaaatgt tctttacgat gccattggga tatatcaacg gtggtatatc + 1801 cagtgatttt tttctccatt ttagcttcct tagctcctga aaatctcgat aactcaaaaa + 1861 atacgcccgg tagtgatctt atttcattat ggtgaaagtt ggaacctctt acgtgcccga + 1921 tcaactcgag tgccacctga cgtctaagaa accattatta tcatgacatt aacctataaa + 1981 aataggcgta tcacgaggca gaatttcaga taaaaaaaat ccttagcttt cgctaaggat + 2041 gatttctgga attcgcggcc gcttctagag +// diff --git a/test/test_files/sbol3_genbank_conversion/ignoring_sbol_properties.nt b/test/test_files/sbol3_genbank_conversion/ignoring_sbol_properties.nt new file mode 100644 index 00000000..2fb9d1b0 --- /dev/null +++ b/test/test_files/sbol3_genbank_conversion/ignoring_sbol_properties.nt @@ -0,0 +1,267 @@ + + "constitutive promoter family member" . + "BBa_J23100" . + . + . + . + . + . + "BBa_J23100" . + "01-JAN-1980" . + "UNK" . + "BBa_J23100.1" . + "" . + "BBa_J23100" . + "DNA" . + "." . + "1"^^ . + "" . + "linear" . + . + "BBa_J23100_sequence" . + "ttgacggctagctcagtcctaggtacagtgctagc" . + . + . + . + "constitutive promoter family member" . + "BBa_J23102" . + . + . + . + . + . + "BBa_J23102" . + "01-JAN-1980" . + "UNK" . + "BBa_J23102.1" . + "" . + "BBa_J23102" . + "DNA" . + "." . + "1"^^ . 
+ "" . + "linear" . + . + "BBa_J23102_sequence" . + "ttgacagctagctcagtcctaggtactgtgctagc" . + . + . + . + "" . + "LmrA" . + . + . + . + . + . + "LmrA" . + "01-JAN-1980" . + "UNK" . + "LmrA.1" . + "" . + "LmrA" . + "DNA" . + "." . + "1"^^ . + "" . + "linear" . + . + "LmrA_sequence" . + "atgagctatggtgatagccgtgaaaaaattctgagcgcagcaacccgtctgtttcagctgcagggttattatggcaccggtctgaatcagattatcaaagaaagcggtgcaccgaaaggtagcctgtattatcattttccgggtggtaaagaacagctggcaattgaagcagtgaacgaaatgaaagaatatatccgccagaaaatcgccgattgtatggaagcatgtaccgatccggcagaaggtattcaggcatttctgaaagaactgagctgtcagtttagctgtaccgaagatattgaaggtctgccggttggtctgctggcagcagaaaccagcctgaaaagcgaaccgctgcgtgaagcatgtcatgaagcatataaagaatgggccagcgtgtatgaagaaaaactgcgtcagaccggttgtagcgaaagccgtgcaaaagaagcaagcaccgttgttaatgcaatgattgaaggtggtattctgctgagcctgaccgcaaaaaatagcacaccgctgctgcatattagcagctgtattccggatctgctgaaacgttaa" . + . + . + . + "Range1" . + "21"^^ . + . + . + "0"^^ . + . + "Range1" . + "2012"^^ . + . + . + "2008"^^ . + . + "SequenceFeature10" . + . + "end of terminator" . + . + . + . + "Range1" . + "2041"^^ . + . + . + "2021"^^ . + . + "SequenceFeature11" . + . + "Stem loop" . + . + . + . + "Range1" . + "2072"^^ . + . + . + "2050"^^ . + . + "SequenceFeature12" . + . + "BioBrick prefix" . + . + . + . + "SequenceFeature1" . + . + "BioBrick suffix" . + . + . + . + "Range1" . + "93"^^ . + . + . + "21"^^ . + . + "SequenceFeature2" . + . + "E. coli his operon terminator" . + . + . + . + "Range1" . + "64"^^ . + . + . + "29"^^ . + . + "SequenceFeature3" . + . + "Stem loop" . + . + . + . + "Range1" . + "176"^^ . + . + . + "156"^^ . + . + "SequenceFeature4" . + . + "Verification reverse (VR) primer binding site" . + . + . + . + "Range1" . + "875"^^ . + . + . + "260"^^ . + . + "SequenceFeature5" . + . + "rep (pMB1)" . + . + . + . + "Range1" . + "276"^^ . + . + . + "275"^^ . + . + "SequenceFeature6" . + . + "ORI" . + . + . + . + "Range1" . + "1147"^^ . + . + . + "1041"^^ . + . + "SequenceFeature7" . + . + "T0 terminator" . + . + . + . 
+ "Range1" . + "1819"^^ . + . + . + "1159"^^ . + . + "SequenceFeature8" . + . + "Chloramphenicol resistance" . + . + . + . + "Range1" . + "1952"^^ . + . + . + "1932"^^ . + . + "SequenceFeature9" . + . + "Verification forward (VF2) primer binding site" . + . + . + . + "High copy BioBrick assembly plasmid" . + "pSB1C3" . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + "pSB1C3" . + "01-JAN-1980" . + "UNK" . + "pSB1C3.1" . + "" . + "pSB1C3" . + "DNA" . + "." . + "1"^^ . + "" . + "linear" . + . + "pSB1C3_sequence" . + "tactagtagcggccgctgcagtccggcaaaaaagggcaaggtgtcaccaccctgccctttttctttaaaaccgaaaagattacttcgcgttatgcaggcttcctcgctcactgactcgctgcgctcggtcgttcggctgcggcgagcggtatcagctcactcaaaggcggtaatacggttatccacagaatcaggggataacgcaggaaagaacatgtgagcaaaaggccagcaaaaggccaggaaccgtaaaaaggccgcgttgctggcgtttttccacaggctccgcccccctgacgagcatcacaaaaatcgacgctcaagtcagaggtggcgaaacccgacaggactataaagataccaggcgtttccccctggaagctccctcgtgcgctctcctgttccgaccctgccgcttaccggatacctgtccgcctttctcccttcgggaagcgtggcgctttctcatagctcacgctgtaggtatctcagttcggtgtaggtcgttcgctccaagctgggctgtgtgcacgaaccccccgttcagcccgaccgctgcgccttatccggtaactatcgtcttgagtccaacccggtaagacacgacttatcgccactggcagcagccactggtaacaggattagcagagcgaggtatgtaggcggtgctacagagttcttgaagtggtggcctaactacggctacactagaagaacagtatttggtatctgcgctctgctgaagccagttaccttcggaaaaagagttggtagctcttgatccggcaaacaaaccaccgctggtagcggtggtttttttgtttgcaagcagcagattacgcgcagaaaaaaaggatctcaagaagatcctttgatcttttctacggggtctgacgctcagtggaacgaaaactcacgttaagggattttggtcatgagattatcaaaaaggatcttcacctagatccttttaaattaaaaatgaagttttaaatcaatctaaagtatatatgagtaaacttggtctgacagctcgaggcttggattctcaccaataaaaaacgcccggcggcaaccgagcgttctgaacaaatccagatggagttctgaggtcattactggatctatcaacaggagtccaagcgagctcgatatcaaattacgccccgccctgccactcatcgcagtactgttgtaattcattaagcattctgccgacatggaagccatcacaaacggcatgatgaacctgaatcgccagcggcatcagcaccttgtcgccttgcgtataatatttgcccatggtgaaaacgggggcgaagaagttgtccatattggccacgtttaaatcaaaactggtgaaactcacccagggattggctgagacgaaaaacatattctcaataaaccctttagggaaataggccaggttttcaccgtaacacgccacatcttgcg
aatatatgtgtagaaactgccggaaatcgtcgtggtattcactccagagcgatgaaaacgtttcagtttgctcatggaaaacggtgtaacaagggtgaacactatcccatatcaccagctcaccgtctttcattgccatacgaaattccggatgagcattcatcaggcgggcaagaatgtgaataaaggccggataaaacttgtgcttatttttctttacggtctttaaaaaggccgtaatatccagctgaacggtctggttataggtacattgagcaactgactgaaatgcctcaaaatgttctttacgatgccattgggatatatcaacggtggtatatccagtgatttttttctccattttagcttccttagctcctgaaaatctcgataactcaaaaaatacgcccggtagtgatcttatttcattatggtgaaagttggaacctcttacgtgcccgatcaactcgagtgccacctgacgtctaagaaaccattattatcatgacattaacctataaaaataggcgtatcacgaggcagaatttcagataaaaaaaatccttagctttcgctaaggatgatttctggaattcgcggccgcttctagag" . + . + . + . + . + "Collection to be ignored by GenBank export" . + "CompositeParts" . + . + . + . + "Collection to be ignored" . + . + "Multicolor_expression" . + . + "Combinatorial derivation to be ignored" . + . + . + . diff --git a/test/test_files/sbol3_genbank_conversion/multiple_feature_locations.gb b/test/test_files/sbol3_genbank_conversion/multiple_feature_locations.gb new file mode 100644 index 00000000..cb13533b --- /dev/null +++ b/test/test_files/sbol3_genbank_conversion/multiple_feature_locations.gb @@ -0,0 +1,123 @@ +LOCUS AF165912 5485 bp DNA linear UNK 01-JAN-1980 +DEFINITION (NOTE: This is a Modified version of the AF165912 GenBank File) + Arabidopsis thaliana CTP:phosphocholine cytidylyltransferase (CCT) + gene, complete cds. +ACCESSION AF165912 +VERSION AF165912.1 +KEYWORDS . +SOURCE . + ORGANISM . + . 
+FEATURES Location/Qualifiers + regulatory 1..1602 + /label="c3" + gene 1..4637 + /label="c2" + source 1..5485 + /label="c1" + regulatory 1554..1560 + /label="c4" + 5'UTR 1603..1712 + /label="c6" + CDS join(1713..1891,2322..2438,2538..2633,2801..2843, + 2918..3073,3167..3247,3874..3972,4082..4309) + /label="c7" + mRNA complement(join(1603..1891,2322..2438,2538..2633, + 2801..2843,2918..3073,3167..3247,3874..3972,4082..4637)) + /label="c5" + 3'UTR 4310..4637 + /label="c8" +ORIGIN + 1 ccagaatggt tactatggac atccgccaac catacaagct atggtgaaat gctttatcta + 61 tctcattttt agtttcaaag cttttgttat aacacatgca aatccatatc cgtaaccaat + 121 atccaatcgc ttgacatagt ctgatgaagt ttttggtagt taagataaag ctcgagactg + 181 atatttcata tactggatga tttagggaaa cttgcattct attcatgaac gaatgagtca + 241 atacgagaca caaccaagca tgcaaggagc tgtgagttga tgttctatgc tatttaagta + 301 tttttcggga gatatatata tcttattgtt ctcctcctcc cgagtcaagt tgttctaaga + 361 aagaaggatc tatttcattt tgtggattgt ctagtttcag ggacagacgg ggtttagggg + 421 aagcgctatc cgtggctgct atgacatcga agaaactctg cacgacatgg tatgtaatct + 481 tgtgacgtta gtaaaaacgc tctaatgtta caaaagaaag aaagagaaaa cgaacccaat + 541 tcctcaaaat gttttctttt gacaatgtca acttcttcct tctcgggttt ttatcagttt + 601 gattgaagcg agactgcgaa attcctctgt ttacagtaga aaatgtgatc agccctattt + 661 ataaccgttg tatgttttcc ggtttttgtt tgtgcagaca atggggtcct cacagtttca + 721 gggatctgat tcgagccatc ctagtgatca ccgcttatcc aattaacaga acagaacaag + 781 ctcaagagtt gctactttca tatctttaaa atagtggaat gttttgtatg tacagaaata + 841 ggaaaggtct aaagtgtgga actggctttg aggtaaactc ttactctgat tggatttgct + 901 tgtatttata ccggaatcat aatagaaata tatgattaaa gtattcacat tctctaatct + 961 tcttttagac ttgtagttac cattcaaaag tcatggacaa ccttcgttaa ccttgagggc + 1021 cactgagaga caaccttgga cttggtcact ggctcactta tgcgtgcctc gccaaagtta + 1081 atctatcgac tgagattggt taatctggga acaaaaatta agagagaaag aagtagaact + 1141 aaaaagcaat attcagttat tcactctgtt gtatatagct caccaattaa attcaaacaa + 1201 ctaattcaaa acattatact atagcttttc attaaaaaat ttccaaaaca ttcatttaat + 
1261 tatataaacc aaagaacagc ttttaagact taaaatattt cccaagtatc caacaagtca + 1321 atacagattt tttaagaaaa ctaaaccatt ttttcaactt tacaacaaaa acaccaactg + 1381 ttacaaaaaa actctcgaat ttcctatttc tccagcctta tgacaaagat atcagattaa + 1441 taaaatttag aattcattac tttttcttca tttttaaaat tatctacata ctatttattt + 1501 ctctccattt tattcagtga gaaaataaaa ttacaaatgc ctgaacacaa aaataaataa + 1561 aattagaata atcagtttcc tctagaggat attcttcgtc acaaaattaa aaaaaaaaag + 1621 agtaggagaa gagggaagcg actagcacct tttgtagttt tccgtttatt ttctgtataa + 1681 ggcgggtgat ttcggctcct tcatcggaaa ttatgagcaa cgttatcggc gatcgcactg + 1741 aagacggcct ttccaccgcc gctgcggcct ctggctctac ggctgtccag agttctcctc + 1801 ccactgatcg tcctgtccgc gtctacgccg atgggatcta cgatcttttc cactttggtc + 1861 atgctcgatc tctcgaacaa gccaaattag cgttagtatt tcttatctct tagagatgat + 1921 tgtcctgatt ttcatctcta attcgacttt tttttaccgt cgcgtgctca attttcgccg + 1981 ttccagtgtc atttttctct gatctgttga gtctggttca ttgtaagttg tacagttttt + 2041 gtttaggtcg agagacatat cttccttatt agatagtctc ggtgattgat tggtctgtat + 2101 tgattgaaat ctgtgatgtg caaggtcttg tcgcgtatga ttttagtgaa tccctttcta + 2161 aatgttgaaa tttgcaatag ctgatactgt ttctggatat atgttcttga cgaatgtttt + 2221 cgatttttta ttattttgag gaggtatgag agaaattgac ttctggtttc gtgttcttat + 2281 ggtgttgcta tgattgtgcc gtttcttaat cggccgagca ggtttccaaa caacacttac + 2341 cttcttgttg gatgttgcaa tgatgagact acccataagt acaagggaag gactgtaatg + 2401 actgcagaag agcgatatga atcacttcga cattgcaagt aattgttttc tcttatgttc + 2461 tgttgaatgt gttagtagaa aaacccatgg aagtggcagt gagtggaatt ttagaacacg + 2521 ttttttttat catgcaggtg ggtggatgaa gtcatccctg atgcaccatg ggtggtcaac + 2581 caggagtttc ttgacaagca ccagattgac tatgttgccc acgattctct tccgtaagaa + 2641 catgtgtctc ttgtgttagt ttttatttag ttttaaaaaa tggtgaaaac ttagttttgt + 2701 agtttttacc tttcacgacg tgcttgttgt tagtttagct cttttcttac aaatgatttt + 2761 agaactacaa taaccttctt tgtataattc tcatgcacag ctatgctgat tcaagcggag + 2821 ctggaaagga tgtctatgaa tttgtgagtc ggaagaattt tcatactcct gcttttgaca + 2881 ctttcatagt tctgttgtaa 
ctgagcatct gttgcaggtt aagaaagttg ggaggtttaa + 2941 ggaaacacag cgaactgaag gaatatcgac ctcggatata ataatgagaa tagtgaaaga + 3001 ttacaatcag tatgtcatgc gtaacttgga tagaggatac tcaagggaag atcttggagt + 3061 tagctttgtc aaggcatgtc atcattttct tatctctaca attttgtcct ttctcaaaaa + 3121 aaattcactt gtaagaatca actttggatt tgtcgatttg caacaggaaa agagacttag + 3181 agttaatatg aggctaaaga aactccagga gagggtcaaa gaacaacaag aaagagtggg + 3241 agaaaaggca tgtcttctct caacttcatt ttgcttaatt gatcattagt tcatcacaag + 3301 tccatcattt ggactgtatt gcattcaatc aaataaagct gttcatcata agttacaagg + 3361 agaaataact aaattttagg tcttgtctct gcctattcat tcacatctcc gcttgatctt + 3421 gtacctttga ctatttagcg actgtttgga aaccactctt aatgtgtcac gttttggagt + 3481 ctaacttgtc cttaatttga acctcgttca cttcttttag gactttaata ctctgtttgg + 3541 ttagtagcct ctaggcagaa aacatttgta tgtattgctt ttattttgtg tcttcttgtt + 3601 gtgattattg ggttatagaa ttgcatcaca aagtgatgct tgttaatccg ctgtagtagt + 3661 gccaggcgat atcatgttat ataatctcat ctcggtagta gcagccttat ctcgtgtatc + 3721 cgctgcgctt gaaacctcca tgcagtttca tgctttagct agtaatatga tatctgatga + 3781 gactaagttc atatgtgatt ctgaaaaagc tgattttgta gaagtttctt ataatgctcc + 3841 ttcctctgtt gttgttaaac ccggtttttc cagatccaaa ctgtaaaaat gctgcgcaac + 3901 gagtgggtag agaatgcaga tcgatgggtc gctggatttc ttgaaatatt tgaagaaggt + 3961 tgccataaga tggtaagttc aatcttgaag acacatacag tgcttcaaaa atctactaat + 4021 attcatgact atgttctgta taaccttgat taaacttgac aaatgcgtaa aatgttaaca + 4081 gggaactgca atcgtagaca gtatccaaga aaggttaatg agacaaaagt cggcagagag + 4141 gctggagaac ggtcaggatg atgacacaga cgaccagttc tatgaagaat acttcgatca + 4201 tgacatgggt agtgacgatg atgaagatga aaaattctac gacgaggaag aagtaaagga + 4261 agaagagaca gagaaaactg ttatgacgga tgctaaagac aacaagtaag aacaaatttg + 4321 gcttgcagaa acctcagatt agctctactt atggccactt ctactaaact cccttaagcc + 4381 tcgcactctc tctcgaaatt catctactta acatataata ccaatgttta gaaagagaga + 4441 gtgtgtgatg tgtttgtttg tgtgtgttga acaaacgaac gtgcgtggtt gtctttggtg + 4501 agttggtctc atctttgttg atttttgaat gcgcatgtat 
ttttttcttc tttttcatga + 4561 cgggcaaagt gttatgaagt acaatgcaat tgtctaaaac aggataagtc aatggttcgt + 4621 gtgtgccata aagtaaacat cgctgtgtac atcttccatg ttccaaactc aactcgtttt + 4681 cttcaaatat tgaaatacaa attggtcaaa agtcggttct tatttttttt ttaattcaca + 4741 tttttagttt gcagttttaa tagattacaa atcacatttt gtgctatttc caattccatg + 4801 agccggccaa gaatgtgagt aaaaggcaga taaagcaaag gatagccgat tgctttaaag + 4861 atgtctttgg taactagttc gaaattctct gtccactcga agactccaca actctcctct + 4921 caaatgtcag ctaatcaagt cctacacaac tatacaaaaa ggcaattaat tagtagaaaa + 4981 taaagattgg aggtttagct tctcccatac ataagtacct ttatgaatca ctaagctcag + 5041 ggtttatatg ataaccattg ctgatctgtg taaagagaag ttgatgaatt actacgtgag + 5101 tgttgttaac caactctctt tacatattag gaccgtgctt gtcaggccaa tggttttcac + 5161 ttcgaaaaat tgcttccgat atcaaactat gtgtacatta ttggtggact gtggacataa + 5221 cttaaacgca taattttatt gtgtaccttt aaaataaaca atagattaca catatatata + 5281 tggcaaatat ttgaacatta gatgtcaaga gaaaagtaaa acatgtcatg attacaccat + 5341 ctttgttatt atttagagtg attctcacta aatcttaggc ggttagcaac cgccatagtt + 5401 ttcaaaatct cattctatcg ggattaaatc tgtttttggt gactatatat aaacattggt + 5461 cgaattttta ggtaagtaaa atcag +// diff --git a/test/test_files/sbol3_genbank_conversion/test_dblink_property.gb b/test/test_files/sbol3_genbank_conversion/test_dblink_property.gb new file mode 100644 index 00000000..a5a1986b --- /dev/null +++ b/test/test_files/sbol3_genbank_conversion/test_dblink_property.gb @@ -0,0 +1,223 @@ +LOCUS JWYZ01000115 3242 bp DNA linear BCT 16-JAN-2015 +DEFINITION Escherichia coli strain CVM N37069PS N37069PS_contig_115, whole + genome shotgun sequence. +ACCESSION JWYZ01000115 +VERSION JWYZ01000115.1 +DBLINK BioProject: PRJNA266657 + BioSample: SAMN03177677 +KEYWORDS WGS. +SOURCE Escherichia coli + ORGANISM Escherichia coli + Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales; + Enterobacteriaceae; Escherichia. 
+REFERENCE 1 (bases 1 to 3242) + AUTHORS Tyson,G.H., McDermott,P.F., Li,C., Chen,Y., Tadesse,D.A., + Mukherjee,S., Bodeis-Jones,S., Kabera,C., Gaines,S.A., + Loneragan,G.H., Edrington,T.S., Torrence,M., Harhay,D.M. and Zhao,S. + TITLE WGS accurately predicts antimicrobial resistance in Escherichia coli + JOURNAL J. Antimicrob. Chemother. 70 (10), 2763-2769 (2015) + PUBMED 26142410 +REFERENCE 2 (bases 1 to 3242) + AUTHORS Tyson,G.H., McDermott,P.F., Li,C., Tadesse,D.A., Mukherjee,S., + Bodeis-Jones,S., Kabera,C., Gaines,S.A., Loneragan,G.H., + Edrington,T.S., Torrence,M., Harhay,D.M. and Zhao,S. + TITLE Direct Submission + JOURNAL Submitted (17-NOV-2014) CVM, FDA, 8401 Muirkirk Rd, Laurel, MD + 20708, USA +COMMENT ##Genome-Assembly-Data-START## + Assembly Method :: CLC Genomics Workbench v. 7.5 + Assembly Name :: Escherichia coli CVM N37069PS v1.0 + Genome Coverage :: 48.5x + Sequencing Technology :: Illumina MiSeq + ##Genome-Assembly-Data-END## + ##Genome-Annotation-Data-START## + Annotation Provider :: NCBI + Annotation Date :: 12/29/2014 14:07:05 + Annotation Pipeline :: NCBI Prokaryotic Genome Annotation Pipeline + Annotation Method :: Best-placed reference protein set; GeneMarkS+ + Annotation Software revision :: 2.9 (rev. 455303) + Features Annotated :: Gene; CDS; rRNA; tRNA; ncRNA; repeat_region + Genes :: 4,855 + CDS :: 4,642 + Pseudo Genes :: 107 + CRISPR Arrays :: 2 + rRNAs :: 11 (5S, 16S, 23S) + tRNAs :: 78 + ncRNA :: 17 + Frameshifted Genes :: 41 + ##Genome-Annotation-Data-END## + Annotation was added by the NCBI Prokaryotic Genome Annotation + Pipeline (released 2013). 
Information about the Pipeline can be + found here: http://www.ncbi.nlm.nih.gov/genome/annotation_prok/ +FEATURES Location/Qualifiers + gene complement(1..115) + /locus_tag="PU64_23660" + CDS complement(1..115) + /locus_tag="PU64_23660" + /inference="EXISTENCE: similar to AA + sequence:RefSeq:WP_005059815.1" + /note="Derived by automated computational analysis using + gene prediction method: Protein Homology." + /codon_start=1 + /transl_table=11 + /product="pyrBI operon leader peptide" + /protein_id="KIG36579.1" + /translation="MVQCVRHFVLPRLKKDAGLPFFFPLITHSQPLNRGAFF" + source 1..3242 + /organism="Escherichia coli" + /mol_type="genomic DNA" + /submitter_seqid="N37069PS_contig_115" + /strain="CVM N37069PS" + /isolation_source="Farm" + /db_xref="taxon:562" + /country="USA" + /collection_date="20-Jan-2012" + gene 95..427 + /locus_tag="PU64_23665" + CDS 95..427 + /locus_tag="PU64_23665" + /inference="EXISTENCE: similar to AA + sequence:RefSeq:WP_001349257.1" + /note="Derived by automated computational analysis using + gene prediction method: Protein Homology." + /codon_start=1 + /transl_table=11 + /product="hypothetical protein" + /protein_id="KIG36585.1" + /translation="MSNTLNHTSSRQIVRHYTHRQKRRKHLMQYFVSANGLFELKVKVY + AFLFDVILQGNCPSVSIIADIPCFFLFHFHAIRYAFYSIHPTYRAECESERLTLLLTAQ + GCALSL" + gene complement(396..791) + /locus_tag="PU64_23670" + CDS complement(396..791) + /locus_tag="PU64_23670" + /inference="EXISTENCE: similar to AA + sequence:RefSeq:WP_001701843.1" + /note="Derived by automated computational analysis using + gene prediction method: Protein Homology." 
+ /codon_start=1 + /transl_table=11 + /product="mRNA endoribonuclease" + /protein_id="KIG36580.1" + /translation="MVERTAVFPAGRHSLYAEHRYSAAIRSGDLLFVSGQVGSREDGTP + EPDFQQQVRLAFDNLHATLAAAGCTFDDIIDVTSFHTDPENQFEDIMTVKNEIFSAPPY + PNWTAVGVTWLAGFDFEIKVIARIPEQ" + gene complement(922..1635) + /locus_tag="PU64_23675" + CDS complement(922..1635) + /locus_tag="PU64_23675" + /inference="EXISTENCE: similar to AA + sequence:SwissProt:P39333.2" + /note="Derived by automated computational analysis using + gene prediction method: Protein Homology." + /codon_start=1 + /transl_table=11 + /product="oxidoreductase" + /protein_id="KIG36581.1" + /translation="MGAFTGKTVLILGGSRGIGAAIVRRFVTDGANVRFTYAGSKDAAK + RLAQETGATAVFTDSADRDAVIDVVRKSGALDILVVNAGIGVFGEALELNADDIDRLFK + INIHAPYHASVEAARQMPEGGRILIIGSVNGDRMPVAGMAAYAASKSALQGMARGLARD + FGPRGITINVVQPGPIDTDANPANGPMRDMLHSLMAIKRHGQPEEVAGMVAWLAGPEAS + FVTGAMHTIDGAFGA" + gene 1706..2299 + /locus_tag="PU64_23680" + CDS 1706..2299 + /locus_tag="PU64_23680" + /inference="EXISTENCE: similar to AA + sequence:RefSeq:WP_001544295.1" + /note="Derived by automated computational analysis using + gene prediction method: Protein Homology." + /codon_start=1 + /transl_table=11 + /product="TetR family transcriptional regulator" + /protein_id="KIG36582.1" + /translation="MVTKKQSRVPGRPRRFAPEQAISAAKVLFHQKGFDAVSVAEVTDY + LGINPPSLYAAFGSKAGLFSRVLNEYVGTEAIPLADILRDDRPVGECLVEVLKEAARRY + SQNGGCAGCMVLEGIHSHDPLARDIAVQYYHAAETTIYDYIARRHPQSAQCVTDFMSTV + MSGLSAKAREGHSIEQLCATAALAGEAIKTLLKE" + gene 2444..2896 + /locus_tag="PU64_23685" + CDS 2444..2896 + /locus_tag="PU64_23685" + /inference="EXISTENCE: similar to AA + sequence:RefSeq:WP_001570607.1" + /note="Derived by automated computational analysis using + gene prediction method: Protein Homology." 
+ /codon_start=1 + /transl_table=11 + /product="toxin-antitoxin biofilm protein TabA" + /protein_id="KIG36583.1" + /translation="MIIGNIHNLQPWLPQELRQAIEHIKAHVTAETPKGKHDIEGNHLF + YLISEDMTEPYEARRAEYHARYLDIQIVLKGQEGMTFSTQPAGTPDTDWLADKDIAFLP + EGVDEKTVILNEGDFVVFYPGEVHKPLCAVGAPAQVRKAVVKMLMA" + gene 3019..3242 + /locus_tag="PU64_23690" + CDS 3019..3242 + /locus_tag="PU64_23690" + /inference="EXISTENCE: similar to AA + sequence:RefSeq:WP_000036524.1" + /note="Derived by automated computational analysis using + gene prediction method: Protein Homology." + /codon_start=1 + /transl_table=11 + /product="DNA-binding protein" + /protein_id="KIG36584.1" + /translation="MSKISGWNFSQNITSADNCKQKNEDLDTWYVGMNDFARIAGGQNS + RSNILSPRAFLEFLAKIFTLGYVDFSKRS" +ORIGIN + 1 aaaaaaaagc ccctcgattg aggggctggg aatgggtgat caacgggaag aaaaacggca + 61 ggccagcgtc ttttttcaga cgcggtaaga caaaatgtcg aacacactga accatacatc + 121 ctcccggcaa attgtccggc attatactca tcgtcagaag cggcgcaagc atttgatgca + 181 atattttgtc agcgcaaacg gtttatttga attaaaagtc aaggtatatg catttttatt + 241 tgatgtgatt ctgcagggga actgtccttc ggtatcaata attgcagaca ttccctgctt + 301 tttccttttt cactttcacg caatcagata tgcattttat tccattcatc cgacttatag + 361 ggcggagtgt gaaagcgaac ggctaacact attgcttact gctcagggat gcgcgctatc + 421 actttaattt caaaatcaaa gcctgccagc catgtaacac ccaccgccgt ccagtttgga + 481 taaggtgggg cgctaaatat ttcatttttc accgtcatga tgtcttcaaa ttggttttct + 541 ggatcggtat ggaagctcgt aacatcaatg atatcgtcaa aagtgcatcc cgcagctgcc + 601 agggtcgcat gcaaattatc aaatgccagt ctgacttgtt gctgaaaatc gggttctggt + 661 gttccgtcct ctcgacttcc tacttgcccg gaaacaaaca gcaaatcgcc ggaacgaata + 721 gccgcagaat aacgatgctc agcatatagt gaatgtcggc cagcagggaa aacagcggtt + 781 ctttctacca tttggttatc ctcaagattt acgacatgaa cagaagattt ctctttaccg + 841 ggagccgctt ttagcggacg acgtgagtaa acaaaaccca gacatcatgg ataatggctg + 901 ggcttaattg agcgtagtcg gttatgcgcc aaacgcgcca tcaatggtat gcattgcgcc + 961 ggtaacaaaa ctggcttctg gccctgctaa ccatgcgacc ataccagcga cctcttccgg + 1021 ttgcccatgt cttttgatag 
ccatcaaact atgcaacata tcgcgcattg gcccgttggc + 1081 gggattagcg tcggtatcaa ttggccctgg ctggacgacg ttaatggtga tcccacgcgg + 1141 tccaaaatca cgggccagcc cgcgcgccat gccttgcagg gcagatttgc tggcggcata + 1201 agcagccatg cctgcaacag gcatacgatc gccattcacg gagccgatga ttaagatgcg + 1261 cccgccttcg ggcatctgcc gggcggcttc aacagaggca tgataaggag catgaatatt + 1321 gattttgaaa aggcgatcaa tatcgtcggc atttaattcc agggcctcgc caaagacgcc + 1381 aatacctgca tttaccacca ggatatccaa tgcgccgctc ttacgaacga catcaatgac + 1441 agcgtctctg tcagcactat ctgtgaatac tgctgtcgct ccagtctctt gtgccaggcg + 1501 tttagcggca tctttcgacc ccgcataggt gaatcgtaca ttggccccat cggtgacgaa + 1561 acgacgtacg atagcggcac cgataccacg actgccaccg aggatgagaa ctgtcttacc + 1621 tgtaaaagcg cccataagga ctccttgatt tattatgtaa catgcattac aaaactgttt + 1681 taactttctg tcaacatgtt ttgtaatggt cactaaaaaa caatctcgcg ttccaggtcg + 1741 tcccagacgt ttcgctcctg agcaggcaat ctctgcggca aaagtgcttt ttcaccaaaa + 1801 aggtttcgat gctgtcagtg ttgctgaagt tactgattat cttggtatta accccccgag + 1861 cctctacgcg gcttttggca gtaaagctgg gttatttagc cgtgtactca atgaatacgt + 1921 cggtacggaa gctattccgc ttgccgatat tcttcgtgat gatcgtccag taggcgagtg + 1981 cctggttgag gtattaaaag aagcggcgcg cagatatagc caaaacggcg gctgcgctgg + 2041 ctgtatggtt cttgaaggta ttcatagtca tgatccacta gcgcgtgata ttgccgttca + 2101 atattatcac gccgcagaaa cgaccattta tgactatatc gccaggcggc atccacaaag + 2161 cgcacaatgt gtgactgatt ttatgagtac cgtgatgtca gggctttctg cgaaggcacg + 2221 ggaggggcac tcaatcgaac aactctgtgc aacagctgca ctggcggggg aagcgataaa + 2281 aactcttctc aaggagtgat gctggccttg atccgaaagg cgggaacgcg cctgccgata + 2341 agttgtgata agacaataat tcacgcatta aggctagcgg aattgatcat cttttcgtat + 2401 aacgatagaa atgaaacgtt gttttaatta aggagtggaa aagatgatca tcggaaatat + 2461 tcataatctt cagccgtggc taccccagga gttacgccag gcgattgagc atatcaaagc + 2521 acacgttacg gcagaaacgc caaagggcaa gcacgatatc gaaggcaatc atctgtttta + 2581 tcttatctcg gaagatatga ccgagccgta cgaagcccgc cgtgcggagt accatgcccg + 2641 ctatctcgac attcagattg tgttaaaagg tcaggaaggc 
atgaccttca gcacgcaacc + 2701 tgcaggcacg ccggataccg actggttagc tgataaagac atcgcatttt tgccggaagg + 2761 cgttgatgag aaaacagtta tccttaatga aggtgatttt gttgtgtttt atccggggga + 2821 agtgcataaa ccgctgtgcg cagtgggtgc accagcccag gttcgcaaag cagtagtgaa + 2881 gatgctgatg gcgtgatgac ttttcgccgt aaataactcc aggtttacgg cgagtttgtg + 2941 aaaagagcgt tttttgatat ttttttgtga gtaaaatttg taatgcttag acgttcttat + 3001 taactcaagg agtccgtcat gagcaaaata tcaggttgga atttttctca aaacattaca + 3061 tcagccgaca attgtaaaca aaaaaatgaa gacttagata cctggtatgt gggaatgaat + 3121 gattttgccc gaattgccgg agggcagaat agcagaagca atattctttc tcccagagca + 3181 tttttggagt ttttggctaa gatatttacc ctgggttatg tggattttag caaacgctcc + 3241 aa +// diff --git a/test/test_files/sbol3_genbank_conversion/test_extra_properties.gb b/test/test_files/sbol3_genbank_conversion/test_extra_properties.gb new file mode 100644 index 00000000..a6ec17c6 --- /dev/null +++ b/test/test_files/sbol3_genbank_conversion/test_extra_properties.gb @@ -0,0 +1,124 @@ +LOCUS AF165912 5485 bp DNA circular PLN 29-JUL-1999 +DEFINITION (NOTE: This is a Modified version of the AF165912 GenBank File) + Arabidopsis thaliana CTP:phosphocholine cytidylyltransferase (CCT) + gene, complete cds. +ACCESSION AF165912 +VERSION AF165912.1 GI:5640000 +KEYWORDS test_keyword1; test_keyword2. +SOURCE Arabidopsis thaliana (thale cress) + ORGANISM Arabidopsis thaliana + Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; + Spermatophyta; Magnoliophyta; eudicotyledons; Gunneridae; + Pentapetalae; rosids; malvids; Brassicales; Brassicaceae; + Camelineae; Arabidopsis. 
+FEATURES Location/Qualifiers + regulatory 1..1602 + /label="c3" + gene 1..4637 + /label="c2" + source 1..5485 + /label="c1" + regulatory 1554..1560 + /label="c4" + 5'UTR 1603..1712 + /label="c6" + mRNA complement(1603..4637) + /label="c5" + CDS 1713..4309 + /label="c7" + 3'UTR 4310..4637 + /label="c8" +ORIGIN + 1 ccagaatggt tactatggac atccgccaac catacaagct atggtgaaat gctttatcta + 61 tctcattttt agtttcaaag cttttgttat aacacatgca aatccatatc cgtaaccaat + 121 atccaatcgc ttgacatagt ctgatgaagt ttttggtagt taagataaag ctcgagactg + 181 atatttcata tactggatga tttagggaaa cttgcattct attcatgaac gaatgagtca + 241 atacgagaca caaccaagca tgcaaggagc tgtgagttga tgttctatgc tatttaagta + 301 tttttcggga gatatatata tcttattgtt ctcctcctcc cgagtcaagt tgttctaaga + 361 aagaaggatc tatttcattt tgtggattgt ctagtttcag ggacagacgg ggtttagggg + 421 aagcgctatc cgtggctgct atgacatcga agaaactctg cacgacatgg tatgtaatct + 481 tgtgacgtta gtaaaaacgc tctaatgtta caaaagaaag aaagagaaaa cgaacccaat + 541 tcctcaaaat gttttctttt gacaatgtca acttcttcct tctcgggttt ttatcagttt + 601 gattgaagcg agactgcgaa attcctctgt ttacagtaga aaatgtgatc agccctattt + 661 ataaccgttg tatgttttcc ggtttttgtt tgtgcagaca atggggtcct cacagtttca + 721 gggatctgat tcgagccatc ctagtgatca ccgcttatcc aattaacaga acagaacaag + 781 ctcaagagtt gctactttca tatctttaaa atagtggaat gttttgtatg tacagaaata + 841 ggaaaggtct aaagtgtgga actggctttg aggtaaactc ttactctgat tggatttgct + 901 tgtatttata ccggaatcat aatagaaata tatgattaaa gtattcacat tctctaatct + 961 tcttttagac ttgtagttac cattcaaaag tcatggacaa ccttcgttaa ccttgagggc + 1021 cactgagaga caaccttgga cttggtcact ggctcactta tgcgtgcctc gccaaagtta + 1081 atctatcgac tgagattggt taatctggga acaaaaatta agagagaaag aagtagaact + 1141 aaaaagcaat attcagttat tcactctgtt gtatatagct caccaattaa attcaaacaa + 1201 ctaattcaaa acattatact atagcttttc attaaaaaat ttccaaaaca ttcatttaat + 1261 tatataaacc aaagaacagc ttttaagact taaaatattt cccaagtatc caacaagtca + 1321 atacagattt tttaagaaaa ctaaaccatt ttttcaactt tacaacaaaa acaccaactg + 1381 ttacaaaaaa actctcgaat 
ttcctatttc tccagcctta tgacaaagat atcagattaa + 1441 taaaatttag aattcattac tttttcttca tttttaaaat tatctacata ctatttattt + 1501 ctctccattt tattcagtga gaaaataaaa ttacaaatgc ctgaacacaa aaataaataa + 1561 aattagaata atcagtttcc tctagaggat attcttcgtc acaaaattaa aaaaaaaaag + 1621 agtaggagaa gagggaagcg actagcacct tttgtagttt tccgtttatt ttctgtataa + 1681 ggcgggtgat ttcggctcct tcatcggaaa ttatgagcaa cgttatcggc gatcgcactg + 1741 aagacggcct ttccaccgcc gctgcggcct ctggctctac ggctgtccag agttctcctc + 1801 ccactgatcg tcctgtccgc gtctacgccg atgggatcta cgatcttttc cactttggtc + 1861 atgctcgatc tctcgaacaa gccaaattag cgttagtatt tcttatctct tagagatgat + 1921 tgtcctgatt ttcatctcta attcgacttt tttttaccgt cgcgtgctca attttcgccg + 1981 ttccagtgtc atttttctct gatctgttga gtctggttca ttgtaagttg tacagttttt + 2041 gtttaggtcg agagacatat cttccttatt agatagtctc ggtgattgat tggtctgtat + 2101 tgattgaaat ctgtgatgtg caaggtcttg tcgcgtatga ttttagtgaa tccctttcta + 2161 aatgttgaaa tttgcaatag ctgatactgt ttctggatat atgttcttga cgaatgtttt + 2221 cgatttttta ttattttgag gaggtatgag agaaattgac ttctggtttc gtgttcttat + 2281 ggtgttgcta tgattgtgcc gtttcttaat cggccgagca ggtttccaaa caacacttac + 2341 cttcttgttg gatgttgcaa tgatgagact acccataagt acaagggaag gactgtaatg + 2401 actgcagaag agcgatatga atcacttcga cattgcaagt aattgttttc tcttatgttc + 2461 tgttgaatgt gttagtagaa aaacccatgg aagtggcagt gagtggaatt ttagaacacg + 2521 ttttttttat catgcaggtg ggtggatgaa gtcatccctg atgcaccatg ggtggtcaac + 2581 caggagtttc ttgacaagca ccagattgac tatgttgccc acgattctct tccgtaagaa + 2641 catgtgtctc ttgtgttagt ttttatttag ttttaaaaaa tggtgaaaac ttagttttgt + 2701 agtttttacc tttcacgacg tgcttgttgt tagtttagct cttttcttac aaatgatttt + 2761 agaactacaa taaccttctt tgtataattc tcatgcacag ctatgctgat tcaagcggag + 2821 ctggaaagga tgtctatgaa tttgtgagtc ggaagaattt tcatactcct gcttttgaca + 2881 ctttcatagt tctgttgtaa ctgagcatct gttgcaggtt aagaaagttg ggaggtttaa + 2941 ggaaacacag cgaactgaag gaatatcgac ctcggatata ataatgagaa tagtgaaaga + 3001 ttacaatcag tatgtcatgc gtaacttgga tagaggatac 
tcaagggaag atcttggagt + 3061 tagctttgtc aaggcatgtc atcattttct tatctctaca attttgtcct ttctcaaaaa + 3121 aaattcactt gtaagaatca actttggatt tgtcgatttg caacaggaaa agagacttag + 3181 agttaatatg aggctaaaga aactccagga gagggtcaaa gaacaacaag aaagagtggg + 3241 agaaaaggca tgtcttctct caacttcatt ttgcttaatt gatcattagt tcatcacaag + 3301 tccatcattt ggactgtatt gcattcaatc aaataaagct gttcatcata agttacaagg + 3361 agaaataact aaattttagg tcttgtctct gcctattcat tcacatctcc gcttgatctt + 3421 gtacctttga ctatttagcg actgtttgga aaccactctt aatgtgtcac gttttggagt + 3481 ctaacttgtc cttaatttga acctcgttca cttcttttag gactttaata ctctgtttgg + 3541 ttagtagcct ctaggcagaa aacatttgta tgtattgctt ttattttgtg tcttcttgtt + 3601 gtgattattg ggttatagaa ttgcatcaca aagtgatgct tgttaatccg ctgtagtagt + 3661 gccaggcgat atcatgttat ataatctcat ctcggtagta gcagccttat ctcgtgtatc + 3721 cgctgcgctt gaaacctcca tgcagtttca tgctttagct agtaatatga tatctgatga + 3781 gactaagttc atatgtgatt ctgaaaaagc tgattttgta gaagtttctt ataatgctcc + 3841 ttcctctgtt gttgttaaac ccggtttttc cagatccaaa ctgtaaaaat gctgcgcaac + 3901 gagtgggtag agaatgcaga tcgatgggtc gctggatttc ttgaaatatt tgaagaaggt + 3961 tgccataaga tggtaagttc aatcttgaag acacatacag tgcttcaaaa atctactaat + 4021 attcatgact atgttctgta taaccttgat taaacttgac aaatgcgtaa aatgttaaca + 4081 gggaactgca atcgtagaca gtatccaaga aaggttaatg agacaaaagt cggcagagag + 4141 gctggagaac ggtcaggatg atgacacaga cgaccagttc tatgaagaat acttcgatca + 4201 tgacatgggt agtgacgatg atgaagatga aaaattctac gacgaggaag aagtaaagga + 4261 agaagagaca gagaaaactg ttatgacgga tgctaaagac aacaagtaag aacaaatttg + 4321 gcttgcagaa acctcagatt agctctactt atggccactt ctactaaact cccttaagcc + 4381 tcgcactctc tctcgaaatt catctactta acatataata ccaatgttta gaaagagaga + 4441 gtgtgtgatg tgtttgtttg tgtgtgttga acaaacgaac gtgcgtggtt gtctttggtg + 4501 agttggtctc atctttgttg atttttgaat gcgcatgtat ttttttcttc tttttcatga + 4561 cgggcaaagt gttatgaagt acaatgcaat tgtctaaaac aggataagtc aatggttcgt + 4621 gtgtgccata aagtaaacat cgctgtgtac atcttccatg ttccaaactc aactcgtttt + 4681 
cttcaaatat tgaaatacaa attggtcaaa agtcggttct tatttttttt ttaattcaca + 4741 tttttagttt gcagttttaa tagattacaa atcacatttt gtgctatttc caattccatg + 4801 agccggccaa gaatgtgagt aaaaggcaga taaagcaaag gatagccgat tgctttaaag + 4861 atgtctttgg taactagttc gaaattctct gtccactcga agactccaca actctcctct + 4921 caaatgtcag ctaatcaagt cctacacaac tatacaaaaa ggcaattaat tagtagaaaa + 4981 taaagattgg aggtttagct tctcccatac ataagtacct ttatgaatca ctaagctcag + 5041 ggtttatatg ataaccattg ctgatctgtg taaagagaag ttgatgaatt actacgtgag + 5101 tgttgttaac caactctctt tacatattag gaccgtgctt gtcaggccaa tggttttcac + 5161 ttcgaaaaat tgcttccgat atcaaactat gtgtacatta ttggtggact gtggacataa + 5221 cttaaacgca taattttatt gtgtaccttt aaaataaaca atagattaca catatatata + 5281 tggcaaatat ttgaacatta gatgtcaaga gaaaagtaaa acatgtcatg attacaccat + 5341 ctttgttatt atttagagtg attctcacta aatcttaggc ggttagcaac cgccatagtt + 5401 ttcaaaatct cattctatcg ggattaaatc tgtttttggt gactatatat aaacattggt + 5461 cgaattttta ggtaagtaaa atcag +// diff --git a/test/test_files/sbol3_genbank_conversion/test_extra_properties_with_references.gb b/test/test_files/sbol3_genbank_conversion/test_extra_properties_with_references.gb new file mode 100644 index 00000000..d8932581 --- /dev/null +++ b/test/test_files/sbol3_genbank_conversion/test_extra_properties_with_references.gb @@ -0,0 +1,134 @@ +LOCUS AF165912 5485 bp DNA circular PLN 29-JUL-1999 +DEFINITION (NOTE: This is a Modified version of the AF165912 GenBank File) + Arabidopsis thaliana CTP:phosphocholine cytidylyltransferase (CCT) + gene, complete cds. +ACCESSION AF165912 +VERSION AF165912.1 GI:5640000 +KEYWORDS test_keyword1; test_keyword2. +SOURCE Arabidopsis thaliana (thale cress) + ORGANISM Arabidopsis thaliana + Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; + Spermatophyta; Magnoliophyta; eudicotyledons; Gunneridae; + Pentapetalae; rosids; malvids; Brassicales; Brassicaceae; + Camelineae; Arabidopsis. +REFERENCE 1 (bases 1 to 5485) + AUTHORS Choi,Y.H., Choi,S.B. 
and Cho,S.H. + TITLE Structure of a CTP:Phosphocholine Cytidylyltransferase Gene from + Arabidopsis thaliana + JOURNAL Unpublished +REFERENCE 2 (bases 1 to 5485) + AUTHORS Choi,Y.H., Choi,S.B. and Cho,S.H. + TITLE Direct Submission + JOURNAL Submitted (06-JUL-1999) Biology, Inha University, Yonghyon-Dong 253, + Inchon 402-751, Korea +FEATURES Location/Qualifiers + regulatory 1..1602 + /label="c3" + gene 1..4637 + /label="c2" + source 1..5485 + /label="c1" + regulatory 1554..1560 + /label="c4" + 5'UTR 1603..1712 + /label="c6" + mRNA complement(1603..4637) + /label="c5" + CDS 1713..4309 + /label="c7" + 3'UTR 4310..4637 + /label="c8" +ORIGIN + 1 ccagaatggt tactatggac atccgccaac catacaagct atggtgaaat gctttatcta + 61 tctcattttt agtttcaaag cttttgttat aacacatgca aatccatatc cgtaaccaat + 121 atccaatcgc ttgacatagt ctgatgaagt ttttggtagt taagataaag ctcgagactg + 181 atatttcata tactggatga tttagggaaa cttgcattct attcatgaac gaatgagtca + 241 atacgagaca caaccaagca tgcaaggagc tgtgagttga tgttctatgc tatttaagta + 301 tttttcggga gatatatata tcttattgtt ctcctcctcc cgagtcaagt tgttctaaga + 361 aagaaggatc tatttcattt tgtggattgt ctagtttcag ggacagacgg ggtttagggg + 421 aagcgctatc cgtggctgct atgacatcga agaaactctg cacgacatgg tatgtaatct + 481 tgtgacgtta gtaaaaacgc tctaatgtta caaaagaaag aaagagaaaa cgaacccaat + 541 tcctcaaaat gttttctttt gacaatgtca acttcttcct tctcgggttt ttatcagttt + 601 gattgaagcg agactgcgaa attcctctgt ttacagtaga aaatgtgatc agccctattt + 661 ataaccgttg tatgttttcc ggtttttgtt tgtgcagaca atggggtcct cacagtttca + 721 gggatctgat tcgagccatc ctagtgatca ccgcttatcc aattaacaga acagaacaag + 781 ctcaagagtt gctactttca tatctttaaa atagtggaat gttttgtatg tacagaaata + 841 ggaaaggtct aaagtgtgga actggctttg aggtaaactc ttactctgat tggatttgct + 901 tgtatttata ccggaatcat aatagaaata tatgattaaa gtattcacat tctctaatct + 961 tcttttagac ttgtagttac cattcaaaag tcatggacaa ccttcgttaa ccttgagggc + 1021 cactgagaga caaccttgga cttggtcact ggctcactta tgcgtgcctc gccaaagtta + 1081 atctatcgac tgagattggt taatctggga acaaaaatta 
agagagaaag aagtagaact + 1141 aaaaagcaat attcagttat tcactctgtt gtatatagct caccaattaa attcaaacaa + 1201 ctaattcaaa acattatact atagcttttc attaaaaaat ttccaaaaca ttcatttaat + 1261 tatataaacc aaagaacagc ttttaagact taaaatattt cccaagtatc caacaagtca + 1321 atacagattt tttaagaaaa ctaaaccatt ttttcaactt tacaacaaaa acaccaactg + 1381 ttacaaaaaa actctcgaat ttcctatttc tccagcctta tgacaaagat atcagattaa + 1441 taaaatttag aattcattac tttttcttca tttttaaaat tatctacata ctatttattt + 1501 ctctccattt tattcagtga gaaaataaaa ttacaaatgc ctgaacacaa aaataaataa + 1561 aattagaata atcagtttcc tctagaggat attcttcgtc acaaaattaa aaaaaaaaag + 1621 agtaggagaa gagggaagcg actagcacct tttgtagttt tccgtttatt ttctgtataa + 1681 ggcgggtgat ttcggctcct tcatcggaaa ttatgagcaa cgttatcggc gatcgcactg + 1741 aagacggcct ttccaccgcc gctgcggcct ctggctctac ggctgtccag agttctcctc + 1801 ccactgatcg tcctgtccgc gtctacgccg atgggatcta cgatcttttc cactttggtc + 1861 atgctcgatc tctcgaacaa gccaaattag cgttagtatt tcttatctct tagagatgat + 1921 tgtcctgatt ttcatctcta attcgacttt tttttaccgt cgcgtgctca attttcgccg + 1981 ttccagtgtc atttttctct gatctgttga gtctggttca ttgtaagttg tacagttttt + 2041 gtttaggtcg agagacatat cttccttatt agatagtctc ggtgattgat tggtctgtat + 2101 tgattgaaat ctgtgatgtg caaggtcttg tcgcgtatga ttttagtgaa tccctttcta + 2161 aatgttgaaa tttgcaatag ctgatactgt ttctggatat atgttcttga cgaatgtttt + 2221 cgatttttta ttattttgag gaggtatgag agaaattgac ttctggtttc gtgttcttat + 2281 ggtgttgcta tgattgtgcc gtttcttaat cggccgagca ggtttccaaa caacacttac + 2341 cttcttgttg gatgttgcaa tgatgagact acccataagt acaagggaag gactgtaatg + 2401 actgcagaag agcgatatga atcacttcga cattgcaagt aattgttttc tcttatgttc + 2461 tgttgaatgt gttagtagaa aaacccatgg aagtggcagt gagtggaatt ttagaacacg + 2521 ttttttttat catgcaggtg ggtggatgaa gtcatccctg atgcaccatg ggtggtcaac + 2581 caggagtttc ttgacaagca ccagattgac tatgttgccc acgattctct tccgtaagaa + 2641 catgtgtctc ttgtgttagt ttttatttag ttttaaaaaa tggtgaaaac ttagttttgt + 2701 agtttttacc tttcacgacg tgcttgttgt tagtttagct cttttcttac aaatgatttt + 2761 
agaactacaa taaccttctt tgtataattc tcatgcacag ctatgctgat tcaagcggag + 2821 ctggaaagga tgtctatgaa tttgtgagtc ggaagaattt tcatactcct gcttttgaca + 2881 ctttcatagt tctgttgtaa ctgagcatct gttgcaggtt aagaaagttg ggaggtttaa + 2941 ggaaacacag cgaactgaag gaatatcgac ctcggatata ataatgagaa tagtgaaaga + 3001 ttacaatcag tatgtcatgc gtaacttgga tagaggatac tcaagggaag atcttggagt + 3061 tagctttgtc aaggcatgtc atcattttct tatctctaca attttgtcct ttctcaaaaa + 3121 aaattcactt gtaagaatca actttggatt tgtcgatttg caacaggaaa agagacttag + 3181 agttaatatg aggctaaaga aactccagga gagggtcaaa gaacaacaag aaagagtggg + 3241 agaaaaggca tgtcttctct caacttcatt ttgcttaatt gatcattagt tcatcacaag + 3301 tccatcattt ggactgtatt gcattcaatc aaataaagct gttcatcata agttacaagg + 3361 agaaataact aaattttagg tcttgtctct gcctattcat tcacatctcc gcttgatctt + 3421 gtacctttga ctatttagcg actgtttgga aaccactctt aatgtgtcac gttttggagt + 3481 ctaacttgtc cttaatttga acctcgttca cttcttttag gactttaata ctctgtttgg + 3541 ttagtagcct ctaggcagaa aacatttgta tgtattgctt ttattttgtg tcttcttgtt + 3601 gtgattattg ggttatagaa ttgcatcaca aagtgatgct tgttaatccg ctgtagtagt + 3661 gccaggcgat atcatgttat ataatctcat ctcggtagta gcagccttat ctcgtgtatc + 3721 cgctgcgctt gaaacctcca tgcagtttca tgctttagct agtaatatga tatctgatga + 3781 gactaagttc atatgtgatt ctgaaaaagc tgattttgta gaagtttctt ataatgctcc + 3841 ttcctctgtt gttgttaaac ccggtttttc cagatccaaa ctgtaaaaat gctgcgcaac + 3901 gagtgggtag agaatgcaga tcgatgggtc gctggatttc ttgaaatatt tgaagaaggt + 3961 tgccataaga tggtaagttc aatcttgaag acacatacag tgcttcaaaa atctactaat + 4021 attcatgact atgttctgta taaccttgat taaacttgac aaatgcgtaa aatgttaaca + 4081 gggaactgca atcgtagaca gtatccaaga aaggttaatg agacaaaagt cggcagagag + 4141 gctggagaac ggtcaggatg atgacacaga cgaccagttc tatgaagaat acttcgatca + 4201 tgacatgggt agtgacgatg atgaagatga aaaattctac gacgaggaag aagtaaagga + 4261 agaagagaca gagaaaactg ttatgacgga tgctaaagac aacaagtaag aacaaatttg + 4321 gcttgcagaa acctcagatt agctctactt atggccactt ctactaaact cccttaagcc + 4381 tcgcactctc tctcgaaatt 
catctactta acatataata ccaatgttta gaaagagaga + 4441 gtgtgtgatg tgtttgtttg tgtgtgttga acaaacgaac gtgcgtggtt gtctttggtg + 4501 agttggtctc atctttgttg atttttgaat gcgcatgtat ttttttcttc tttttcatga + 4561 cgggcaaagt gttatgaagt acaatgcaat tgtctaaaac aggataagtc aatggttcgt + 4621 gtgtgccata aagtaaacat cgctgtgtac atcttccatg ttccaaactc aactcgtttt + 4681 cttcaaatat tgaaatacaa attggtcaaa agtcggttct tatttttttt ttaattcaca + 4741 tttttagttt gcagttttaa tagattacaa atcacatttt gtgctatttc caattccatg + 4801 agccggccaa gaatgtgagt aaaaggcaga taaagcaaag gatagccgat tgctttaaag + 4861 atgtctttgg taactagttc gaaattctct gtccactcga agactccaca actctcctct + 4921 caaatgtcag ctaatcaagt cctacacaac tatacaaaaa ggcaattaat tagtagaaaa + 4981 taaagattgg aggtttagct tctcccatac ataagtacct ttatgaatca ctaagctcag + 5041 ggtttatatg ataaccattg ctgatctgtg taaagagaag ttgatgaatt actacgtgag + 5101 tgttgttaac caactctctt tacatattag gaccgtgctt gtcaggccaa tggttttcac + 5161 ttcgaaaaat tgcttccgat atcaaactat gtgtacatta ttggtggact gtggacataa + 5221 cttaaacgca taattttatt gtgtaccttt aaaataaaca atagattaca catatatata + 5281 tggcaaatat ttgaacatta gatgtcaaga gaaaagtaaa acatgtcatg attacaccat + 5341 ctttgttatt atttagagtg attctcacta aatcttaggc ggttagcaac cgccatagtt + 5401 ttcaaaatct cattctatcg ggattaaatc tgtttttggt gactatatat aaacattggt + 5461 cgaattttta ggtaagtaaa atcag +// diff --git a/test/test_files/sbol3_genbank_conversion/test_location_types.gb b/test/test_files/sbol3_genbank_conversion/test_location_types.gb new file mode 100644 index 00000000..30b1be91 --- /dev/null +++ b/test/test_files/sbol3_genbank_conversion/test_location_types.gb @@ -0,0 +1,223 @@ +LOCUS JWYZ01000115 3242 bp DNA linear BCT 16-JAN-2015 +DEFINITION Escherichia coli strain CVM N37069PS N37069PS_contig_115, whole + genome shotgun sequence. +ACCESSION JWYZ01000115 +VERSION JWYZ01000115.1 +DBLINK BioProject: PRJNA266657 + BioSample: SAMN03177677 +KEYWORDS WGS. 
+SOURCE Escherichia coli + ORGANISM Escherichia coli + Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales; + Enterobacteriaceae; Escherichia. +REFERENCE 1 (bases 1 to 3242) + AUTHORS Tyson,G.H., McDermott,P.F., Li,C., Chen,Y., Tadesse,D.A., + Mukherjee,S., Bodeis-Jones,S., Kabera,C., Gaines,S.A., + Loneragan,G.H., Edrington,T.S., Torrence,M., Harhay,D.M. and Zhao,S. + TITLE WGS accurately predicts antimicrobial resistance in Escherichia coli + JOURNAL J. Antimicrob. Chemother. 70 (10), 2763-2769 (2015) + PUBMED 26142410 +REFERENCE 2 (bases 1 to 3242) + AUTHORS Tyson,G.H., McDermott,P.F., Li,C., Tadesse,D.A., Mukherjee,S., + Bodeis-Jones,S., Kabera,C., Gaines,S.A., Loneragan,G.H., + Edrington,T.S., Torrence,M., Harhay,D.M. and Zhao,S. + TITLE Direct Submission + JOURNAL Submitted (17-NOV-2014) CVM, FDA, 8401 Muirkirk Rd, Laurel, MD + 20708, USA +COMMENT ##Genome-Assembly-Data-START## + Assembly Method :: CLC Genomics Workbench v. 7.5 + Assembly Name :: Escherichia coli CVM N37069PS v1.0 + Genome Coverage :: 48.5x + Sequencing Technology :: Illumina MiSeq + ##Genome-Assembly-Data-END## + ##Genome-Annotation-Data-START## + Annotation Provider :: NCBI + Annotation Date :: 12/29/2014 14:07:05 + Annotation Pipeline :: NCBI Prokaryotic Genome Annotation Pipeline + Annotation Method :: Best-placed reference protein set; GeneMarkS+ + Annotation Software revision :: 2.9 (rev. 455303) + Features Annotated :: Gene; CDS; rRNA; tRNA; ncRNA; repeat_region + Genes :: 4,855 + CDS :: 4,642 + Pseudo Genes :: 107 + CRISPR Arrays :: 2 + rRNAs :: 11 (5S, 16S, 23S) + tRNAs :: 78 + ncRNA :: 17 + Frameshifted Genes :: 41 + ##Genome-Annotation-Data-END## + Annotation was added by the NCBI Prokaryotic Genome Annotation + Pipeline (released 2013). 
Information about the Pipeline can be + found here: http://www.ncbi.nlm.nih.gov/genome/annotation_prok/ +FEATURES Location/Qualifiers + gene complement(<1..115) + /locus_tag="PU64_23660" + CDS complement(<1..115) + /locus_tag="PU64_23660" + /inference="EXISTENCE: similar to AA + sequence:RefSeq:WP_005059815.1" + /note="Derived by automated computational analysis using + gene prediction method: Protein Homology." + /codon_start=1 + /transl_table=11 + /product="pyrBI operon leader peptide" + /protein_id="KIG36579.1" + /translation="MVQCVRHFVLPRLKKDAGLPFFFPLITHSQPLNRGAFF" + source 1..3242 + /organism="Escherichia coli" + /mol_type="genomic DNA" + /submitter_seqid="N37069PS_contig_115" + /strain="CVM N37069PS" + /isolation_source="Farm" + /db_xref="taxon:562" + /country="USA" + /collection_date="20-Jan-2012" + gene 95..427 + /locus_tag="PU64_23665" + CDS 95..427 + /locus_tag="PU64_23665" + /inference="EXISTENCE: similar to AA + sequence:RefSeq:WP_001349257.1" + /note="Derived by automated computational analysis using + gene prediction method: Protein Homology." + /codon_start=1 + /transl_table=11 + /product="hypothetical protein" + /protein_id="KIG36585.1" + /translation="MSNTLNHTSSRQIVRHYTHRQKRRKHLMQYFVSANGLFELKVKVY + AFLFDVILQGNCPSVSIIADIPCFFLFHFHAIRYAFYSIHPTYRAECESERLTLLLTAQ + GCALSL" + gene complement(396..791) + /locus_tag="PU64_23670" + CDS complement(396..791) + /locus_tag="PU64_23670" + /inference="EXISTENCE: similar to AA + sequence:RefSeq:WP_001701843.1" + /note="Derived by automated computational analysis using + gene prediction method: Protein Homology." 
+ /codon_start=1 + /transl_table=11 + /product="mRNA endoribonuclease" + /protein_id="KIG36580.1" + /translation="MVERTAVFPAGRHSLYAEHRYSAAIRSGDLLFVSGQVGSREDGTP + EPDFQQQVRLAFDNLHATLAAAGCTFDDIIDVTSFHTDPENQFEDIMTVKNEIFSAPPY + PNWTAVGVTWLAGFDFEIKVIARIPEQ" + gene complement(922..1635) + /locus_tag="PU64_23675" + CDS complement(922..1635) + /locus_tag="PU64_23675" + /inference="EXISTENCE: similar to AA + sequence:SwissProt:P39333.2" + /note="Derived by automated computational analysis using + gene prediction method: Protein Homology." + /codon_start=1 + /transl_table=11 + /product="oxidoreductase" + /protein_id="KIG36581.1" + /translation="MGAFTGKTVLILGGSRGIGAAIVRRFVTDGANVRFTYAGSKDAAK + RLAQETGATAVFTDSADRDAVIDVVRKSGALDILVVNAGIGVFGEALELNADDIDRLFK + INIHAPYHASVEAARQMPEGGRILIIGSVNGDRMPVAGMAAYAASKSALQGMARGLARD + FGPRGITINVVQPGPIDTDANPANGPMRDMLHSLMAIKRHGQPEEVAGMVAWLAGPEAS + FVTGAMHTIDGAFGA" + gene 1706..2299 + /locus_tag="PU64_23680" + CDS 1706..2299 + /locus_tag="PU64_23680" + /inference="EXISTENCE: similar to AA + sequence:RefSeq:WP_001544295.1" + /note="Derived by automated computational analysis using + gene prediction method: Protein Homology." + /codon_start=1 + /transl_table=11 + /product="TetR family transcriptional regulator" + /protein_id="KIG36582.1" + /translation="MVTKKQSRVPGRPRRFAPEQAISAAKVLFHQKGFDAVSVAEVTDY + LGINPPSLYAAFGSKAGLFSRVLNEYVGTEAIPLADILRDDRPVGECLVEVLKEAARRY + SQNGGCAGCMVLEGIHSHDPLARDIAVQYYHAAETTIYDYIARRHPQSAQCVTDFMSTV + MSGLSAKAREGHSIEQLCATAALAGEAIKTLLKE" + gene 2444..2896 + /locus_tag="PU64_23685" + CDS 2444..2896 + /locus_tag="PU64_23685" + /inference="EXISTENCE: similar to AA + sequence:RefSeq:WP_001570607.1" + /note="Derived by automated computational analysis using + gene prediction method: Protein Homology." 
+ /codon_start=1 + /transl_table=11 + /product="toxin-antitoxin biofilm protein TabA" + /protein_id="KIG36583.1" + /translation="MIIGNIHNLQPWLPQELRQAIEHIKAHVTAETPKGKHDIEGNHLF + YLISEDMTEPYEARRAEYHARYLDIQIVLKGQEGMTFSTQPAGTPDTDWLADKDIAFLP + EGVDEKTVILNEGDFVVFYPGEVHKPLCAVGAPAQVRKAVVKMLMA" + gene 3019..>3242 + /locus_tag="PU64_23690" + CDS 3019..>3242 + /locus_tag="PU64_23690" + /inference="EXISTENCE: similar to AA + sequence:RefSeq:WP_000036524.1" + /note="Derived by automated computational analysis using + gene prediction method: Protein Homology." + /codon_start=1 + /transl_table=11 + /product="DNA-binding protein" + /protein_id="KIG36584.1" + /translation="MSKISGWNFSQNITSADNCKQKNEDLDTWYVGMNDFARIAGGQNS + RSNILSPRAFLEFLAKIFTLGYVDFSKRS" +ORIGIN + 1 aaaaaaaagc ccctcgattg aggggctggg aatgggtgat caacgggaag aaaaacggca + 61 ggccagcgtc ttttttcaga cgcggtaaga caaaatgtcg aacacactga accatacatc + 121 ctcccggcaa attgtccggc attatactca tcgtcagaag cggcgcaagc atttgatgca + 181 atattttgtc agcgcaaacg gtttatttga attaaaagtc aaggtatatg catttttatt + 241 tgatgtgatt ctgcagggga actgtccttc ggtatcaata attgcagaca ttccctgctt + 301 tttccttttt cactttcacg caatcagata tgcattttat tccattcatc cgacttatag + 361 ggcggagtgt gaaagcgaac ggctaacact attgcttact gctcagggat gcgcgctatc + 421 actttaattt caaaatcaaa gcctgccagc catgtaacac ccaccgccgt ccagtttgga + 481 taaggtgggg cgctaaatat ttcatttttc accgtcatga tgtcttcaaa ttggttttct + 541 ggatcggtat ggaagctcgt aacatcaatg atatcgtcaa aagtgcatcc cgcagctgcc + 601 agggtcgcat gcaaattatc aaatgccagt ctgacttgtt gctgaaaatc gggttctggt + 661 gttccgtcct ctcgacttcc tacttgcccg gaaacaaaca gcaaatcgcc ggaacgaata + 721 gccgcagaat aacgatgctc agcatatagt gaatgtcggc cagcagggaa aacagcggtt + 781 ctttctacca tttggttatc ctcaagattt acgacatgaa cagaagattt ctctttaccg + 841 ggagccgctt ttagcggacg acgtgagtaa acaaaaccca gacatcatgg ataatggctg + 901 ggcttaattg agcgtagtcg gttatgcgcc aaacgcgcca tcaatggtat gcattgcgcc + 961 ggtaacaaaa ctggcttctg gccctgctaa ccatgcgacc ataccagcga cctcttccgg + 1021 ttgcccatgt 
cttttgatag ccatcaaact atgcaacata tcgcgcattg gcccgttggc + 1081 gggattagcg tcggtatcaa ttggccctgg ctggacgacg ttaatggtga tcccacgcgg + 1141 tccaaaatca cgggccagcc cgcgcgccat gccttgcagg gcagatttgc tggcggcata + 1201 agcagccatg cctgcaacag gcatacgatc gccattcacg gagccgatga ttaagatgcg + 1261 cccgccttcg ggcatctgcc gggcggcttc aacagaggca tgataaggag catgaatatt + 1321 gattttgaaa aggcgatcaa tatcgtcggc atttaattcc agggcctcgc caaagacgcc + 1381 aatacctgca tttaccacca ggatatccaa tgcgccgctc ttacgaacga catcaatgac + 1441 agcgtctctg tcagcactat ctgtgaatac tgctgtcgct ccagtctctt gtgccaggcg + 1501 tttagcggca tctttcgacc ccgcataggt gaatcgtaca ttggccccat cggtgacgaa + 1561 acgacgtacg atagcggcac cgataccacg actgccaccg aggatgagaa ctgtcttacc + 1621 tgtaaaagcg cccataagga ctccttgatt tattatgtaa catgcattac aaaactgttt + 1681 taactttctg tcaacatgtt ttgtaatggt cactaaaaaa caatctcgcg ttccaggtcg + 1741 tcccagacgt ttcgctcctg agcaggcaat ctctgcggca aaagtgcttt ttcaccaaaa + 1801 aggtttcgat gctgtcagtg ttgctgaagt tactgattat cttggtatta accccccgag + 1861 cctctacgcg gcttttggca gtaaagctgg gttatttagc cgtgtactca atgaatacgt + 1921 cggtacggaa gctattccgc ttgccgatat tcttcgtgat gatcgtccag taggcgagtg + 1981 cctggttgag gtattaaaag aagcggcgcg cagatatagc caaaacggcg gctgcgctgg + 2041 ctgtatggtt cttgaaggta ttcatagtca tgatccacta gcgcgtgata ttgccgttca + 2101 atattatcac gccgcagaaa cgaccattta tgactatatc gccaggcggc atccacaaag + 2161 cgcacaatgt gtgactgatt ttatgagtac cgtgatgtca gggctttctg cgaaggcacg + 2221 ggaggggcac tcaatcgaac aactctgtgc aacagctgca ctggcggggg aagcgataaa + 2281 aactcttctc aaggagtgat gctggccttg atccgaaagg cgggaacgcg cctgccgata + 2341 agttgtgata agacaataat tcacgcatta aggctagcgg aattgatcat cttttcgtat + 2401 aacgatagaa atgaaacgtt gttttaatta aggagtggaa aagatgatca tcggaaatat + 2461 tcataatctt cagccgtggc taccccagga gttacgccag gcgattgagc atatcaaagc + 2521 acacgttacg gcagaaacgc caaagggcaa gcacgatatc gaaggcaatc atctgtttta + 2581 tcttatctcg gaagatatga ccgagccgta cgaagcccgc cgtgcggagt accatgcccg + 2641 ctatctcgac attcagattg tgttaaaagg 
tcaggaaggc atgaccttca gcacgcaacc + 2701 tgcaggcacg ccggataccg actggttagc tgataaagac atcgcatttt tgccggaagg + 2761 cgttgatgag aaaacagtta tccttaatga aggtgatttt gttgtgtttt atccggggga + 2821 agtgcataaa ccgctgtgcg cagtgggtgc accagcccag gttcgcaaag cagtagtgaa + 2881 gatgctgatg gcgtgatgac ttttcgccgt aaataactcc aggtttacgg cgagtttgtg + 2941 aaaagagcgt tttttgatat ttttttgtga gtaaaatttg taatgcttag acgttcttat + 3001 taactcaagg agtccgtcat gagcaaaata tcaggttgga atttttctca aaacattaca + 3061 tcagccgaca attgtaaaca aaaaaatgaa gacttagata cctggtatgt gggaatgaat + 3121 gattttgccc gaattgccgg agggcagaat agcagaagca atattctttc tcccagagca + 3181 tttttggagt ttttggctaa gatatttacc ctgggttatg tggattttag caaacgctcc + 3241 aa +// diff --git a/test/test_files/sbol3_genbank_conversion/test_location_types.nt b/test/test_files/sbol3_genbank_conversion/test_location_types.nt new file mode 100644 index 00000000..8c652cda --- /dev/null +++ b/test/test_files/sbol3_genbank_conversion/test_location_types.nt @@ -0,0 +1,436 @@ + "Range1" . + "3242"^^ . + . + . + "0"^^ . + . + "GenbankReference1" . + "Tyson,G.H., McDermott,P.F., Li,C., Chen,Y., Tadesse,D.A., Mukherjee,S., Bodeis-Jones,S., Kabera,C., Gaines,S.A., Loneragan,G.H., Edrington,T.S., Torrence,M., Harhay,D.M. and Zhao,S." . + "" . + "JWYZ01000115" . + "" . + "J. Antimicrob. Chemother. 70 (10), 2763-2769 (2015)" . + . + "" . + "26142410" . + "WGS accurately predicts antimicrobial resistance in Escherichia coli" . + . + . + "Range1" . + "3242"^^ . + . + . + "0"^^ . + . + "GenbankReference2" . + "Tyson,G.H., McDermott,P.F., Li,C., Tadesse,D.A., Mukherjee,S., Bodeis-Jones,S., Kabera,C., Gaines,S.A., Loneragan,G.H., Edrington,T.S., Torrence,M., Harhay,D.M. and Zhao,S." . + "" . + "JWYZ01000115" . + "" . + "Submitted (17-NOV-2014) CVM, FDA, 8401 Muirkirk Rd, Laurel, MD 20708, USA" . + . + "" . + "" . + "Direct Submission" . + . + . + "Range1" . + . + "115"^^ . + "1"^^ . + . + . + "Range1" . + "2299"^^ . + . + . + "1706"^^ . + . + "SequenceFeature10" . + . + . 
+ . + "0:locus_tag" . + "0:PU64_23680" . + . + "Range1" . + "2299"^^ . + . + . + "1706"^^ . + . + "SequenceFeature11" . + . + . + . + "0:locus_tag" . + "1:inference" . + "2:note" . + "3:codon_start" . + "4:transl_table" . + "5:product" . + "6:protein_id" . + "7:translation" . + "0:PU64_23680" . + "1:EXISTENCE: similar to AA sequence:RefSeq:WP_001544295.1" . + "2:Derived by automated computational analysis using gene prediction method: Protein Homology." . + "3:1" . + "4:11" . + "5:TetR family transcriptional regulator" . + "6:KIG36582.1" . + "7:MVTKKQSRVPGRPRRFAPEQAISAAKVLFHQKGFDAVSVAEVTDYLGINPPSLYAAFGSKAGLFSRVLNEYVGTEAIPLADILRDDRPVGECLVEVLKEAARRYSQNGGCAGCMVLEGIHSHDPLARDIAVQYYHAAETTIYDYIARRHPQSAQCVTDFMSTVMSGLSAKAREGHSIEQLCATAALAGEAIKTLLKE" . + . + "Range1" . + "2896"^^ . + . + . + "2444"^^ . + . + "SequenceFeature12" . + . + . + . + "0:locus_tag" . + "0:PU64_23685" . + . + "Range1" . + "2896"^^ . + . + . + "2444"^^ . + . + "SequenceFeature13" . + . + . + . + "0:locus_tag" . + "1:inference" . + "2:note" . + "3:codon_start" . + "4:transl_table" . + "5:product" . + "6:protein_id" . + "7:translation" . + "0:PU64_23685" . + "1:EXISTENCE: similar to AA sequence:RefSeq:WP_001570607.1" . + "2:Derived by automated computational analysis using gene prediction method: Protein Homology." . + "3:1" . + "4:11" . + "5:toxin-antitoxin biofilm protein TabA" . + "6:KIG36583.1" . + "7:MIIGNIHNLQPWLPQELRQAIEHIKAHVTAETPKGKHDIEGNHLFYLISEDMTEPYEARRAEYHARYLDIQIVLKGQEGMTFSTQPAGTPDTDWLADKDIAFLPEGVDEKTVILNEGDFVVFYPGEVHKPLCAVGAPAQVRKAVVKMLMA" . + . + "Range1" . + . + "3242"^^ . + "3019"^^ . + . + . + "SequenceFeature14" . + . + . + . + "0:locus_tag" . + "0:PU64_23690" . + . + "Range1" . + . + "3242"^^ . + "3019"^^ . + . + . + "SequenceFeature15" . + . + . + . + "0:locus_tag" . + "1:inference" . + "2:note" . + "3:codon_start" . + "4:transl_table" . + "5:product" . + "6:protein_id" . + "7:translation" . + "0:PU64_23690" . + "1:EXISTENCE: similar to AA sequence:RefSeq:WP_000036524.1" . 
+ "2:Derived by automated computational analysis using gene prediction method: Protein Homology." . + "3:1" . + "4:11" . + "5:DNA-binding protein" . + "6:KIG36584.1" . + "7:MSKISGWNFSQNITSADNCKQKNEDLDTWYVGMNDFARIAGGQNSRSNILSPRAFLEFLAKIFTLGYVDFSKRS" . + . + "SequenceFeature1" . + . + . + . + "0:locus_tag" . + "0:PU64_23660" . + . + "Range1" . + . + "115"^^ . + "1"^^ . + . + . + "SequenceFeature2" . + . + . + . + "0:locus_tag" . + "1:inference" . + "2:note" . + "3:codon_start" . + "4:transl_table" . + "5:product" . + "6:protein_id" . + "7:translation" . + "0:PU64_23660" . + "1:EXISTENCE: similar to AA sequence:RefSeq:WP_005059815.1" . + "2:Derived by automated computational analysis using gene prediction method: Protein Homology." . + "3:1" . + "4:11" . + "5:pyrBI operon leader peptide" . + "6:KIG36579.1" . + "7:MVQCVRHFVLPRLKKDAGLPFFFPLITHSQPLNRGAFF" . + . + "Range1" . + "3242"^^ . + . + . + "1"^^ . + . + "SequenceFeature3" . + . + . + . + "0:organism" . + "1:mol_type" . + "2:submitter_seqid" . + "3:strain" . + "4:isolation_source" . + "5:db_xref" . + "6:country" . + "7:collection_date" . + "0:Escherichia coli" . + "1:genomic DNA" . + "2:N37069PS_contig_115" . + "3:CVM N37069PS" . + "4:Farm" . + "5:taxon:562" . + "6:USA" . + "7:20-Jan-2012" . + . + "Range1" . + "427"^^ . + . + . + "95"^^ . + . + "SequenceFeature4" . + . + . + . + "0:locus_tag" . + "0:PU64_23665" . + . + "Range1" . + "427"^^ . + . + . + "95"^^ . + . + "SequenceFeature5" . + . + . + . + "0:locus_tag" . + "1:inference" . + "2:note" . + "3:codon_start" . + "4:transl_table" . + "5:product" . + "6:protein_id" . + "7:translation" . + "0:PU64_23665" . + "1:EXISTENCE: similar to AA sequence:RefSeq:WP_001349257.1" . + "2:Derived by automated computational analysis using gene prediction method: Protein Homology." . + "3:1" . + "4:11" . + "5:hypothetical protein" . + "6:KIG36585.1" . + "7:MSNTLNHTSSRQIVRHYTHRQKRRKHLMQYFVSANGLFELKVKVYAFLFDVILQGNCPSVSIIADIPCFFLFHFHAIRYAFYSIHPTYRAECESERLTLLLTAQGCALSL" . + . 
+ "Range1" . + "791"^^ . + . + . + "396"^^ . + . + "SequenceFeature6" . + . + . + . + "0:locus_tag" . + "0:PU64_23670" . + . + "Range1" . + "791"^^ . + . + . + "396"^^ . + . + "SequenceFeature7" . + . + . + . + "0:locus_tag" . + "1:inference" . + "2:note" . + "3:codon_start" . + "4:transl_table" . + "5:product" . + "6:protein_id" . + "7:translation" . + "0:PU64_23670" . + "1:EXISTENCE: similar to AA sequence:RefSeq:WP_001701843.1" . + "2:Derived by automated computational analysis using gene prediction method: Protein Homology." . + "3:1" . + "4:11" . + "5:mRNA endoribonuclease" . + "6:KIG36580.1" . + "7:MVERTAVFPAGRHSLYAEHRYSAAIRSGDLLFVSGQVGSREDGTPEPDFQQQVRLAFDNLHATLAAAGCTFDDIIDVTSFHTDPENQFEDIMTVKNEIFSAPPYPNWTAVGVTWLAGFDFEIKVIARIPEQ" . + . + "Range1" . + "1635"^^ . + . + . + "922"^^ . + . + "SequenceFeature8" . + . + . + . + "0:locus_tag" . + "0:PU64_23675" . + . + "Range1" . + "1635"^^ . + . + . + "922"^^ . + . + "SequenceFeature9" . + . + . + . + "0:locus_tag" . + "1:inference" . + "2:note" . + "3:codon_start" . + "4:transl_table" . + "5:product" . + "6:protein_id" . + "7:translation" . + "0:PU64_23675" . + "1:EXISTENCE: similar to AA sequence:SwissProt:P39333.2" . + "2:Derived by automated computational analysis using gene prediction method: Protein Homology." . + "3:1" . + "4:11" . + "5:oxidoreductase" . + "6:KIG36581.1" . + "7:MGAFTGKTVLILGGSRGIGAAIVRRFVTDGANVRFTYAGSKDAAKRLAQETGATAVFTDSADRDAVIDVVRKSGALDILVVNAGIGVFGEALELNADDIDRLFKINIHAPYHASVEAARQMPEGGRILIIGSVNGDRMPVAGMAAYAASKSALQGMARGLARDFGPRGITINVVQPGPIDTDANPANGPMRDMLHSLMAIKRHGQPEEVAGMVAWLAGPEASFVTGAMHTIDGAFGA" . + . + "GenbankStructuredComment1" . + "JWYZ01000115" . + "Genome-Assembly-Data" . + "1::Assembly Method" . + "2::Assembly Name" . + "3::Genome Coverage" . + "4::Sequencing Technology" . + "1::CLC Genomics Workbench v. 7.5" . + "2::Escherichia coli CVM N37069PS v1.0" . + "3::48.5x" . + "4::Illumina MiSeq" . + . + . + "GenbankStructuredComment2" . + "JWYZ01000115" . + "Genome-Annotation-Data" . 
+ "10::CRISPR Arrays" . + "11::rRNAs" . + "12::tRNAs" . + "13::ncRNA" . + "14::Frameshifted Genes" . + "1::Annotation Provider" . + "2::Annotation Date" . + "3::Annotation Pipeline" . + "4::Annotation Method" . + "5::Annotation Software revision" . + "6::Features Annotated" . + "7::Genes" . + "8::CDS" . + "9::Pseudo Genes" . + "10::2" . + "11::11 (5S, 16S, 23S)" . + "12::78" . + "13::17" . + "14::41" . + "1::NCBI" . + "2::12/29/2014 14:07:05" . + "3::NCBI Prokaryotic Genome Annotation Pipeline" . + "4::Best-placed reference protein set; GeneMarkS+" . + "5::2.9 (rev. 455303)" . + "6::Gene; CDS; rRNA; tRNA; ncRNA; repeat_region" . + "7::4,855" . + "8::4,642" . + "9::107" . + . + . + "Escherichia coli strain CVM N37069PS N37069PS_contig_115, whole genome shotgun sequence" . + "JWYZ01000115" . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + "JWYZ01000115" . + "Annotation was added by the NCBI Prokaryotic Genome Annotation\nPipeline (released 2013). Information about the Pipeline can be\nfound here: http://www.ncbi.nlm.nih.gov/genome/annotation_prok/" . + "16-JAN-2015" . + "BioProject:PRJNA266657::BioSample:SAMN03177677" . + "BCT" . + . + . + . + . + "JWYZ01000115.1" . + "WGS" . + "JWYZ01000115" . + "DNA" . + "JWYZ01000115" . + "Escherichia coli" . + . + . + "1"^^ . + "Escherichia coli" . + . + . + "Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia" . + "linear" . + . + "JWYZ01000115_sequence" . 
+ "aaaaaaaagcccctcgattgaggggctgggaatgggtgatcaacgggaagaaaaacggcaggccagcgtcttttttcagacgcggtaagacaaaatgtcgaacacactgaaccatacatcctcccggcaaattgtccggcattatactcatcgtcagaagcggcgcaagcatttgatgcaatattttgtcagcgcaaacggtttatttgaattaaaagtcaaggtatatgcatttttatttgatgtgattctgcaggggaactgtccttcggtatcaataattgcagacattccctgctttttcctttttcactttcacgcaatcagatatgcattttattccattcatccgacttatagggcggagtgtgaaagcgaacggctaacactattgcttactgctcagggatgcgcgctatcactttaatttcaaaatcaaagcctgccagccatgtaacacccaccgccgtccagtttggataaggtggggcgctaaatatttcatttttcaccgtcatgatgtcttcaaattggttttctggatcggtatggaagctcgtaacatcaatgatatcgtcaaaagtgcatcccgcagctgccagggtcgcatgcaaattatcaaatgccagtctgacttgttgctgaaaatcgggttctggtgttccgtcctctcgacttcctacttgcccggaaacaaacagcaaatcgccggaacgaatagccgcagaataacgatgctcagcatatagtgaatgtcggccagcagggaaaacagcggttctttctaccatttggttatcctcaagatttacgacatgaacagaagatttctctttaccgggagccgcttttagcggacgacgtgagtaaacaaaacccagacatcatggataatggctgggcttaattgagcgtagtcggttatgcgccaaacgcgccatcaatggtatgcattgcgccggtaacaaaactggcttctggccctgctaaccatgcgaccataccagcgacctcttccggttgcccatgtcttttgatagccatcaaactatgcaacatatcgcgcattggcccgttggcgggattagcgtcggtatcaattggccctggctggacgacgttaatggtgatcccacgcggtccaaaatcacgggccagcccgcgcgccatgccttgcagggcagatttgctggcggcataagcagccatgcctgcaacaggcatacgatcgccattcacggagccgatgattaagatgcgcccgccttcgggcatctgccgggcggcttcaacagaggcatgataaggagcatgaatattgattttgaaaaggcgatcaatatcgtcggcatttaattccagggcctcgccaaagacgccaatacctgcatttaccaccaggatatccaatgcgccgctcttacgaacgacatcaatgacagcgtctctgtcagcactatctgtgaatactgctgtcgctccagtctcttgtgccaggcgtttagcggcatctttcgaccccgcataggtgaatcgtacattggccccatcggtgacgaaacgacgtacgatagcggcaccgataccacgactgccaccgaggatgagaactgtcttacctgtaaaagcgcccataaggactccttgatttattatgtaacatgcattacaaaactgttttaactttctgtcaacatgttttgtaatggtcactaaaaaacaatctcgcgttccaggtcgtcccagacgtttcgctcctgagcaggcaatctctgcggcaaaagtgctttttcaccaaaaaggtttcgatgctgtcagtgttgctgaagttactgattatcttggtattaaccccccgagcctctacgcggcttttggcagtaaagctgggttatttagccgtgtactcaatgaatacgtcggtacggaagctattccgcttgccgatattcttcgtgatgatcgtccagtaggcgagtgcctggttgaggtattaa
aagaagcggcgcgcagatatagccaaaacggcggctgcgctggctgtatggttcttgaaggtattcatagtcatgatccactagcgcgtgatattgccgttcaatattatcacgccgcagaaacgaccatttatgactatatcgccaggcggcatccacaaagcgcacaatgtgtgactgattttatgagtaccgtgatgtcagggctttctgcgaaggcacgggaggggcactcaatcgaacaactctgtgcaacagctgcactggcgggggaagcgataaaaactcttctcaaggagtgatgctggccttgatccgaaaggcgggaacgcgcctgccgataagttgtgataagacaataattcacgcattaaggctagcggaattgatcatcttttcgtataacgatagaaatgaaacgttgttttaattaaggagtggaaaagatgatcatcggaaatattcataatcttcagccgtggctaccccaggagttacgccaggcgattgagcatatcaaagcacacgttacggcagaaacgccaaagggcaagcacgatatcgaaggcaatcatctgttttatcttatctcggaagatatgaccgagccgtacgaagcccgccgtgcggagtaccatgcccgctatctcgacattcagattgtgttaaaaggtcaggaaggcatgaccttcagcacgcaacctgcaggcacgccggataccgactggttagctgataaagacatcgcatttttgccggaaggcgttgatgagaaaacagttatccttaatgaaggtgattttgttgtgttttatccgggggaagtgcataaaccgctgtgcgcagtgggtgcaccagcccaggttcgcaaagcagtagtgaagatgctgatggcgtgatgacttttcgccgtaaataactccaggtttacggcgagtttgtgaaaagagcgttttttgatatttttttgtgagtaaaatttgtaatgcttagacgttcttattaactcaaggagtccgtcatgagcaaaatatcaggttggaatttttctcaaaacattacatcagccgacaattgtaaacaaaaaaatgaagacttagatacctggtatgtgggaatgaatgattttgcccgaattgccggagggcagaatagcagaagcaatattctttctcccagagcatttttggagtttttggctaagatatttaccctgggttatgtggattttagcaaacgctccaa" . + . + . + . diff --git a/test/test_files/sbol3_genbank_conversion/test_locus_name_display_id.gb b/test/test_files/sbol3_genbank_conversion/test_locus_name_display_id.gb new file mode 100644 index 00000000..f6c9c513 --- /dev/null +++ b/test/test_files/sbol3_genbank_conversion/test_locus_name_display_id.gb @@ -0,0 +1,110 @@ +LOCUS SaABE8e_TadA-8e_V106W 3876 bp DNA linear UNK 13-JAN-2022 +DEFINITION . +ACCESSION SaABE8e_TadA-8e_V106W +VERSION SaABE8e_TadA-8e_V106W.1 +KEYWORDS . +SOURCE None + ORGANISM None + . 
+FEATURES Location/Qualifiers + misc_feature 1..57 + /label="NLS" + /ApEinfo_revcolor="#b4abac" + /ApEinfo_fwdcolor="#b4abac" + CDS 1..3876 + /label="Translation 1-3876" + misc_feature 1..3876 + /label="SaABE8e(TadA V106W)" + /ApEinfo_revcolor="#c7b0e3" + /ApEinfo_fwdcolor="#c7b0e3" + CDS 37..57 + /label="SV40 NLS" + /ApEinfo_revcolor="#9eafd2" + /ApEinfo_fwdcolor="#9eafd2" + misc_feature 58..555 + /label="TadA-8e V106W" + /ApEinfo_revcolor="#84b0dc" + /ApEinfo_fwdcolor="#84b0dc" + misc_feature 556..651 + /label="Linker" + /ApEinfo_revcolor="#d6b295" + /ApEinfo_fwdcolor="#d6b295" + misc_feature 652..3810 + /label="SaCas9 nickase" + /ApEinfo_revcolor="#b1ff67" + /ApEinfo_fwdcolor="#b1ff67" + misc_feature 3811..3876 + /label="NLS" + /ApEinfo_revcolor="#f8d3a9" + /ApEinfo_fwdcolor="#f8d3a9" + CDS 3853..3873 + /label="SV40 NLS" + /ApEinfo_revcolor="#b1ff67" + /ApEinfo_fwdcolor="#b1ff67" +ORIGIN + 1 atgaaacgga cagccgacgg aagcgagttc gagtcaccaa agaagaagcg gaaagtctct + 61 gaggtggagt tttcccacga gtactggatg agacatgccc tgaccctggc caagagggca + 121 cgggatgaga gggaggtgcc tgtgggagcc gtgctggtgc tgaacaatag agtgatcggc + 181 gagggctgga acagagccat cggcctgcac gacccaacag cccatgccga aattatggcc + 241 ctgagacagg gcggcctggt catgcagaac tacagactga ttgacgccac cctgtacgtg + 301 acattcgagc cttgcgtgat gtgcgccggc gccatgatcc actctaggat cggccgcgtg + 361 gtgtttggat ggagaaattc taaaagaggc gccgcaggct ccctgatgaa cgtgctgaac + 421 taccccggca tgaatcaccg cgtcgaaatt accgagggaa tcctggcaga tgaatgtgcc + 481 gccctgctgt gcgatttcta tcggatgcct agacaggtgt tcaatgctca gaagaaggcc + 541 cagagctcca tcaactccgg aggatctagc ggaggctcct ctggctctga gacacctggc + 601 acaagcgaga gcgcaacacc tgaaagcagc gggggcagca gcggggggtc agggaagcga + 661 aattacattc tggggctggc cattggcatt acatcagtgg gctatggcat cattgactac + 721 gagacaaggg acgtgatcga cgccggcgtg agactgttca aggaggccaa cgtggagaac + 781 aatgagggcc ggagatccaa gaggggagca aggcgcctga agcggagaag gcgccacaga + 841 atccagagag tgaagaagct gctgttcgat tacaacctgc tgaccgacca ctccgagctg + 901 tctggcatca 
atccttatga ggccagagtg aagggcctgt cccagaagct gtctgaggag + 961 gagtttagcg ccgccctgct gcacctggca aagaggagag gcgtgcacaa cgtgaatgag + 1021 gtggaggagg acaccggcaa cgagctgtcc acaaaggagc agatcagccg caattccaag + 1081 gccctggagg agaagtatgt ggccgagctg cagctggagc ggctgaagaa ggatggcgag + 1141 gtgaggggct ccatcaatcg cttcaagacc tctgactacg tgaaggaggc caagcagctg + 1201 ctgaaggtgc agaaggccta ccaccagctg gatcagtcct ttatcgatac atatatcgac + 1261 ctgctggaga caaggcgcac atactatgag ggaccaggag agggctctcc cttcggctgg + 1321 aaggacatca aggagtggta cgagatgctg atgggccact gcacctattt tccagaggag + 1381 ctgagaagcg tgaagtacgc ctataacgcc gatctgtaca acgccctgaa tgacctgaac + 1441 aacctggtca tcaccaggga tgagaacgag aagctggagt actatgagaa gttccagatc + 1501 atcgagaacg tgttcaagca gaagaagaag cctacactga agcagatcgc caaggagatc + 1561 ctggtgaacg aggaggacat caagggctac cgcgtgacct ccacaggcaa gccagagttc + 1621 accaatctga aggtgtatca cgatatcaag gacatcacag cccggaagga gatcatcgag + 1681 aacgccgagc tgctggatca gatcgccaag atcctgacca tctatcagag ctccgaggac + 1741 atccaggagg agctgaccaa cctgaatagc gagctgacac aggaggagat cgagcagatc + 1801 agcaatctga agggctacac cggcacacac aacctgagcc tgaaggccat caatctgatc + 1861 ctggatgagc tgtggcacac aaacgacaat cagatcgcca tctttaaccg gctgaagctg + 1921 gtgccaaaga aggtggacct gtcccagcag aaggagatcc caaccacact ggtggacgat + 1981 ttcatcctgt ctcccgtggt gaagcggagc ttcatccaga gcatcaaagt gatcaacgcc + 2041 atcatcaaga agtacggcct gcccaatgat atcatcatcg agctggccag ggagaagaac + 2101 tccaaggacg cccagaagat gatcaatgag atgcagaaga ggaaccgcca gaccaatgag + 2161 cggatcgagg agatcatcag aaccacaggc aaggagaacg ccaagtacct gatcgagaag + 2221 atcaagctgc acgatatgca ggagggcaag tgtctgtatt ctctggaggc catccctctg + 2281 gaggacctgc tgaacaatcc attcaactac gaggtggatc acatcatccc ccggagcgtg + 2341 agcttcgaca attcttttaa caataaggtg ctggtgaagc aggaggagaa cagcaagaag + 2401 ggcaatagga cccctttcca gtacctgtct agctccgatt ctaagatcag ctacgagaca + 2461 ttcaagaagc acatcctgaa tctggccaag ggcaagggcc gcatcagcaa gaccaagaag + 2521 gagtacctgc tggaggagcg ggacatcaac 
agattctccg tgcagaagga cttcatcaac + 2581 cggaatctgg tggacaccag atacgccaca cgcggcctga tgaatctgct gcggtcttat + 2641 ttcagagtga acaatctgga tgtgaaggtg aagagcatca acggcggctt cacctccttt + 2701 ctgcggagaa agtggaagtt taagaaggag cgcaacaagg gctataagca ccacgccgag + 2761 gatgccctga tcatcgccaa tgccgacttc atctttaagg agtggaagaa gctggacaag + 2821 gccaagaaag tgatggagaa ccagatgttc gaggagaagc aggccgagag catgcccgag + 2881 atcgagacag agcaggagta caaggagatt ttcatcacac ctcaccagat caagcacatc + 2941 aaggacttca aggactacaa gtattctcac agggtggata agaagcccaa ccgcgagctg + 3001 atcaatgaca ccctgtatag cacacggaag gacgataagg gcaataccct gatcgtgaac + 3061 aatctgaacg gcctgtacga caaggataat gacaagctga agaagctgat caacaagtct + 3121 cccgagaagc tgctgatgta ccaccacgat cctcagacat atcagaagct gaagctgatc + 3181 atggagcagt acggcgacga gaagaaccca ctgtataagt actatgagga gacaggcaac + 3241 tacctgacaa agtatagcaa gaaggataat ggccccgtga tcaagaagat caagtactat + 3301 ggcaacaagc tgaatgccca cctggacatc accgacgatt accctaactc tcgcaataag + 3361 gtggtgaagc tgagcctgaa gccataccgg ttcgacgtgt acctggacaa cggcgtgtat + 3421 aagtttgtga cagtgaagaa tctggatgtg atcaagaagg agaactacta tgaggtgaac + 3481 agcaagtgct acgaggaggc caagaagctg aagaagatca gcaaccaggc cgagttcatc + 3541 gcctcttttt acaacaatga cctgatcaag atcaatggcg agctgtatag agtgatcggc + 3601 gtgaacaatg atctgctgaa cagaatcgaa gtgaatatga tcgacatcac ctacagggag + 3661 tatctggaga acatgaatga taagaggccc cctcgcatca tcaagaccat cgcctctaag + 3721 acacagagca tcaagaagta cagcacagac atcctgggga acctgtatga agtcaagagc + 3781 aagaaacatc ctcagattat caagaaaggc tctggcggct caaaaagaac cgccgacggc + 3841 agcgaattcg agcccaagaa gaagaggaaa gtctaa +// diff --git a/test/test_files/sbol3_genbank_conversion/test_structured_comments.gb b/test/test_files/sbol3_genbank_conversion/test_structured_comments.gb new file mode 100644 index 00000000..658d2958 --- /dev/null +++ b/test/test_files/sbol3_genbank_conversion/test_structured_comments.gb @@ -0,0 +1,56 @@ +LOCUS KY484012 720 bp DNA linear SYN 01-NOV-2017 +DEFINITION Synthetic 
construct mCerulean3 (cpmCerulean3) gene, complete cds. +ACCESSION KY484012 +VERSION KY484012.1 +KEYWORDS . +SOURCE synthetic construct + ORGANISM synthetic construct + other sequences; artificial sequences. +REFERENCE 1 (bases 1 to 720) + AUTHORS Boehm,C.R., Gorchs Rovira,A. and Mehrshahi,P. + TITLE Implementation of a synthetic transcriptional and gate in the + chloroplast of Chlamydomonas reinhardtii + JOURNAL Unpublished +REFERENCE 2 (bases 1 to 720) + AUTHORS Boehm,C.R., Gorchs Rovira,A. and Mehrshahi,P. + TITLE Direct Submission + JOURNAL Submitted (16-JAN-2017) Plant Sciences, University of Cambridge, + Downing Street, Cambridge, Cambridgeshire CB2 3EA, United Kingdom +COMMENT ##Assembly-Data-START## + Sequencing Technology :: Sanger dideoxy sequencing + ##Assembly-Data-END## +FEATURES Location/Qualifiers + gene 1..720 + /gene="cpmCerulean3" + source 1..720 + /organism="synthetic construct" + /mol_type="other DNA" + /db_xref="taxon:32630" + CDS 1..720 + /gene="cpmCerulean3" + /note="enhanced cyan fluorescent protein; codon-optimized + for expression from the Chlamydomonas reinhardtii + chloroplast" + /codon_start=1 + /transl_table=11 + /product="mCerulean3" + /protein_id="ATP07149.1" + /translation="MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTL + KFICTTGKLPVPWPTLVTTLSWGVQCFARYPDHMKQHDFFKSAMPEGYVQERTIFFKDD + GNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNAIHGNVYITADKQKNGIK + ANFGLNCNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSKLSKDPNEKRDHMVLL + EFVTAAGITLGMDELYK" +ORIGIN + 1 atggtttcta aaggtgaaga attattcact ggtgttgtac caattttagt tgaattagat + 61 ggtgacgtaa acggtcacaa attctcagta tcaggtgaag gtgaaggtga tgctacttac + 121 ggtaaattaa ctttaaaatt catttgtaca acaggtaaat taccagttcc atggccaact + 181 ttagtaacaa ctttatcttg gggtgtacaa tgtttcgctc gttacccaga tcacatgaaa + 241 caacacgact tcttcaaatc agctatgcca gaaggttacg tacaagaacg tactattttc + 301 ttcaaagatg atggtaacta caaaacacgt gctgaagtta aattcgaagg tgatacttta + 361 gtaaaccgta ttgaattaaa aggtattgat ttcaaagaag atggtaacat tttaggtcac + 421 
aaattagaat acaacgctat tcacggtaac gtatacatta ctgctgataa acaaaaaaac + 481 ggtattaaag ctaacttcgg tttaaactgt aacattgaag atggttcagt acaattagct + 541 gatcactacc aacaaaacac tccaattggt gatggtccag ttttattacc agataaccac + 601 tacttatcaa ctcaatctaa attatctaaa gatccaaacg aaaaacgtga ccacatggtt + 661 ttattagaat tcgtaactgc tgctggtatt actttaggta tggatgaatt atacaaataa +// diff --git a/test/test_files/sbol3_implementation.nt b/test/test_files/sbol3_implementation.nt new file mode 100644 index 00000000..64a32fb0 --- /dev/null +++ b/test/test_files/sbol3_implementation.nt @@ -0,0 +1,13 @@ + . + . + "impl1" . + . + . + "TetR protein" . + "TetR" . + . + . + "TetR_protein" . + . + "1" . + "1" . diff --git a/test/test_files/sbol_3to2_collection.xml b/test/test_files/sbol_3to2_collection.xml new file mode 100644 index 00000000..75ca6a46 --- /dev/null +++ b/test/test_files/sbol_3to2_collection.xml @@ -0,0 +1,30 @@ + + + + + col1 + 1 + + + + + + LacI + + + LacI protein + + LacI_protein + 1 + + + + TetR_protein + TetR + 1 + + + TetR protein + + + diff --git a/test/test_files/sbol_3to2_implementation.xml b/test/test_files/sbol_3to2_implementation.xml new file mode 100644 index 00000000..d9ef888d --- /dev/null +++ b/test/test_files/sbol_3to2_implementation.xml @@ -0,0 +1,19 @@ + + + + 1 + + TetR + TetR_protein + + + TetR protein + + + + + + 1 + impl1 + + diff --git a/test/test_files/sbol_3to2to3_implementation.nt b/test/test_files/sbol_3to2to3_implementation.nt new file mode 100644 index 00000000..080ee87d --- /dev/null +++ b/test/test_files/sbol_3to2to3_implementation.nt @@ -0,0 +1,13 @@ + "1" . + . + . + . + . + . + . + "impl1" . + "1" . + "TetR_protein" . + "TetR protein" . + . + "TetR" . 
# File: test/test_genbank_sbol3_direct.py
import unittest
import sbol3
import os
import tempfile
from pathlib import Path
from helpers import copy_to_tmp, assert_files_identical
from sbol_utilities.sbol_diff import doc_diff
from sbol_utilities.sbol3_genbank_conversion import GenBankSBOL3Converter

# Directory holding the fixtures shared by the whole test suite
TEST_FILE_DIR = Path(__file__).parent / "test_files"
# Fixtures specific to the direct GenBank <-> SBOL3 converter
CONVERSION_FILE_DIR = TEST_FILE_DIR / "sbol3_genbank_conversion"


class TestGenBankSBOL3(unittest.TestCase):
    """Tests of direct GenBank <-> SBOL3 conversion using GenBankSBOL3Converter."""

    # Converter instance shared by every test in this class
    converter = GenBankSBOL3Converter()

    def _test_genbank_to_sbol3(self, sample_sbol3_file: Path, sample_genbank_file: Path):
        """Helper method to test conversion of a given GenBank file to SBOL3 using new converter.

        :param sample_sbol3_file: Path of expected SBOL3 converted file
        :param sample_genbank_file: Path of given GenBank file to convert
        """
        # The converter takes an output path even though write=False means nothing is written;
        # the returned document is compared directly against the expected one.
        unused_output_path = str(sample_sbol3_file) + ".test"
        converted_doc = self.converter.convert_genbank_to_sbol3(
            str(sample_genbank_file),
            unused_output_path,
            namespace=self.converter.TEST_NAMESPACE,
            write=False,
        )
        expected_doc = sbol3.Document()
        expected_doc.read(location=str(sample_sbol3_file), file_format=sbol3.SORTED_NTRIPLES)
        # assertFalse (rather than a bare assert) still runs when Python is started with -O
        self.assertFalse(
            doc_diff(converted_doc, expected_doc),
            f"SBOL3 converted from {sample_genbank_file} not identical to expected file: {sample_sbol3_file}",
        )

    def _test_sbol3_to_genbank(self, sample_sbol3_file: Path, sample_genbank_file: Path):
        """Helper method to test conversion of a given SBOL3 file to GenBank using new converter.

        :param sample_sbol3_file: Path of given SBOL3 file to convert
        :param sample_genbank_file: Path of expected GenBank converted file
        """
        # create tmp directory to store generated genbank file in for comparison
        tmp_sub = copy_to_tmp(package=[str(sample_sbol3_file)])
        doc3 = sbol3.Document()
        doc3.read(str(sample_sbol3_file))
        # Convert to GenBank and check contents
        outfile = Path(tmp_sub) / sample_genbank_file.with_suffix('.test').name
        self.converter.convert_sbol3_to_genbank(sbol3_file=None, doc=doc3, gb_file=str(outfile), write=True)
        assert_files_identical(outfile, sample_genbank_file)

    def _test_round_trip_genbank(self, sample_genbank_file: Path):
        """Helper method to test conversion of a given GenBank file to SBOL3 and then back to GenBank
        and confirm the final file is exactly the same as the initial provided file.

        :param sample_genbank_file: Path of given GenBank file to round trip test
        """
        sbol3.set_namespace(self.converter.TEST_NAMESPACE)
        # Output path is required by the converter but unused because write=False
        unused_output_path = str(sample_genbank_file) + ".nt"
        converted_doc = self.converter.convert_genbank_to_sbol3(
            str(sample_genbank_file),
            unused_output_path,
            namespace=self.converter.TEST_NAMESPACE,
            write=False,
        )
        # create tmp directory to store generated genbank file in for comparison
        tmp_sub = copy_to_tmp(package=[str(sample_genbank_file)])
        # Convert back to GenBank and check contents
        outfile = Path(tmp_sub) / sample_genbank_file.with_suffix('.test').name
        self.converter.convert_sbol3_to_genbank(
            sbol3_file=None, doc=converted_doc, gb_file=str(outfile), write=True
        )
        assert_files_identical(outfile, sample_genbank_file)

    def test_gbtosbol3_1(self):
        """Test conversion of a simple genbank file with a single sequence"""
        genbank_file = TEST_FILE_DIR / "BBa_J23101.gb"
        sbol3_file = CONVERSION_FILE_DIR / "BBa_J23101_from_genbank_to_sbol3_direct.nt"
        sbol3.set_namespace(self.converter.TEST_NAMESPACE)
        self._test_genbank_to_sbol3(sample_sbol3_file=sbol3_file, sample_genbank_file=genbank_file)

    def test_gbtosbol3_2(self):
        """Test conversion of a genbank file with multiple sequences and multiple features"""
        genbank_file = TEST_FILE_DIR / "iGEM_SBOL2_imports.gb"
        sbol3_file = CONVERSION_FILE_DIR / "iGEM_SBOL2_imports_from_genbank_to_sbol3_direct.nt"
        sbol3.set_namespace(self.converter.TEST_NAMESPACE)
        self._test_genbank_to_sbol3(sample_sbol3_file=sbol3_file, sample_genbank_file=genbank_file)

    def test_sbol3_to_gb_1(self):
        """Test ability to convert from SBOL3 to GenBank using new converter"""
        genbank_file = CONVERSION_FILE_DIR / "BBa_J23101_from_sbol3_direct.gb"
        sbol3_file = CONVERSION_FILE_DIR / "BBa_J23101_from_genbank_to_sbol3_direct.nt"
        self._test_sbol3_to_genbank(sample_sbol3_file=sbol3_file, sample_genbank_file=genbank_file)

    def test_sbol3_to_gb_2(self):
        """Test ability to convert from SBOL3 to GenBank with multiple records/features using new converter"""
        genbank_file = CONVERSION_FILE_DIR / "iGEM_SBOL2_imports_from_sbol3_direct.gb"
        sbol3_file = CONVERSION_FILE_DIR / "iGEM_SBOL2_imports_from_genbank_to_sbol3_direct.nt"
        self._test_sbol3_to_genbank(sample_sbol3_file=sbol3_file, sample_genbank_file=genbank_file)

    def test_round_trip_extra_properties(self):
        """Test ability to produce same genbank file on round trip when original genbank file has nonstandard
        values for extraneous properties
        """
        self._test_round_trip_genbank(CONVERSION_FILE_DIR / "test_extra_properties.gb")

    def test_round_trip_multiple_loc_feat(self):
        """Test ability to produce same genbank file on round trip when original genbank file has multiple
        locations on a feature
        """
        self._test_round_trip_genbank(CONVERSION_FILE_DIR / "multiple_feature_locations.gb")

    def test_round_trip_extra_properties_with_references(self):
        """Test ability to produce same genbank file on round trip when original genbank file has nonstandard
        values for extraneous properties, along with references
        """
        self._test_round_trip_genbank(CONVERSION_FILE_DIR / "test_extra_properties_with_references.gb")

    def test_ignoring_sbol_properties(self):
        """Test ability to ignore SBOL3 properties, which can't be parsed to their corresponding GenBank fields
        while converting from SBOL3 to GenBank. This also tests tolerance of deprecated orientations
        """
        # this test file contains only components, sequences and attachments as its top level objects
        sbol3_file = CONVERSION_FILE_DIR / "ignoring_sbol_properties.nt"
        # create tmp directory to store generated genbank file in for comparison
        tmp_sub = copy_to_tmp(package=[str(sbol3_file)])
        doc3 = sbol3.Document()
        doc3.read(str(sbol3_file))
        # Convert to GenBank and check contents.
        # Path.name is used instead of splitting on "/" so the file name is also extracted
        # correctly on Windows, where the path separator is a backslash.
        outfile = os.path.join(tmp_sub, sbol3_file.name + ".test")
        res = self.converter.convert_sbol3_to_genbank(
            sbol3_file=None, doc=doc3, gb_file=outfile, write=True
        )
        # Only components and sequences are converted into the genbank file; every other
        # top level object (here: the attachments) must report a "False" conversion status.
        for top_level_object in res["status"]:
            if isinstance(top_level_object, (sbol3.Component, sbol3.Sequence)):
                self.assertTrue(res["status"][top_level_object])
            else:
                self.assertFalse(res["status"][top_level_object])

    def test_round_trip_feature_qualifiers(self):
        """Test ability to produce same genbank file on round trip when original genbank file has keys and
        values for feature qualifiers
        """
        self._test_round_trip_genbank(CONVERSION_FILE_DIR / "feature_qualifier_storage.gb")

    def test_round_trip_all_test_files(self):
        """Test ability to correctly round trip all genbank test files in this test suite"""
        # sorted() gives a deterministic order; subTest reports every failing file
        # instead of stopping at the first one.
        for genbank_file in sorted(TEST_FILE_DIR.glob('*.gb')):
            with self.subTest(genbank_file=genbank_file.name):
                self._test_round_trip_genbank(genbank_file)

    def test_round_trip_iGEM_BBF10K(self):
        """Test ability to correctly round trip genbank test files in the iGEM distribution of the form
        BBF10K_000***.gb ; these files mostly follow standard GenBank formatting, and don't have misplaced information
        """
        self._test_round_trip_genbank(CONVERSION_FILE_DIR / "iGEM_BBF10K_000475_modified.gb")

    def test_round_trip_structured_comments(self):
        """Test ability to correctly round trip genbank test files in the iGEM distribution which have
        comment and structured comment annotations.
        """
        self._test_round_trip_genbank(CONVERSION_FILE_DIR / "test_structured_comments.gb")

    def test_dblink_property(self):
        """Test ability to correctly round trip genbank test files in the iGEM distribution which have
        the DB_LINK (or 'dbxrefs' in biopython) property.
        """
        self._test_round_trip_genbank(CONVERSION_FILE_DIR / "test_dblink_property.gb")

    def test_location_genbank_extension(self):
        """Test that the Location_GenBank_Extension can be converted
        from GenBank into SBOL3, and that the resulting SBOL3 file can
        be loaded.
        NOTE: This unit test can be removed once pySBOL3 makes a new release fixing the following bug
        ISSUE: https://github.com/SynBioDex/pySBOL3/issues/414
        """
        genbank_file = CONVERSION_FILE_DIR / "test_location_types.gb"
        # Write into a temporary directory so the test does not leave an 'sbol3.nt'
        # file behind in the current working directory.
        with tempfile.TemporaryDirectory() as tmpdir:
            sbol3_path = str(Path(tmpdir) / "sbol3.nt")
            self.converter.convert_genbank_to_sbol3(str(genbank_file), sbol3_path, write=True)
            doc = sbol3.Document()
            doc.read(sbol3_path, file_format=sbol3.NTRIPLES)

    def test_locus_name_and_display_id(self):
        """Test ability to correctly round trip the genbank test file test_locus_name_display_id.gb,
        which (per its name) exercises handling of the locus name versus the display ID.
        """
        self._test_round_trip_genbank(CONVERSION_FILE_DIR / "test_locus_name_display_id.gb")

    def test_feature_location_types_ignore_fuzzy(self):
        """Test ability to correctly convert genbank test files in the iGEM distribution which have
        different FeatureLocation types like BeforePosition / AfterPosition / ExactPosition.
        """
        genbank_file = CONVERSION_FILE_DIR / "test_location_types.gb"
        sbol3_file = CONVERSION_FILE_DIR / "test_location_types.nt"
        sbol3.set_namespace(self.converter.TEST_NAMESPACE)
        self._test_genbank_to_sbol3(sample_sbol3_file=sbol3_file, sample_genbank_file=genbank_file)

    @unittest.skip(reason="Round-tripping blocked by https://github.com/SynBioDex/SBOL-utilities/issues/200")
    def test_feature_location_types_round_trip_fuzzy(self):
        """Test ability to correctly round trip genbank test files in the iGEM distribution which have
        different FeatureLocation types like BeforePosition / AfterPosition / ExactPosition.
        """
        self._test_round_trip_genbank(CONVERSION_FILE_DIR / "test_location_types.gb")

    # def test_round_trip_all_iGEM(self):
    #     test_file_dir = Path(__file__).parent.parent / 'iGEM'
    #     for genbank_file in test_file_dir.glob('BBF10K_*.gb'):
    #         print(f"file {genbank_file}")
    #         x = self._test_round_trip_genbank(genbank_file)
    #         print(f"result {x}")


if __name__ == "__main__":
    unittest.main()


# File: test/test_sbol2_sbol3_direct.py
import tempfile
from pathlib import Path

import unittest

import sbol2
import sbol3

from sbol_utilities.conversion import convert2to3, convert3to2
from sbol_utilities.sbol_diff import file_diff

TEST_FILES = Path(__file__).parent / 'test_files'


class TestDirectSBOL2SBOL3Conversion(unittest.TestCase):
    """Tests of the native (direct) SBOL2 <-> SBOL3 converter, run in both directions."""

    # TODO: turn on validation
    def _check_3to2_round_trip(self, sbol3_name: str, sbol2_name: str):
        """Convert an SBOL3 fixture to SBOL2, compare with the expected SBOL2 fixture,
        then convert back and compare with the original SBOL3 fixture.

        :param sbol3_name: file name (within TEST_FILES) of the SBOL3 document to start from
        :param sbol2_name: file name (within TEST_FILES) of the expected SBOL2 document
        """
        # Load an SBOL3 document
        doc3 = sbol3.Document()
        doc3.read(TEST_FILES / sbol3_name)
        # Convert to SBOL2 and check contents
        doc2 = convert3to2(doc3, True)
        # report = doc2.validate()
        # self.assertEqual(len(report), 0, f'Validation failed: {report}')
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp2 = Path(tmpdir) / 'doc2.xml'
            doc2.write(tmp2)
            self.assertFalse(file_diff(str(tmp2), str(TEST_FILES / sbol2_name)))
            # Convert back to SBOL3 and make sure nothing was lost on the way
            doc3_loop = convert2to3(doc2, use_native_converter=True)
            self.assertEqual(len(doc3_loop.validate()), 0)
            tmp3 = Path(tmpdir) / 'doc3_loop.nt'
            doc3_loop.write(tmp3)
            self.assertFalse(file_diff(str(tmp3), str(TEST_FILES / sbol3_name)))

    def _check_2to3_round_trip(self, sbol2_name: str, sbol3_name: str):
        """Convert an SBOL2 fixture to SBOL3, compare with the expected SBOL3 fixture,
        then convert back and compare with the original SBOL2 fixture.

        :param sbol2_name: file name (within TEST_FILES) of the SBOL2 document to start from
        :param sbol3_name: file name (within TEST_FILES) of the expected SBOL3 document
        """
        # Load an SBOL2 document
        doc2 = sbol2.Document()
        doc2.read(TEST_FILES / sbol2_name)
        # Convert to SBOL3 and check contents
        doc3 = convert2to3(doc2, use_native_converter=True)
        self.assertEqual(len(doc3.validate()), 0)
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp3 = Path(tmpdir) / 'doc3.nt'
            doc3.write(tmp3)
            self.assertFalse(file_diff(str(tmp3), str(TEST_FILES / sbol3_name)))
            # Convert back to SBOL2 and make sure nothing was lost on the way
            doc2_loop = convert3to2(doc3, True)
            # report = doc2_loop.validate()
            # self.assertEqual(len(report), 0, f'Validation failed: {report}')
            tmp2 = Path(tmpdir) / 'doc2_loop.xml'
            doc2_loop.write(tmp2)
            self.assertFalse(file_diff(str(tmp2), str(TEST_FILES / sbol2_name)))

    def test_3to2_conversion(self):
        """Test ability to convert a simple part from SBOL3 to SBOL2"""
        self._check_3to2_round_trip('BBa_J23101_patched.nt', 'BBa_J23101.xml')

    def test_2to3_conversion(self):
        """Test ability to convert a simple part from SBOL2 to SBOL3"""
        self._check_2to3_round_trip('BBa_J23101.xml', 'BBa_J23101_patched.nt')

    def test_3to2_implementation_conversion(self):
        """Test ability to convert an implementation from SBOL3 to SBOL2"""
        self._check_3to2_round_trip('sbol3_implementation.nt', 'sbol_3to2_implementation.xml')

    def test_2to3_implementation_conversion(self):
        """Test ability to convert an implementation from SBOL2 to SBOL3"""
        self._check_2to3_round_trip('sbol_3to2_implementation.xml', 'sbol3_implementation.nt')

    def test_3to2_collection_conversion(self):
        """Test ability to convert a collection from SBOL3 to SBOL2"""
        self._check_3to2_round_trip('sbol3_collection.nt', 'sbol_3to2_collection.xml')

    def test_2to3_collection_conversion(self):
        """Test ability to convert a collection from SBOL2 to SBOL3"""
        self._check_2to3_round_trip('sbol_3to2_collection.xml', 'sbol3_collection.nt')


if __name__ == '__main__':
    unittest.main()