diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index b0a0318f..6cf55889 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -12,23 +12,25 @@ on:
jobs:
build:
- runs-on: ubuntu-latest
+ env:
+ IDT_CREDENTIALS: ${{ secrets.IDT_CREDENTIALS }}
+ runs-on: ${{ matrix.os }}
strategy:
matrix:
# Default builds are on Ubuntu
os: [ubuntu-latest]
- python-version: ['3.7', '3.8', '3.9', '3.10']
+ python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']
include:
# Also test on macOS and Windows using the latest Python 3
- os: macos-latest
- python-version: 3.x
- - os: windows-latest
- python-version: 3.x
+ python-version: 3.11 # Return to 3.x after resolution of https://github.com/RDFLib/pySHACL/issues/212
+ - os: windows-2019
+ python-version: 3.11 # Return to 3.x after resolution of https://github.com/RDFLib/pySHACL/issues/212
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@@ -37,11 +39,16 @@ jobs:
python -m pip install pytest
python -m pip install interrogate
- name: Setup Graphviz
- uses: ts-graphviz/setup-graphviz@v1
+ uses: ts-graphviz/setup-graphviz@v2
+ with:
+ # Skip running of sometimes problematic brew update command on macOS.
+ # Remove after resolution of https://github.com/ts-graphviz/setup-graphviz/issues/593
+ macos-skip-brew-update: 'true' # default false
- name: Show Node.js version
run: |
node --version
- name: Test with pytest
run: |
pip install .
+ echo "$IDT_CREDENTIALS" > test_secret_idt_credentials.json
pytest --ignore=test/test_docstr_coverage.py -s
diff --git a/.gitignore b/.gitignore
index 563f9abf..16ddcf6e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,9 @@
test/__pycache__/
sbol_utilities/__pycache__/
__pycache__/
+
+# test secrets
+test_secret*
+
+.idea/
+*.egg-info/
diff --git a/README.md b/README.md
index b32f9dd3..798ad77c 100644
--- a/README.md
+++ b/README.md
@@ -57,9 +57,9 @@ The `excel-to-sbol` utility reads an Excel file specifying a library of basic an
The `sbol-converter` utility converts between any of the SBOL3, SBOL2, GenBank, and FASTA formats.
Additional "macro" utilities convert specifically between SBOL3 and one of the other formats:
-- `sbol2fasta` and `fasta2sbol` convert from SBOL3 to FASTA and vice versa
-- `sbol2genbank` and `genbank2sbol` convert from SBOL3 to GenBank and vice versa
-- `sbol3to2` and `sbol2to3` convert to and from SBOL2
+- `sbol-to-fasta` and `fasta-to-sbol` convert from SBOL3 to FASTA and vice versa
+- `sbol-to-genbank` and `genbank-to-sbol` convert from SBOL3 to GenBank and vice versa
+- `sbol3-to-sbol2` and `sbol2-to-sbol3` convert to and from SBOL2
### Expand the combinatorial derivations in an SBOL file
@@ -69,6 +69,14 @@ The `sbol-expand-derivations` utility searches through an SBOL file for Combinat
The `sbol-calculate-sequences` utility attempts to calculate the sequence of any DNA Component that can be fully specified from the sequences of its sub-components.
+### Calculate sequence synthesis complexity for DNA sequences in an SBOL file
+
+The `sbol-calculate-complexity` utility attempts to calculate the synthesis complexity of any DNA sequence in the file, by sending sequences to be evaluated by IDT's sequence calculator service. Sequences whose complexity is known are not re-calculated.
+
+The system uses the gBlock API, which is intended for sequences from 125 to 3000 bp in length. If a sequence is longer than 3000 bp or shorter than 125 bp, its returned score will be 0. A complexity score below 10 means the sequence is synthesizable; a score greater than or equal to 10 means it is not synthesizable.
+
+Note that use of this utility requires an account with IDT that is set up to use IDT's online service API (see: https://www.idtdna.com/pages/tools/apidoc)
+
### Compute the difference between two SBOL3 documents
The `sbol-diff` utility computes the difference between two SBOL3 documents
and reports the differences.
@@ -77,4 +85,4 @@ and reports the differences.
## Contributing
We welcome contributions that patch bugs, improve existing utilities or documentation, or add new utilities!
-For guidance on how to contribute effectively to this project, see [CONTRIBUTING.md](CONTRIBUTING.md).
\ No newline at end of file
+For guidance on how to contribute effectively to this project, see [CONTRIBUTING.md](CONTRIBUTING.md).
diff --git a/sbol_utilities/calculate_complexity_scores.py b/sbol_utilities/calculate_complexity_scores.py
new file mode 100644
index 00000000..c94749ca
--- /dev/null
+++ b/sbol_utilities/calculate_complexity_scores.py
@@ -0,0 +1,250 @@
+from __future__ import annotations
+
+import json
+
+from typing import Optional
+
+import datetime
+import argparse
+import logging
+import uuid
+from requests import post
+from requests.auth import HTTPBasicAuth
+
+import sbol3
+import tyto
+
+from sbol_utilities.workarounds import type_to_standard_extension
+
+COMPLEXITY_SCORE_NAMESPACE = 'http://igem.org/IDT_complexity_score'  # prefix for identities of report Activities
+REPORT_ACTIVITY_TYPE = 'https://github.com/SynBioDex/SBOL-utilities/compute-sequence-complexity'  # Activity type
+
+
+class IDTAccountAccessor:
+ """Class that wraps access to the IDT API"""
+
+ _TOKEN_URL = 'https://www.idtdna.com/Identityserver/connect/token'
+ """API URL for obtaining session tokens"""
+ _SCORE_URL = 'https://www.idtdna.com/api/complexities/screengBlockSequences'
+ """APR URL for obtaining sequence scores"""
+ _BLOCK_SIZE = 1 # TODO: determine if it is possible to run multiple sequences in a single query
+ SCORE_TIMEOUT = 120
+ """Number of seconds to wait for score query requests to complete"""
+
+ def __init__(self, username: str, password: str, client_id: str, client_secret: str):
+ """Initialize with required access information for IDT API (see: https://www.idtdna.com/pages/tools/apidoc)
+ Automatically logs in and obtains a session token
+
+ :param username: Username of your IDT account
+ :param password: Password of your IDT account
+ :param client_id: ClientID key of your IDT account
+ :param client_secret: ClientSecret key of your IDT account
+ """
+ self.username = username
+ self.password = password
+ self.client_id = client_id
+ self.client_secret = client_secret
+ self.token = self._get_idt_access_token()
+
+ @staticmethod
+ def from_json(json_object) -> IDTAccountAccessor:
+ """Initialize IDT account accessor from a JSON object with field values
+
+ :param json_object: object with account information
+ :return: Account accessor object
+ """
+ return IDTAccountAccessor(username=json_object['username'], password=json_object['password'],
+ client_id=json_object['ClientID'], client_secret=json_object['ClientSecret'])
+
+ def _get_idt_access_token(self) -> str:
+ """Get access token for IDT API (see: https://www.idtdna.com/pages/tools/apidoc)
+
+ :return: access token string
+ """
+ logging.info('Connecting to IDT API')
+ data = {'grant_type': 'password', 'username': self.username, 'password': self.password, 'scope': 'test'}
+ auth = HTTPBasicAuth(self.client_id, self.client_secret)
+ result = post(IDTAccountAccessor._TOKEN_URL, data, auth=auth, timeout=IDTAccountAccessor.SCORE_TIMEOUT)
+
+ if 'access_token' in result.json():
+ return result.json()['access_token']
+ else:
+ raise ValueError('Access token for IDT API could not be generated. Check your credentials.')
+
+ def get_sequence_scores(self, sequences: list[sbol3.Sequence]) -> list:
+ """Retrieve synthesis complexity scores of sequences from the IDT API
+ This system uses the gBlock API, which is intended for sequences from 125 to 3000 bp in length. If it is more
+ than 3000 bp or less than 125 bp your returned score will be 0. A complexity score in the range from 0 to 10 means
+ your sequence is synthesizable, if the score is greater or equal than 10 means it is not synthesizable.
+
+ :param sequences: sequences for which we want to calculate the complexity score
+ :return: dictionary mapping sequences to complexity Scores
+ :return: List of lists of dictionaries with information about sequence synthesis features
+ """
+ # Set up list of query dictionaries
+ seq_dict = [{'Name': str(seq.display_name), 'Sequence': str(seq.elements)} for seq in sequences]
+ # Break into query blocks
+ partitions_sequences = [seq_dict[x:x + 1] for x in range(0, len(seq_dict), IDTAccountAccessor._BLOCK_SIZE)]
+ # Send each query to IDT and collect results
+ results = []
+ for idx, partition in enumerate(partitions_sequences):
+ logging.debug('Sequence score request %i of %i', idx+1, len(partitions_sequences))
+ resp = post(IDTAccountAccessor._SCORE_URL, json=partition, timeout=IDTAccountAccessor.SCORE_TIMEOUT,
+ headers={'Authorization': 'Bearer {}'.format(self.token),
+ 'Content-Type': 'application/json; charset=utf-8'})
+ response_list = resp.json()
+ if len(response_list) != len(partition):
+ raise ValueError(f'Unexpected complexity score: expected {len(partition)} scores, '
+ f'but got {len(response_list)}')
+ results.append(resp.json())
+ logging.info('Requests to IDT API finished.')
+ return results
+
+ def get_sequence_complexity(self, sequences: list[sbol3.Sequence]) -> dict[sbol3.Sequence, float]:
+ """ Extract complexity scores from IDT API for a list of SBOL Sequence objects
+ This works by computing full sequence evaluations, then compressing down to a single score for each sequence.
+
+ :param sequences: list of SBOL Sequences to evaluate
+ :return: dictionary mapping sequences to complexity Scores
+ """
+ # Retrieve full evaluations for sequences
+ scores = self.get_sequence_scores(sequences)
+ # Compute total score for each sequence as the sum all complexity scores for the sequence
+ score_list = []
+ for score_set in scores:
+ for sequence_scores in score_set:
+ complexity_score = sum(score.get('Score') for score in sequence_scores)
+ score_list.append(complexity_score)
+ # Associate each sequence to its score
+ return dict(zip(sequences, score_list))
+
+
+def get_complexity_score(seq: sbol3.Sequence) -> Optional[float]:
+ """Given a sequence, return its previously computed complexity score, if such exists
+
+ :param seq: SBOL Sequence object to check for score
+ :return: score if set, None if not
+ """
+ scores = [score for score in seq.measures if tyto.EDAM.sequence_complexity_report in score.types]
+ if scores:
+ if len(scores) > 1:
+ raise ValueError(f'Found multiple complexity scores on Sequence {seq.identity}')
+ return scores[0].value
+ else:
+ return None
+
+
+def get_complexity_scores(sequences: list[sbol3.Sequence], include_missing=False) -> \
+ dict[sbol3.Sequence, Optional[float]]:
+ """Retrieve complexity scores for a list of sequences
+
+ :param sequences: Sequences to get scores for
+ :param include_missing: if true, Sequences without scores are included, mapping to none
+ :return: dictionary mapping Sequence to score
+ """
+ # TODO: change to run computations only on DNA sequences
+ score_map = {seq: get_complexity_score(seq) for seq in sequences}
+ if not include_missing:
+ score_map = {k: v for k, v in score_map.items() if v is not None}
+ return score_map
+
+
+def idt_calculate_sequence_complexity_scores(accessor: IDTAccountAccessor, sequences: list[sbol3.Sequence]) -> \
+ dict[sbol3.Sequence, float]:
+ """Given a list of sequences, compute the complexity scores for any sequences not currently scored
+ by sending the sequences to IDT's online service for calculating sequence synthesis complexity.
+ Also records the complexity computation with an activity
+
+ :param accessor: IDT API access object
+ :param sequences: list of SBOL Sequences to evaluate
+ :return: Dictionary mapping Sequences to complexity scores for newly computed sequences
+ """
+ # Determine which sequences need scores
+ need_scores = [seq for seq, score in get_complexity_scores(sequences, include_missing=True).items()
+ if score is None]
+ if not need_scores:
+ return dict()
+
+ # Query for the scores of the sequences
+ score_dictionary = accessor.get_sequence_complexity(need_scores)
+
+ # Create report generation activity
+ doc = need_scores[0].document
+ timestamp = datetime.datetime.utcnow().isoformat(timespec='seconds') + 'Z'
+ report_id = f'{COMPLEXITY_SCORE_NAMESPACE}/Complexity_Report_{timestamp.replace(":", "").replace("-", "")}_' \
+ f'{str(uuid.uuid4())[0:8]}'
+ report_generation = sbol3.Activity(report_id, end_time=timestamp, types=[REPORT_ACTIVITY_TYPE])
+ doc.add(report_generation)
+
+ # Mark the sequences with their scores, where each score is a dimensionless measure
+ for sequence, score in score_dictionary.items():
+ measure = sbol3.Measure(score, unit=tyto.OM.number_unit, types=[tyto.EDAM.sequence_complexity_report])
+ measure.generated_by.append(report_generation)
+ sequence.measures.append(measure)
+ # return the dictionary of newly computed scores
+ return score_dictionary
+
+
+def idt_calculate_complexity_scores(accessor: IDTAccountAccessor, doc: sbol3.Document) -> dict[sbol3.Sequence, float]:
+ """Given an SBOL Document, compute the complexity scores for any sequences in the Document not currently scored
+ by sending the sequences to IDT's online service for calculating sequence synthesis complexity.
+ Also records the complexity computation with an activity
+
+ :param accessor: IDT API access object
+ :param doc: SBOL document with sequences of interest in it
+ :return: Dictionary mapping Sequences to complexity scores
+ """
+ sequences = [obj for obj in doc if isinstance(obj, sbol3.Sequence)]
+ return idt_calculate_sequence_complexity_scores(accessor, sequences)
+
+
+def main():
+ """
+ Main wrapper: read from input file, invoke idt_calculate_complexity_scores, then write to output file
+ """
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-c', '--credentials',
+ help="""JSON file containing IDT API access credentials.
+To obtain access credentials, follow the directions at https://www.idtdna.com/pages/tools/apidoc
+The values of the IDT access credentials should be stored in a JSON of the following form:
+{ "username": "username", "password": "password", "ClientID": "####", "ClientSecret": "XXXXXXXXXXXXXXXXXXX" }"
+""")
+ parser.add_argument('--username', help="Username of your IDT account (if not using JSON credentials)")
+ parser.add_argument('--password', help="Password of your IDT account (if not using JSON credentials)")
+ parser.add_argument('--ClientID', help="ClientID of your IDT account (if not using JSON credentials)")
+ parser.add_argument('--ClientSecret', help="ClientSecret of your IDT account (if not using JSON credentials)")
+ parser.add_argument('input_file', help="Absolute path to sbol file with sequences")
+ parser.add_argument('output_name', help="Name of SBOL file to be written")
+ parser.add_argument('-t', '--file-type', dest='file_type', default=sbol3.SORTED_NTRIPLES,
+ help="Name of SBOL file to output to (excluding type)")
+ parser.add_argument('--verbose', '-v', dest='verbose', action='count', default=0)
+ args_dict = vars(parser.parse_args())
+
+ # Extract arguments:
+ verbosity = args_dict['verbose']
+ logging.getLogger().setLevel(level=(logging.WARN if verbosity == 0 else
+ logging.INFO if verbosity == 1 else logging.DEBUG))
+ input_file = args_dict['input_file']
+ output_name = args_dict['output_name']
+
+ if args_dict['credentials'] != None:
+ with open(args_dict['credentials']) as credentials:
+ idt_accessor = IDTAccountAccessor.from_json(json.load(credentials))
+ else:
+ idt_accessor = IDTAccountAccessor(args_dict['username'], args_dict['password'], args_dict['ClientID'],
+ args_dict['ClientSecret'])
+
+ extension = type_to_standard_extension[args_dict['file_type']]
+ outfile_name = output_name if output_name.endswith(extension) else output_name + extension
+
+ # Read file, convert, and write resulting document
+ logging.info('Reading SBOL file ' + input_file)
+ doc = sbol3.Document()
+ doc.read(input_file)
+ results = idt_calculate_complexity_scores(idt_accessor, doc)
+ doc.write(outfile_name, args_dict['file_type'])
+ logging.info('SBOL file written to %s with %i new scores calculated', outfile_name, len(results))
+
+
+if __name__ == '__main__':  # only run the command-line tool when executed as a script
+    main()
diff --git a/sbol_utilities/conversion.py b/sbol_utilities/conversion.py
index e19141ce..225c55b7 100644
--- a/sbol_utilities/conversion.py
+++ b/sbol_utilities/conversion.py
@@ -15,6 +15,8 @@
from sbol_utilities.helper_functions import strip_sbol2_version, GENETIC_DESIGN_FILE_TYPES, \
find_top_level
+from sbol_utilities.sbol3_genbank_conversion import GenBankSBOL3Converter
+import sbol_utilities.sbol3_sbol2_conversion
from sbol_utilities.workarounds import id_sort
# sbol javascript executable based on https://github.com/sboltools/sbolgraph
@@ -68,13 +70,18 @@ def convert_identities2to3(sbol3_data: str) -> str:
return g.serialize(format="xml")
-def convert2to3(sbol2_doc: Union[str, sbol2.Document], namespaces=None) -> sbol3.Document:
+def convert2to3(sbol2_doc: Union[str, sbol2.Document], namespaces=None, use_native_converter: bool = False) \
+ -> sbol3.Document:
"""Convert an SBOL2 document to an equivalent SBOL3 document
:param sbol2_doc: Document to convert
:param namespaces: list of URI prefixes to treat as namespaces
+ :param use_native_converter: if true, use experimental Python converter instead of JavaScript call-out
:return: equivalent SBOL3 document
"""
+ if use_native_converter:
+ return sbol_utilities.sbol3_sbol2_conversion.convert2to3(sbol2_doc, namespaces)
+
# if we've started with a Document in memory, write it to a temp file
if namespaces is None:
namespaces = []
@@ -166,12 +173,16 @@ def change_orientation(o):
return doc
-def convert3to2(doc3: sbol3.Document) -> sbol2.Document:
+def convert3to2(doc3: sbol3.Document, use_native_converter: bool = False) -> sbol2.Document:
"""Convert an SBOL3 document to an equivalent SBOL2 document
:param doc3: Document to convert
+ :param use_native_converter: if true, use experimental Python converter instead of JavaScript call-out
:return: equivalent SBOL2 document
"""
+ if use_native_converter:
+ return sbol_utilities.sbol3_sbol2_conversion.convert3to2(doc3)
+
# TODO: remove workarounds after conversion errors fixed in https://github.com/sboltools/sbolgraph/issues/16
# remap sequence encodings:
encoding_remapping = {
@@ -293,7 +304,7 @@ def convert_from_fasta(path: str, namespace: str, identity_map: Dict[str, str] =
# TODO: Figure out how to support multiple namespaces like we do for FASTA: currently, importing from multiple
# namespaces will not work correctly
-def convert_from_genbank(path: str, namespace: str, allow_genbank_online: bool = False) -> sbol3.Document:
+def convert_from_genbank(path: str, namespace: str, allow_genbank_online: bool = False, force_new_converter: bool = False) -> sbol3.Document:
"""Convert a GenBank document on disk into an SBOL3 document
Specifically, the GenBank document is first imported to SBOL2, then converted from SBOL2 to SBOL3
@@ -302,6 +313,9 @@ def convert_from_genbank(path: str, namespace: str, allow_genbank_online: bool =
:param allow_genbank_online: Use the online converter, rather than the local converter
:return: SBOL3 document containing converted materials
"""
+ if force_new_converter:
+ converter = GenBankSBOL3Converter()
+ return converter.convert_genbank_to_sbol3(gb_file=path, namespace=namespace, write=False)
doc2 = sbol2.Document()
sbol2.setHomespace(namespace)
# Convert document offline
@@ -316,7 +330,7 @@ def convert_from_genbank(path: str, namespace: str, allow_genbank_online: bool =
return doc
-def convert_to_genbank(doc3: sbol3.Document, path: str, allow_genbank_online: bool = False) \
+def convert_to_genbank(doc3: sbol3.Document, path: str, allow_genbank_online: bool = False, force_new_converter: bool = False) \
-> List[SeqRecord.SeqRecord]:
"""Convert an SBOL3 document to a GenBank file, which is written to disk
Note that for compatibility with version control software, if no prov:modified term is available on each Component,
@@ -327,6 +341,10 @@ def convert_to_genbank(doc3: sbol3.Document, path: str, allow_genbank_online: bo
:param allow_genbank_online: use the online converter rather than the local converter
:return: BioPython SeqRecord of the GenBank that was written
"""
+ if force_new_converter:
+ converter = GenBankSBOL3Converter()
+ result_dict = converter.convert_sbol3_to_genbank(sbol3_file=None, doc=doc3, gb_file=path, write=True)
+ return result_dict["seqrecords"]
# first convert to SBOL2, then export to a temp GenBank file
doc2 = convert3to2(doc3)
@@ -395,7 +413,7 @@ def command_line_converter(args_dict: Dict[str, Any]):
if input_file_type == 'FASTA':
doc3 = convert_from_fasta(input_file, namespace)
elif input_file_type == 'GenBank':
- doc3 = convert_from_genbank(input_file, namespace, args_dict['allow_genbank_online'])
+ doc3 = convert_from_genbank(input_file, namespace, args_dict['allow_genbank_online'], args_dict['force_new_converter'])
elif input_file_type == 'SBOL2':
doc2 = sbol2.Document()
doc2.read(input_file)
@@ -411,7 +429,7 @@ def command_line_converter(args_dict: Dict[str, Any]):
if output_file_type == 'FASTA':
convert_to_fasta(doc3, output_file)
elif output_file_type == 'GenBank':
- convert_to_genbank(doc3, output_file, args_dict['allow_genbank_online'])
+ convert_to_genbank(doc3, output_file, args_dict['allow_genbank_online'], args_dict['force_new_converter'])
elif output_file_type == 'SBOL2':
doc2 = convert3to2(doc3)
validate_online = sbol2.Config.getOption(sbol2.ConfigOptions.VALIDATE_ONLINE)
@@ -440,6 +458,8 @@ def main():
help="Print running explanation of conversion process")
parser.add_argument('--allow-genbank-online', dest='allow_genbank_online', action='store_true', default=False,
help='Perform GenBank conversion using online converter')
+ parser.add_argument('--force-new-converter', dest='force_new_converter', action='store_true', default=False,
+ help='Force the usage of new (offline) converter instead of legacy (online) converter.')
args_dict = vars(parser.parse_args())
# Call the shared command-line conversion routine
command_line_converter(args_dict)
@@ -474,6 +494,8 @@ def genbank2sbol():
help='Print running explanation of conversion process')
parser.add_argument('--allow-genbank-online', dest='allow_genbank_online', action='store_true', default=False,
help='Perform GenBank conversion using online converter')
+ parser.add_argument('--force-new-converter', dest='force_new_converter', action='store_true', default=False,
+ help='Force the usage of new (offline) converter instead of legacy (online) converter.')
args_dict = vars(parser.parse_args())
args_dict['input_file_type'] = 'GenBank'
args_dict['output_file_type'] = 'SBOL3'
@@ -524,6 +546,8 @@ def sbol2genbank():
help="Print running explanation of conversion process")
parser.add_argument('--allow-genbank-online', dest='allow_genbank_online', action='store_true', default=False,
help='Perform GenBank conversion using online converter')
+ parser.add_argument('--force-new-converter', dest='force_new_converter', action='store_true', default=False,
+ help='Force the usage of new (offline) converter instead of legacy (online) converter.')
args_dict = vars(parser.parse_args())
args_dict['input_file_type'] = 'SBOL3'
args_dict['output_file_type'] = 'GenBank'
diff --git a/sbol_utilities/gb2so.csv b/sbol_utilities/gb2so.csv
new file mode 100644
index 00000000..764feb03
--- /dev/null
+++ b/sbol_utilities/gb2so.csv
@@ -0,0 +1,81 @@
+GenBank_Ontology,SO_Ontology
+allele,SO:0001023
+attenuator,SO:0000140
+C_region,SO:0001834
+CAAT_signal,SO:0000172
+CDS,SO:0000316
+D-loop,SO:0000297
+D_segment,SO:0000458
+enhancer,SO:0000165
+exon,SO:0000147
+gene,SO:0000704
+GC_signal,SO:0000173
+iDNA,SO:0000723
+intron,SO:0000188
+J_region,SO:0000470
+LTR,SO:0000286
+mat_peptide,SO:0000419
+misc_binding,SO:0000409
+misc_difference,SO:0000413
+misc_feature,SO:0000001
+misc_marker,SO:0001645
+misc_recom,
+misc_RNA,SO:0000233
+misc_signal,SO:0001411
+misc_structure,SO:0000002
+modified_base,SO:0000305
+mRNA,SO:0000234
+N_region,SO:0001835
+polyA_signal,SO:0000551
+polyA_site,SO:0000553
+precursor_RNA,SO:0000185
+prim_transcript,SO:0000185
+primer,SO:0000112
+primer_bind,SO:0005850
+promoter,SO:0000167
+protein_bind,SO:0000410
+RBS,SO:0000139
+rep_origin,SO:0000296
+repeat_region,SO:0000657
+repeat_unit,SO:0000726
+rRNA,SO:0000252
+S_region,SO:0001836
+satellite,SO:0000005
+scRNA,SO:0000013
+sig_peptide,SO:0000418
+snRNA,SO:0000274
+source,SO:0000149
+stem_loop,SO:0000313
+STS,SO:0000331
+TATA_signal,SO:0000174
+terminator,SO:0000141
+transit_peptide,SO:0000725
+transposon,SO:0001054
+tRNA,SO:0000253
+V_region,SO:0001833
+variation,SO:0001060
+-10_signal,SO:0000175
+-35_signal,SO:0000176
+3'clip,SO:0000557
+3'UTR,SO:0000205
+5'clip,SO:0000555
+5'UTR,SO:0000204
+regulatory,SO:0005836
+snoRNA,SO:0000275
+assembly_gap,SO:0000730
+gap,SO:0000730
+centromere,SO:0000577
+J_segment,SO:0000470
+J_gene_segment,SO:0000470
+mobile_element,SO:0001037
+mobile_genetic_element,SO:0001037
+ncRNA,SO:0000655
+operon,SO:0000178
+oriT,SO:0000724
+propeptide,SO:0001062
+telomere,SO:0000624
+tmRNA,SO:0000584
+unsure,SO:0001086
+sequence_uncertainty,SO:0001086
+V_segment,SO:0000466
+V_gene_segment,SO:0000466
diff --git a/sbol_utilities/sbol3_genbank_conversion.py b/sbol_utilities/sbol3_genbank_conversion.py
new file mode 100644
index 00000000..c7182755
--- /dev/null
+++ b/sbol_utilities/sbol3_genbank_conversion.py
@@ -0,0 +1,858 @@
+import os
+import csv
+import math
+import sbol3
+import logging
+from collections import OrderedDict
+
+import tyto
+from typing import Dict, List, Sequence, Union, Optional
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import SeqFeature, FeatureLocation, Reference, \
+ CompoundLocation, BeforePosition, ExactPosition, AfterPosition
+
+from sbol_utilities.workarounds import tyto_normalize_term
+
+
+class GenBankSBOL3Converter:
+ """Main converter class handling offline, direct conversion of SBOL3 files to and from GenBank files"""
+ # class-level dictionaries storing feature-term lookups between the GenBank and SO ontologies
+ gb2so_map = {}
+ so2gb_map = {}
+ # Conversion constants:
+ # TODO: Temporarily assuming only dna components to be dealt with in genbank files
+ COMP_TYPES = [sbol3.SBO_DNA]
+ # TODO: Temporarily assuming components to only have the engineered_region role
+ COMP_ROLES = [sbol3.SO_ENGINEERED_REGION]
+ # TODO: Temporarily encoding sequence objects in IUPAC mode only
+ SEQUENCE_ENCODING = sbol3.IUPAC_DNA_ENCODING
+ # BIO_STRAND constants, which serve as the GenBank counterparts to SBOL3's inline and reverse orientations
+ BIO_STRAND_FORWARD = 1
+ BIO_STRAND_REVERSE = -1
+ # Two-way mapping between int codes and GenBank's location position classes (Before/Exact/After)
+ SBOL_LOCATION_POSITION = {BeforePosition: 0, ExactPosition: 1, AfterPosition: 2}
+ GENBANK_LOCATION_POSITION = {0: BeforePosition, 1: ExactPosition, 2: AfterPosition}
+ # Default value for the "sequence_version" annotation in GenBank files
+ DEFAULT_GB_SEQ_VERSION = 1
+ # Default terms for SBOL3 and GenBank in case the feature lookup from
+ # respective dictionaries does not yield any ontology term
+ DEFAULT_SO_TERM = "SO:0000110"
+ DEFAULT_GB_TERM = "misc_feature"
+ # Namespace to be used by default if not provided, and also for all unit tests related to this converter
+ TEST_NAMESPACE = "https://test.sbol3.genbank/"
+ # File locations for required CSV data files which store the ontology term
+ # translations between GenBank and SO ontologies (resolved next to this module)
+ GB2SO_MAPPINGS_CSV = os.path.join(os.path.dirname(os.path.realpath(__file__)), "gb2so.csv")
+ SO2GB_MAPPINGS_CSV = os.path.join(os.path.dirname(os.path.realpath(__file__)), "so2gb.csv")
+
+ def __init__(self) -> None:
+ """Define one builder function per extension/custom class and register each of them
+ with sbol3.Document, so that the SBOL3 parser can build these modified or new class objects
+ """
+ def build_component_genbank_extension(*, identity, type_uri) -> GenBankSBOL3Converter.ComponentGenBankExtension:
+ """A builder function to be called by the SBOL3 parser
+ when it encounters a Component in an SBOL file.
+ :param identity: identity for new component class instance to have
+ :param type_uri: type_uri for new component class instance to have
+ """
+ # `types` is required and not known at build time.
+ # Supply a missing value to the constructor, then clear
+ # the missing value before returning the built object.
+ obj = self.ComponentGenBankExtension(identity=identity, types=[sbol3.PYSBOL3_MISSING], type_uri=type_uri)
+ # Remove the placeholder value
+ obj.clear_property(sbol3.SBOL_TYPE)
+ return obj
+
+ def build_feature_qualifiers_extension(*, identity, type_uri) -> GenBankSBOL3Converter.FeatureGenBankExtension:
+ """A builder function to be called by the SBOL3 parser
+ when it encounters a SequenceFeature in an SBOL file.
+ :param identity: identity for new feature qualifier class instance to have
+ :param type_uri: type_uri for new feature qualifier class instance to have
+ """
+ # Unlike the Component builder, no placeholder `types` value is passed here:
+ # the feature is built from identity and type_uri only (locations default to
+ # an empty list inside FeatureGenBankExtension).
+ obj = self.FeatureGenBankExtension(identity=identity, type_uri=type_uri)
+ # Clear the SBOL_TYPE property on the freshly built object before returning it
+ obj.clear_property(sbol3.SBOL_TYPE)
+ return obj
+
+ def build_location_extension(*, identity, type_uri) -> GenBankSBOL3Converter.LocationGenBankExtension:
+ """A builder function to be called by the SBOL3 parser
+ when it encounters a Custom location in an SBOL file.
+ :param identity: identity for new Location class instance to have
+ :param type_uri: type_uri for new Location class instance to have
+ """
+ # No placeholder value is supplied here either: only identity and type_uri
+ # are passed, so LocationGenBankExtension falls back to its default
+ # `sequence` argument.
+ obj = self.LocationGenBankExtension(identity=identity, type_uri=type_uri)
+ # SBOL_TYPE is not cleared for locations; the call is left commented out:
+ # obj.clear_property(sbol3.SBOL_TYPE)
+ return obj
+
+ def build_custom_reference_property(*, identity, type_uri) -> GenBankSBOL3Converter.CustomReferenceProperty:
+ """A builder function to be called by the SBOL3 parser
+ when it encounters a CustomReferenceProperty Toplevel object in an SBOL file.
+ :param identity: identity for custom reference property instance to have
+ :param type_uri: type_uri for custom reference property instance to have
+ """
+ obj = self.CustomReferenceProperty(identity=identity, type_uri=type_uri)
+ return obj
+
+ def build_custom_structured_comment_property(
+ *, identity, type_uri) -> GenBankSBOL3Converter.CustomStructuredCommentProperty:
+ """A builder function to be called by the SBOL3 parser
+ when it encounters a CustomStructuredCommentProperty Toplevel object in an SBOL file.
+ :param identity: identity for custom comment property instance to have
+ :param type_uri: type_uri for custom comment property instance to have
+ """
+ obj = self.CustomStructuredCommentProperty(identity=identity, type_uri=type_uri)
+ return obj
+
+ # Register the builder function so SBOL3 parser can build objects with a Component type URI
+ sbol3.Document.register_builder(sbol3.SBOL_COMPONENT, build_component_genbank_extension)
+ # Register the builder function for custom reference properties
+ sbol3.Document.register_builder(self.CustomReferenceProperty.CUSTOM_REFERENCE_NS,
+ build_custom_reference_property)
+ # Register the builder function for custom structured comment properties
+ sbol3.Document.register_builder(
+ self.CustomStructuredCommentProperty.CUSTOM_STRUCTURED_COMMENT_NS,
+ build_custom_structured_comment_property)
+ # Register the builder function so SBOL3 parser can build objects with a SequenceFeature type URI
+ sbol3.Document.register_builder(sbol3.SBOL_SEQUENCE_FEATURE, build_feature_qualifiers_extension)
+ # Register the builder function so SBOL3 parser can build objects with the custom GenBank location type URI
+ sbol3.Document.register_builder(self.LocationGenBankExtension.GENBANK_RANGE_NS, build_location_extension)
+
+ class CustomReferenceProperty(sbol3.CustomIdentified):
+ """Stores the information and annotations of a GenBank 'Reference' object in SBOL3
+ while parsing, so that it may be retrieved back in a round trip
+ :extends: sbol3.CustomIdentified class
+ """
+ CUSTOM_REFERENCE_NS = "http://www.ncbi.nlm.nih.gov/genbank#GenbankReference"
+
+ def __init__(self, type_uri=CUSTOM_REFERENCE_NS, identity=None):
+ super().__init__(identity=identity, type_uri=type_uri)
+ self.authors = sbol3.TextProperty(self, f"{self.CUSTOM_REFERENCE_NS}#authors", 0, 1)
+ self.comment = sbol3.TextProperty(self, f"{self.CUSTOM_REFERENCE_NS}#comment", 0, 1)
+ self.journal = sbol3.TextProperty(self, f"{self.CUSTOM_REFERENCE_NS}#journal", 0, 1)
+ self.consrtm = sbol3.TextProperty(self, f"{self.CUSTOM_REFERENCE_NS}#consrtm", 0, 1)
+ self.title = sbol3.TextProperty(self, f"{self.CUSTOM_REFERENCE_NS}#title", 0, 1)
+ self.medline_id = sbol3.TextProperty(self, f"{self.CUSTOM_REFERENCE_NS}#medline_id", 0, 1)
+ self.pubmed_id = sbol3.TextProperty(self, f"{self.CUSTOM_REFERENCE_NS}#pubmed_id", 0, 1)
+ # stores the display id of parent component for a particular CustomReferenceProperty object
+ self.component = sbol3.TextProperty(self, f"{self.CUSTOM_REFERENCE_NS}#component", 0, 1)
+ # TODO: support cut locations? (the type_constraint below only names sbol3.Range)
+ # there can be multiple locations described for a reference, thus the upper
+ # bound is math.inf (needs to be > 1 in order to use ListProperty)
+ self.location = sbol3.OwnedObject(
+ self, f"{self.CUSTOM_REFERENCE_NS}#location", 0, math.inf, type_constraint=sbol3.Range)
+
+ class CustomStructuredCommentProperty(sbol3.CustomIdentified):
+ """Stores the information and annotations of a GenBank 'Structured_Comment' object in SBOL3
+ while parsing, so that it may be retrieved back in a round trip
+ Complete reference available at: https://www.ncbi.nlm.nih.gov/genbank/structuredcomment/
+ :extends: sbol3.CustomIdentified class
+ """
+ CUSTOM_STRUCTURED_COMMENT_NS = "http://www.ncbi.nlm.nih.gov/genbank#GenbankStructuredComment"
+
+ def __init__(self, type_uri=CUSTOM_STRUCTURED_COMMENT_NS, identity=None):
+ super().__init__(identity=identity, type_uri=type_uri)
+ self.heading = sbol3.TextProperty(self, f"{self.CUSTOM_STRUCTURED_COMMENT_NS}#heading", 0, 1)
+ # stores the display id of parent component for a particular CustomStructuredCommentProperty object
+ self.component = sbol3.TextProperty(self, f"{self.CUSTOM_STRUCTURED_COMMENT_NS}#component", 0, 1)
+ # there can be multiple key/values described for a structured_comment, thus the upper
+ # bound is math.inf; keys and values are kept in two parallel list properties
+ self.structured_keys = sbol3.TextProperty(
+ self, f"{self.CUSTOM_STRUCTURED_COMMENT_NS}#structuredKeys", 0, math.inf)
+ self.structured_values = sbol3.TextProperty(
+ self, f"{self.CUSTOM_STRUCTURED_COMMENT_NS}#structuredValues", 0, math.inf)
+
+ class FeatureGenBankExtension(sbol3.SequenceFeature):
+ """Extends the sbol3 SequenceFeature class with fields to directly read and write
+ qualifiers of GenBank features not storable in any SBOL3 data field.
+ :extends: sbol3.SequenceFeature class
+ """
+ GENBANK_FEATURE_QUALIFIER_NS = "http://www.ncbi.nlm.nih.gov/genbank#featureQualifier"
+
+ def __init__(self, locations: List[sbol3.Location] = None, **kwargs) -> None:
+ if locations is None:
+ locations = []  # a fresh list per instance when the caller passes no locations
+ # instantiating sbol3 SequenceFeature object
+ super().__init__(locations=locations, **kwargs)
+ # Qualifier keys and values are stored as two separate, unbounded text properties.
+ self.qualifier_key = sbol3.TextProperty(self, f"{self.GENBANK_FEATURE_QUALIFIER_NS}#key", 0, math.inf)
+ self.qualifier_value = sbol3.TextProperty(self, f"{self.GENBANK_FEATURE_QUALIFIER_NS}#value", 0, math.inf)
+
+ class LocationGenBankExtension(sbol3.Location):
+ """Extends the sbol3 Location class with start/end fields and fields to store the
+ start and end position types (AfterPosition / BeforePosition / ExactPosition).
+ :extends: sbol3.Location class
+ """
+ GENBANK_RANGE_NS = "http://www.ncbi.nlm.nih.gov/genbank#locationPosition"
+
+ def __init__(self, sequence: sbol3.Sequence = sbol3.Sequence("autoCreatedSequence"),  # NOTE(review): default is built once, at definition time
+ *, identity: str = None, type_uri: str = GENBANK_RANGE_NS,
+ **kwargs) -> None:
+ super().__init__(sequence=sequence, identity=identity, type_uri=type_uri, **kwargs)
+ self.start = sbol3.IntProperty(self, f"{self.GENBANK_RANGE_NS}#start", 0, 1)
+ self.end = sbol3.IntProperty(self, f"{self.GENBANK_RANGE_NS}#end", 0, 1)
+ # Int codes for GenBank's start/end position types, which are not settable in any SBOL3 field.
+ self.start_position = sbol3.IntProperty(self, f"{self.GENBANK_RANGE_NS}#start_position", 0, 1)
+ self.end_position = sbol3.IntProperty(self, f"{self.GENBANK_RANGE_NS}#end_position", 0, 1)
+
+ class ComponentGenBankExtension(sbol3.Component):
+ """Extends the sbol3 Component class with fields to directly read and write
+ extraneous properties of GenBank not storable in any SBOL3 data field.
+ :extends: sbol3.Component class
+ """
+ GENBANK_EXTRA_PROPERTY_NS = "http://www.ncbi.nlm.nih.gov/genbank"
+
+ def __init__(self, identity: str, types: Optional[Union[str, Sequence[str]]], **kwargs) -> None:
+ # instantiating sbol3 component object
+ super().__init__(identity=identity, types=types, **kwargs)
+ # Setting properties for GenBank's extraneous properties not settable in any SBOL3 field.
+ self.genbank_seq_version = sbol3.IntProperty(self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#seq_version", 0, 1)
+ self.genbank_name = sbol3.TextProperty(self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#name", 0, 1)
+ self.genbank_date = sbol3.TextProperty(self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#date", 0, 1)
+ self.genbank_division = sbol3.TextProperty(self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#division", 0, 1)
+ self.genbank_locus = sbol3.TextProperty(self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#locus", 0, 1)
+ self.genbank_molecule_type = sbol3.TextProperty(self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#molecule", 0, 1)
+ self.genbank_organism = sbol3.TextProperty(self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#organism", 0, 1)
+ self.genbank_source = sbol3.TextProperty(self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#source", 0, 1)
+ self.genbank_topology = sbol3.TextProperty(self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#topology", 0, 1)
+ self.genbank_gi = sbol3.TextProperty(self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#gi", 0, 1)
+ self.genbank_comment = sbol3.TextProperty(self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#comment", 0, 1)
+ self.genbank_dblink = sbol3.TextProperty(self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#dbxrefs", 0, 1)  # dbxrefs joined with "::"
+ self.genbank_record_id = sbol3.TextProperty(self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#id", 0, 1)
+ # TODO : add note linking issue here
+ self.genbank_taxonomy = sbol3.TextProperty(self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#taxonomy", 0, 1)  # joined with ","
+ self.genbank_keywords = sbol3.TextProperty(self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#keywords", 0, 1)  # joined with ","
+ # there can be multiple accessions, thus upper bound needs to be > 1 in order to use TextListProperty
+ self.genbank_accessions = sbol3.TextProperty(
+ self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#accession", 0, math.inf)
+ self.fuzzy_features = sbol3.OwnedObject(
+ self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#fuzzyFeature", 0, math.inf,
+ type_constraint=sbol3.SequenceFeature)
+ self.genbank_references = sbol3.OwnedObject(
+ self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#reference", 0, math.inf,
+ type_constraint=GenBankSBOL3Converter.CustomReferenceProperty)
+ self.genbank_structured_comments = sbol3.OwnedObject(
+ self, f"{self.GENBANK_EXTRA_PROPERTY_NS}#structuredComment", 0, math.inf,
+ type_constraint=GenBankSBOL3Converter.CustomStructuredCommentProperty)
+
+ def create_gb2so_role_mappings(self, gb2so_csv: str = GB2SO_MAPPINGS_CSV, so2gb_csv: str = SO2GB_MAPPINGS_CSV,
+ convert_gb2so: bool = True, convert_so2gb: bool = True) -> int:
+ """Reads 2 CSV Files containing mappings for converting between GenBank and SequenceOntology (SO) roles
+ :param gb2so_csv: path to read genbank to so conversion csv file
+ :param so2gb_csv: path to read so to genbank conversion csv file
+ :param convert_gb2so: bool stating whether to read csv for genbank to SO mappings
+ :param convert_so2gb: bool stating whether to read csv for SO to genbank mappings
+ :return: int 1 / 0 denoting the status of whether the mappings were created and stored in dictionaries
+ """
+ if convert_gb2so:
+ logging.debug("Parsing %s for GenBank to SO ontology mappings.", gb2so_csv)
+ try:
+ with open(gb2so_csv, mode="r") as csv_file:
+ csv_reader = csv.DictReader(csv_file)
+ for row in csv_reader:
+ self.gb2so_map[row["GenBank_Ontology"]] = row["SO_Ontology"]
+ except FileNotFoundError:
+ logging.error("No GenBank to SO Ontology Mapping CSV File Exists!")
+ return 0
+ if convert_so2gb:
+ logging.debug("Parsing %s for SO to GenBank ontology mappings.", so2gb_csv)
+ try:
+ with open(so2gb_csv, mode="r") as csv_file:
+ csv_reader = csv.DictReader(csv_file)
+ for row in csv_reader:
+ self.so2gb_map[row["SO_Ontology"]] = row["GenBank_Ontology"]
+ except FileNotFoundError:
+ logging.error("No SO to Genbank Ontology Mapping CSV File Exists!")
+ return 0
+ return 1
+
+ def convert_genbank_to_sbol3(self, gb_file: str, sbol3_file: str = "sbol3.nt", namespace: str = TEST_NAMESPACE,
+ write: bool = False) -> sbol3.Document:
+ """Convert a GenBank document on disk into an SBOL3 document
+ The GenBank document is parsed using BioPython, and corresponding objects of SBOL3 document are created
+
+ :param gb_file: path to read GenBank file from
+ :param sbol3_file: path to write SBOL3 file to, if write set to true
+ :param namespace: URIs of Components will be set to {namespace}/{genbank_id},
+ defaults to "https://test.sbol3.genbank/"
+ :param write: writes the generated sbol3 document in SORTED_NTRIPLES
+ format to provided sbol3_file path
+ :return: SBOL3 document containing converted materials
+ """
+ # create sbol3 document, and record parser handler for gb file
+ sbol3.set_namespace(namespace)
+ doc = sbol3.Document()
+ # create updated py dict to store mappings between gb and so ontologies
+ logging.debug("Creating GenBank and SO ontologies mappings for sequence feature roles")
+ map_created = self.create_gb2so_role_mappings(gb2so_csv=self.GB2SO_MAPPINGS_CSV, convert_so2gb=False)
+ if not map_created:
+ # TODO: Need better SBOL3-GenBank specific error classes in future
+ raise ValueError("Required CSV data files are not present in your package.\n "
+ "Please reinstall the sbol_utilities package.\n Stopping current conversion process.\n "
+ "Reverting to legacy converter if new Conversion process is not forced.")
+ # access records by parsing gb file using SeqIO class
+ logging.debug("Parsing Genbank records using SeqIO class, using GenBank file %s", gb_file)
+ for record in list(SeqIO.parse(gb_file, "genbank").records):
+ # TODO: Currently we assume only linear or circular topology is possible
+ logging.debug("Parsing record - `%s` in genbank file.", record.id)
+ topology = "linear"
+ if "topology" in record.annotations:
+ topology = record.annotations["topology"]
+ # sometimes topology is specified in the 'data_file_division' field
+ elif record.annotations['data_file_division'] in ['circular', 'linear']:
+ topology = record.annotations['data_file_division']
+ if topology == "linear":
+ extra_comp_types = [sbol3.SO_LINEAR]
+ else:
+ extra_comp_types = [sbol3.SO_CIRCULAR]
+ # creating component extended Component class to include GenBank extraneous properties
+ comp = self.ComponentGenBankExtension(identity=sbol3.string_to_display_id(record.name),
+ types=self.COMP_TYPES + extra_comp_types, roles=self.COMP_ROLES,
+ description=record.description)
+ # since SBOL3 requires display_id to have only alphanumeric characters and start not with a number;
+ # and these constraints are not present in GenBank, we pass the GenBank locus name through a filter
+ # helper method ('string_to_display_id'), which conforms it to SBOL's standard, and also store the
+ # original name in an extraneous property field 'genbank_name' which is reset later on during round trips.
+ comp.genbank_name = record.name
+ doc.add(comp)
+
+ # TODO: Currently we use a fixed method of encoding (IUPAC)
+ seq = sbol3.Sequence(identity=str(comp.display_id) + "_sequence", elements=str(record.seq.lower()),
+ encoding=self.SEQUENCE_ENCODING)
+ doc.add(seq)
+ comp.sequences = [seq]
+
+ # Setting properties for GenBank's extraneous properties not settable in any SBOL3 field.
+ self._store_extra_properties_in_sbol3(comp, seq, record)
+
+ # create all sequence features, and tag all encountered feature qualifiers
+ # via extended Feature_GenBank_Extension class
+ self._handle_features_gb_to_sbol(record, comp, seq)
+
+ if write:
+ logging.debug("Writing created sbol3 document to disk in sorted ntriples format: %s", sbol3_file)
+ doc.write(fpath=sbol3_file, file_format=sbol3.SORTED_NTRIPLES)
+ return doc
+
+ def convert_sbol3_to_genbank(self, sbol3_file: str, doc: sbol3.Document = None, gb_file: str = "genbank.out",
+ # write: bool = False) -> List[SeqRecord]:
+ write: bool = False) -> Dict:
+ """Convert an SBOL3 document into GenBank records: one BioPython SeqRecord is built per SBOL3 Component
+
+ :param sbol3_file: path to read SBOL3 file from; only read when no `doc` is supplied
+ :param doc: already-loaded SBOL3 document to convert instead of reading `sbol3_file`
+ :param gb_file: path to write GenBank file to, if write set to true
+ :param write: writes the generated genbank document to provided path
+ :return: dict with "status" (TopLevel object -> whether it was converted) and "seqrecords" (list of SeqRecord)
+ """
+ if not doc:
+ doc = sbol3.Document()
+ doc.read(sbol3_file)
+ seq_records = []
+ # create logs dict to be returned as conversion status of the SBOL3 file provided
+ logs: Dict[sbol3.TopLevel, bool] = {}
+ logging.debug("Creating GenBank and SO ontologies mappings for sequence feature roles")
+ # create updated py dict to store mappings between gb and so ontologies
+ map_created = self.create_gb2so_role_mappings(so2gb_csv=self.SO2GB_MAPPINGS_CSV, convert_gb2so=False)
+ if not map_created:
+ # TODO: Need better SBOL3-GenBank specific error classes in future
+ raise ValueError("Required CSV data files are not present in your package.\n "
+ "Please reinstall the sbol_utilities package.\n Stopping current conversion process.\n "
+ "Reverting to legacy converter if new Conversion process is not forced.")
+ # consider sbol3 objects which are components
+ logging.debug("Parsing SBOL3 Document components using SBOL3 Document: %s", doc)
+ for obj in doc.objects:
+ if isinstance(obj, sbol3.TopLevel):
+ # every top level object starts as "not converted" unless it already has a status
+ if obj not in logs:
+ logs[obj] = False
+ if isinstance(obj, sbol3.Component):
+ logging.debug("Parsing component - `%s` in sbol3 document.", obj.display_id)
+ # NOTE: A single component/record cannot have multiple sequences
+ seq = None # stays None if no sequence is found for a component
+ if obj.sequences and len(obj.sequences) == 1:
+ if doc.find(obj.sequences[0]):
+ obj_seq = doc.find(obj.sequences[0])
+ seq = Seq(obj_seq.elements.upper())
+ # mark the status of this top level sequence object as parsed and converted
+ if isinstance(obj_seq, sbol3.TopLevel):
+ logs[obj_seq] = True
+ elif len(obj.sequences) > 1:
+ raise ValueError(f"Component `{obj.display_id}` of given SBOL3 document has more than 1 sequence\n \
+ (`{len(obj.sequences)}`). This is invalid; a component may only have 1 or 0 sequences.")
+ # Locus name for the sequence record is just the display id if SBOL3 component was not extended
+ # to include extraneous properties (in which case, we use the directly stored 'genbank_name' field)
+ locus_name = obj.display_id
+ if isinstance(obj, self.ComponentGenBankExtension) and obj.genbank_name:
+ locus_name = obj.genbank_name
+ seq_rec = SeqRecord(seq=seq, id=obj.display_id, description=obj.description or '', name=locus_name)
+ # Resetting extraneous genbank properties from extended component-genbank class
+ self._reset_extra_properties_in_genbank(obj, seq_rec)
+
+ # recreate all sequence features, and tag all encountered feature
+ # qualifiers via extended Feature_GenBank_Extension class
+ self._handle_features_sbol_to_gb(seq_rec, obj)
+
+ # mark the top level component object as parsed and converted
+ logs[obj] = True
+ seq_records.append(seq_rec)
+ # writing generated genbank document to disk at path provided
+ if write:
+ logging.debug("Writing created genbank file to disk: %s", gb_file)
+ SeqIO.write(seq_records, gb_file, "genbank")
+ return {"status": logs, "seqrecords": seq_records}
+
+ def _store_extra_properties_in_sbol3(self, comp: ComponentGenBankExtension,
+ seq: sbol3.Sequence, record: SeqRecord) -> None:
+ """Helper function for setting properties for GenBank's extraneous properties not directly settable in any
+ SBOL3 field, using a modified, extended SBOL3 Component class, and a new CustomReferenceProperty TopLevel class.
+ :param comp: Instance of the extended SBOL3 Component class (Component_GenBank_Extension)
+ :param seq: The Sequence used in the GenBank record corresponding to sbol3 comp
+ :param record: GenBank SeqRecord instance for the record which contains extra properties
+ """
+ comp.genbank_record_id = record.id
+ # set dblinks from the dbxrefs property of biopython
+ if record.dbxrefs:
+ # dbxrefs are parsed in a list by biopython from `record.dbxrefs`; we are storing them as a flat string
+ # to maintain order. Thus, we are creating a custom delimiter of `::`, by which we shall separate
+ # individual dbxrefs in the string and later split them to a list while resetting them in genbank
+ comp.genbank_dblink = "::".join(record.dbxrefs)
+ for annotation in record.annotations:
+ # Emit a warning for each annotation, as it is GenBank info not directly storable in sbol3
+ logging.warning("Extraneous information not directly storable in SBOL3 - %s: %s", annotation,
+ record.annotations[annotation])
+ # 1. GenBank Record Date
+ if annotation == 'date':
+ comp.genbank_date = record.annotations['date']
+ # 2. GenBank Record Division
+ elif annotation == 'data_file_division':
+ # FIX for iGEM files not having data file division but topology stored in its key
+ if record.annotations['data_file_division'] in ['circular', 'linear']:
+ comp.genbank_topology = record.annotations['data_file_division']
+ else:
+ comp.genbank_division = record.annotations['data_file_division']
+ # 3. GenBank Record Keywords
+ elif annotation == 'keywords':
+ comp.genbank_keywords = ",".join(record.annotations['keywords'])
+ # 4. GenBank Record Molecule Type
+ elif annotation == 'molecule_type':
+ comp.genbank_molecule_type = record.annotations['molecule_type']
+ # 5. GenBank Record Organism
+ elif annotation == 'organism':
+ comp.genbank_organism = record.annotations['organism']
+ # 6. GenBank Record Source
+ elif annotation == 'source':
+ comp.genbank_source = record.annotations['source']
+ # 7. GenBank Record Taxonomy
+ elif annotation == 'taxonomy':
+ comp.genbank_taxonomy = ",".join(record.annotations['taxonomy'])
+ # 8. GenBank Record Topology
+ elif annotation == 'topology':
+ comp.genbank_topology = record.annotations['topology']
+ # 9. GenBank Record GI Property
+ elif annotation == 'gi':
+ comp.genbank_gi = record.annotations['gi']
+ # 10. GenBank Record Accessions
+ elif annotation == 'accessions':
+ comp.genbank_accessions = sorted(record.annotations['accessions'])
+ # 11. GenBank Sequence Version
+ elif annotation == 'sequence_version':
+ comp.genbank_seq_version = record.annotations['sequence_version']
+ # 12. GenBank Record References
+ elif annotation == 'references':
+ references = []
+ for ind, reference in enumerate(record.annotations['references']):  # NOTE(review): `ind` is not used in this loop
+ # create a custom reference property instance for each reference
+ custom_reference = self.CustomReferenceProperty()
+ custom_reference.authors = reference.authors
+ custom_reference.comment = reference.comment
+ custom_reference.journal = reference.journal
+ custom_reference.title = reference.title
+ custom_reference.consrtm = reference.consrtm
+ custom_reference.medline_id = reference.medline_id
+ custom_reference.pubmed_id = reference.pubmed_id
+ for gb_loc in reference.location:
+ feat_loc_orientation = sbol3.SO_FORWARD
+ if gb_loc.strand == -1:
+ feat_loc_orientation = sbol3.SO_REVERSE
+ if gb_loc.start == gb_loc.end:  # NOTE(review): `location` is declared with type_constraint=sbol3.Range; confirm a Cut is accepted
+ locs = sbol3.Cut(sequence=seq, at=int(gb_loc.start), orientation=feat_loc_orientation)
+ else:
+ locs = sbol3.Range(sequence=seq, start=int(gb_loc.start),
+ end=int(gb_loc.end), orientation=feat_loc_orientation)
+ custom_reference.location.append(locs)
+ # link the parent component for each custom reference property objects
+ if comp.display_id:
+ custom_reference.component = comp.display_id
+ # TODO: Raise error, no name for component
+ # else:
+ references.append(custom_reference)
+ comp.genbank_references = references
+ # 14. GenBank Record Comment
+ elif annotation == 'comment':
+ comp.genbank_comment = record.annotations['comment']
+ # 15. GenBank Record Structured comments
+ elif annotation == 'structured_comment':
+ identity_ind = 1  # NOTE(review): incremented below but never read in this method
+ comments = []
+ for heading in record.annotations['structured_comment']:
+ structured_comment_object = self.CustomStructuredCommentProperty()
+ identity_ind += 1
+ if comp.display_id:
+ structured_comment_object.component = comp.display_id
+ structured_comment_object.heading = heading
+ structured_dict = record.annotations['structured_comment'][heading]
+ key_value_ind = 1
+ for key in structured_dict:
+ # NOTE: keys and values are stored in two parallel lists; each entry is prefixed with its
+ # 1-based index and the `::` delimiter so that key/value pairs can be matched up again later
+ structured_comment_object.structured_keys.append(f"{key_value_ind}::{key}")
+ structured_comment_object.structured_values.append(f"{key_value_ind}::{structured_dict[key]}")
+ key_value_ind += 1
+ comments.append(structured_comment_object)
+ comp.genbank_structured_comments = comments
+ else:
+ raise ValueError(f"The annotation `{annotation}` in the GenBank record `{record.id}`\n \
+ is not recognized as a standard annotation.")
+ # TODO: BioPython's parsing doesn't explicitly place a "locus" data field?
+ # 13. GenBank Record Locus (taken from the record name, outside the annotation chain above)
+ comp.genbank_locus = record.name
+
+ def _reset_extra_properties_in_genbank(self, obj: sbol3.Component, seq_rec: SeqRecord) -> None:
+ """Helper function for resetting properties for GenBank's extraneous properties from SBOL3 object's properties,
+ by using a modified, extended SBOL3 Component class, and a new CustomReferenceProperty TopLevel class.
+ :param obj: SBOL3 component, extra properties are stored within if an instance of the extended class
+ :param seq_rec: GenBank SeqRecord instance for the record in which to reset extra properties
+ """
+ if isinstance(obj, self.ComponentGenBankExtension):
+ if obj.genbank_record_id:
+ seq_rec.id = obj.genbank_record_id
+ # set db links using dbxrefs property of biopython
+ if obj.genbank_dblink:
+ # NOTE: see comment on `_store_extra_properties_in_sbol3`'s dbxrefs section, where we describe how '::'
+ # is used as a delimiter to store the dbxrefs list as a string to maintain order. Here, we split the
+ # string by the same delimiter to restore the list in resetting GenBank properties.
+ seq_rec.dbxrefs = str(obj.genbank_dblink).split("::")
+ # 1. GenBank Record Date
+ if obj.genbank_date:
+ seq_rec.annotations['date'] = obj.genbank_date
+ # 2. GenBank Record Division
+ if obj.genbank_division:
+ seq_rec.annotations['data_file_division'] = obj.genbank_division
+ # 3. GenBank Record Keywords
+ # seq_rec.annotations['keywords'] = sorted(list(obj.genbank_keywords))
+ if obj.genbank_keywords:
+ seq_rec.annotations['keywords'] = str(obj.genbank_keywords).split(",")
+ # 4. GenBank Record Molecule Type
+ if obj.genbank_molecule_type:
+ seq_rec.annotations['molecule_type'] = obj.genbank_molecule_type
+ # 5. GenBank Record Organism
+ if obj.genbank_organism:
+ seq_rec.annotations['organism'] = obj.genbank_organism
+ # 6. GenBank Record Source
+ # FIXME: Apparently, if a default source was used during in the GenBank file
+ # during conversion of GenBank -> SBOL, component.genbank_source is "",
+ # and while plugging it back in during conversion of SBOL -> GenBank, it
+ # simply prints "", whereas the default "." should have been printed
+ if obj.genbank_source:
+ seq_rec.annotations['source'] = obj.genbank_source
+ # 7. GenBank Record taxonomy
+ # TODO : link gh issue for note below
+ # FIXME: Even though component.genbank_taxonomy is stored in sorted order, it
+ # becomes unsorted while retrieving from the sbol file
+ if obj.genbank_taxonomy:
+ seq_rec.annotations['taxonomy'] = str(obj.genbank_taxonomy).split(",")
+ # 8. GenBank Record Topology
+ if obj.genbank_topology:
+ seq_rec.annotations['topology'] = obj.genbank_topology
+ # 9. GenBank Record GI Property
+ if obj.genbank_gi:
+ seq_rec.annotations['gi'] = obj.genbank_gi
+ # 10. GenBank Record Accessions
+ if obj.genbank_accessions:
+ seq_rec.annotations['accessions'] = sorted(list(obj.genbank_accessions))
+ # 11. GenBank Sequence Version
+ if obj.genbank_seq_version:
+ seq_rec.annotations['sequence_version'] = obj.genbank_seq_version
+ # 12. GenBank Record References
+ if obj.genbank_references:
+ # if sbol3 object has references
+ record_references = []
+ for reference in obj.genbank_references:
+ reference_object = Reference()
+ reference_object.title = reference.title
+ reference_object.authors = reference.authors
+ reference_object.comment = reference.comment
+ reference_object.journal = reference.journal
+ reference_object.consrtm = reference.consrtm
+ reference_object.pubmed_id = reference.pubmed_id
+ reference_object.medline_id = reference.medline_id
+ for obj_feat_loc in reference.location:
+ feat_strand = self.BIO_STRAND_FORWARD
+ # feature strand value which denotes orientation of the location of the feature
+ # By default its 1 for SO_FORWARD orientation of sbol3 feature location, and -1 for SO_REVERSE
+ if obj_feat_loc.orientation == sbol3.SO_REVERSE:
+ feat_strand = self.BIO_STRAND_REVERSE
+ # elif obj_feat_loc.orientation != sbol3.SO_FORWARD:
+ # raise ValueError(f"Location orientation: `{obj_feat_loc.orientation}` for feature: \n \
+ # `{obj_feat.name}` of component: `{obj.display_id}` is not a valid orientation.\n \
+ # Valid orientations are `{sbol3.SO_FORWARD}`, `{sbol3.SO_REVERSE}`")
+ # TODO: Raise custom converter class ERROR for `else:`
+ feat_loc_object = FeatureLocation(
+ start=obj_feat_loc.start,
+ end=obj_feat_loc.end,
+ strand=feat_strand,
+ )
+ reference_object.location.append(feat_loc_object)
+ record_references.append(reference_object)
+ seq_rec.annotations['references'] = record_references
+ # 13. GenBank Record Locus
+ # TODO: No explicit way to set locus via BioPython?
+ # 14. GenBank Record Comments
+ if obj.genbank_comment:
+ seq_rec.annotations['comment'] = obj.genbank_comment
+ # 15. GenBank Record Structured Comments
+ if obj.genbank_structured_comments:
+ comment_annotation = OrderedDict()
+ for structured_comment in obj.genbank_structured_comments:
+ structured_comment_object = OrderedDict()
+ total_keys = len(structured_comment.structured_keys)
+ structured_keys = sorted(list(structured_comment.structured_keys),
+ key=lambda t: int(t.split("::", 1)[0]))
+ structured_values = sorted(list(structured_comment.structured_values),
+ key=lambda t: int(t.split("::", 1)[0]))
+ for ind in range(total_keys):
+ key = structured_keys[ind].split("::", 1)[1]
+ value = structured_values[ind].split("::", 1)[1]
+ structured_comment_object[key] = value
+ comment_annotation[structured_comment.heading] = structured_comment_object
+ seq_rec.annotations['structured_comment'] = comment_annotation
+ # 4. GenBank Record Molecule Type: Set molecule type if not already annotated
+ if 'molecule_type' not in seq_rec.annotations:
+ if sbol3.SBO_DNA in obj.types:
+ seq_rec.annotations['molecule_type'] = 'DNA'
+ elif sbol3.SBO_RNA in obj.types:
+ seq_rec.annotations['molecule_type'] = 'RNA'
+ elif sbol3.SBO_PROTEIN in obj.types:
+ seq_rec.annotations['molecule_type'] = 'protein'
+ else:
+ raise ValueError('Cannot determine molecule type for object %s', obj.identity)
+ # 8. GenBank Record Topology: Set topology if not already annotated
+ if 'topology' not in seq_rec.annotations:
+ if sbol3.SO_CIRCULAR in obj.types:
+ seq_rec.annotations['topology'] = 'circular'
+ else: # either linear or not set
+ seq_rec.annotations['topology'] = 'linear'
+ # 11. GenBank Sequence Version: default to 1 if not already annotated, and also add version to ID
+ if 'sequence_version' not in seq_rec.annotations:
+ seq_rec.annotations['sequence_version'] = self.DEFAULT_GB_SEQ_VERSION
+ seq_rec.id = f'{seq_rec.id}.{self.DEFAULT_GB_SEQ_VERSION}'
+
+    def _handle_features_gb_to_sbol(self, record: SeqRecord, comp: ComponentGenBankExtension,
+                                    seq: sbol3.Sequence) -> None:
+        """Convert the features of a GenBank record into SBOL3 features on a component.
+
+        Each GenBank feature becomes a `FeatureGenBankExtension` whose locations are `sbol3.Cut` or
+        `sbol3.Range` objects on `seq`, and whose qualifiers are stored as index-prefixed key/value strings.
+        Does nothing when the record has no features; otherwise `comp.features` is reset before conversion.
+        :param record: GenBank SeqRecord instance for the record which contains sequence features
+        :param comp: Instance of the (extended) SBOL3 Component that receives the converted features
+        :param seq: The Sequence used in the GenBank record corresponding to sbol3 comp
+        """
+        # parse if genbank record has any features
+        if not record.features:
+            return
+        comp.features = []
+        for ind, gb_feat in enumerate(record.features):
+            feat_locations = []
+            fuzzy_feature = False
+            # The "label" qualifier is optional; fall back to the feature index when reporting
+            feat_name = None
+            if "label" in gb_feat.qualifiers:
+                feat_name = gb_feat.qualifiers["label"][0]
+            logging.debug("Parsing feature `%s` for record `%s`", feat_name or ind, record.id)
+            for gb_loc in gb_feat.location.parts:
+                # Default orientation is "inline" except if complement is specified via strand
+                feat_loc_orientation = sbol3.SO_FORWARD
+                if gb_loc.strand == -1:
+                    feat_loc_orientation = sbol3.SO_REVERSE
+                # create "Range/Cut" FeatureLocation by parsing genbank record location
+                # Create a cut or range as feature location depending on whether location is specified as
+                # Cut (eg: "n^n+1", parsed as [n:n] by biopython) or Range (eg: "n..m", parsed as [n:m] by biopython)
+                if gb_loc.start == gb_loc.end:
+                    locs = sbol3.Cut(sequence=seq, at=int(gb_loc.start), orientation=feat_loc_orientation)
+                else:
+                    # find int mappings for positions of start and end locations,
+                    # as defined in the static class variable 'SBOL_LOCATION_POSITION'
+                    # 0->BeforePosition, 1->ExactPosition, 2->AfterPosition
+                    end_position = self.SBOL_LOCATION_POSITION[type(gb_loc.end)]
+                    start_position = self.SBOL_LOCATION_POSITION[type(gb_loc.start)]
+                    # If both start and end positions are exact positions, the feature location can be
+                    # created simply as a range object. Kludge: `or True` currently sends EVERY range down
+                    # this branch, truncating fuzzy ranges
+                    # (https://github.com/SynBioDex/SBOL-utilities/issues/200)
+                    if start_position == 1 and end_position == 1 or True:
+                        locs = sbol3.Range(sequence=seq, orientation=feat_loc_orientation, end=int(gb_loc.end),
+                                           # add 1 to start, as BioPython parses GenBank start locations as 0-indexed
+                                           start=int(gb_loc.start) + 1)
+                    # If either or both of start and end locations are fuzzy, then
+                    # the location object needs to be of the custom class 'Location_GenBank_Extension'
+                    # (unreachable while the kludge above is in place)
+                    else:
+                        locs = self.LocationGenBankExtension(sequence=seq, orientation=feat_loc_orientation)
+                        # start and end int positions specified
+                        locs.end = int(gb_loc.end)
+                        # add 1, as BioPython parses GenBank start locations as 0-indexed instead of 1-indexed
+                        locs.start = int(gb_loc.start) + 1
+                        # storing location types in IntProperties of SBOL3
+                        locs.end_position = end_position
+                        locs.start_position = start_position
+                        # if any of the location endpoints of a feature (start/end) has a fuzzy end
+                        # (i.e., not Exact position) like BeforePosition/AfterPosition, we mark the
+                        # feature as a 'fuzzy_feature' which decides whether to store the feature or not
+                        if locs.end_position != 1 or locs.start_position != 1:
+                            fuzzy_feature = True
+                feat_locations.append(locs)
+            # Obtain sequence feature role from Genbank to SO type mappings
+            feat_role = sbol3.SO_NS[:-3]
+            if self.gb2so_map.get(gb_feat.type):
+                feat_role += self.gb2so_map[gb_feat.type]
+            else:
+                # Report the feature by label when it has one, else by index: indexing
+                # qualifiers['label'] directly raised KeyError for unlabelled features
+                logging.warning("Feature type: `%s` for feature: `%s` of record: `%s` has no corresponding "
+                                "ontology term for SO, using the default SO term, %s",
+                                gb_feat.type, feat_name or ind, record.name, self.DEFAULT_SO_TERM)
+                feat_role += self.DEFAULT_SO_TERM
+            # assign feature orientation based on the strand value in genbank feature
+            feat_orientation = sbol3.SO_FORWARD
+            if gb_feat.strand == -1:
+                feat_orientation = sbol3.SO_REVERSE
+            feat = self.FeatureGenBankExtension(
+                locations=feat_locations,
+                roles=[feat_role],
+                # name=gb_feat.qualifiers["label"][0],
+                name=feat_name,
+                orientation=feat_orientation
+            )
+            # store qualifiers key value pairs; the "<index>:" prefix records the original qualifier order
+            # (only the first value of each qualifier is kept)
+            for index, qualifier in enumerate(gb_feat.qualifiers):
+                feat.qualifier_key.append(f"{index}:" + qualifier)
+                feat.qualifier_value.append(f"{index}:" + gb_feat.qualifiers[qualifier][0])
+            # if feature has any fuzzy location, since SBOL does not support storing such location endpoints,
+            # instead of presenting incomplete/incorrect information to users, we store the feature as
+            # a property of the Extended GenBank Component class, instead of as a feature of the component.
+            # TODO: link the tracking issue here.
+            # Once that issue gets addressed, we can remove the 'fuzzy_feature' property and simply add the
+            # concerned feature to the features of the component.
+            if not fuzzy_feature:
+                comp.features.append(feat)
+            else:
+                comp.fuzzy_features.append(feat)
+
+    def _handle_features_sbol_to_gb(self, seq_rec: SeqRecord, obj: ComponentGenBankExtension) -> None:
+        """Convert the sequence features of an SBOL3 component into GenBank features on `seq_rec`.
+
+        Qualifiers stored on `FeatureGenBankExtension` features are restored. Features that are not
+        `sbol3.SequenceFeature` instances are skipped. Does nothing when `obj.features` is empty.
+        :param seq_rec: GenBank SeqRecord instance for the record which contains sequence features
+        :param obj: Instance of the SBOL3 Component
+        :raises ValueError: if a feature location has an orientation that is not a known forward/reverse term
+        """
+        # parse if sbol object has any features
+        if not obj.features:
+            return
+        seq_rec_features = []
+        # for round trip conversion, consider all features - exact and fuzzy ones too
+        all_features = list(obj.features)
+        if isinstance(obj, self.ComponentGenBankExtension):
+            all_features += list(obj.fuzzy_features)
+        # converting all sequence features
+        for obj_feat in all_features:
+            # TODO: Also add ability to parse subcomponent feature type
+            # Note: Currently we only parse sequence features from sbol3 to genbank
+            if isinstance(obj_feat, sbol3.SequenceFeature):
+                logging.debug("Parsing feature `%s` for component `%s`", obj_feat.name, obj.display_id)
+                # Multiple locations of one feature are combined into a single "join" location below
+                feat_loc_parts = []
+                feat_loc_object = None
+                feat_loc_positions = []
+                for obj_feat_loc in obj_feat.locations:
+                    feat_strand = self.BIO_STRAND_FORWARD
+                    # feature strand value which denotes orientation of the location of the feature
+                    # By default its 1 for SO_FORWARD orientation of sbol3 feature location, and -1 for SO_REVERSE
+                    if obj_feat_loc.orientation in {sbol3.SO_REVERSE, sbol3.SBOL_REVERSE_COMPLEMENT}:
+                        feat_strand = self.BIO_STRAND_REVERSE
+                    elif obj_feat_loc.orientation not in {sbol3.SO_FORWARD, sbol3.SBOL_INLINE}:
+                        # Message is built from adjacent literals so no source indentation leaks into it
+                        raise ValueError(
+                            f"Location orientation: `{obj_feat_loc.orientation}` for feature: "
+                            f"`{obj_feat.name}` of component: `{obj.display_id}` is not a valid orientation.\n"
+                            f"Valid orientations are `{sbol3.SO_FORWARD}`, `{sbol3.SO_REVERSE}`, "
+                            f"`{sbol3.SBOL_INLINE}`, `{sbol3.SBOL_REVERSE_COMPLEMENT}`")
+                    # TODO: Raise custom converter class ERROR for `else:`
+                    if isinstance(obj_feat_loc, sbol3.Cut):
+                        # sbol3.Cut is positioned by `at` rather than `start`/`end`. GenBank -> SBOL
+                        # conversion builds it from a zero-width BioPython location [n:n] with at=n,
+                        # so map it back to that same zero-width location (no index shift).
+                        start_position = end_position = ExactPosition(obj_feat_loc.at)
+                    else:
+                        # creating start and end Positions
+                        end_position = ExactPosition(obj_feat_loc.end)
+                        # subtract 1, as BioPython parses GenBank start locations as 0-indexed instead of 1-indexed
+                        start_position = ExactPosition(int(obj_feat_loc.start) - 1)
+                    # if custom range object, check for position being Before / After Positions
+                    if isinstance(obj_feat_loc, self.LocationGenBankExtension):
+                        # change end and start Positions only if user has made integer entries into them
+                        if obj_feat_loc.end_position is not None:
+                            position_class = self.GENBANK_LOCATION_POSITION[obj_feat_loc.end_position]
+                            end_position = position_class(obj_feat_loc.end)
+                        if obj_feat_loc.start_position is not None:
+                            position_class = self.GENBANK_LOCATION_POSITION[obj_feat_loc.start_position]
+                            # subtract 1, as BioPython parses GenBank start locations as 0-indexed instead of 1-indexed
+                            start_position = position_class(int(obj_feat_loc.start) - 1)
+                    feat_loc_object = FeatureLocation(start=start_position, end=end_position, strand=feat_strand)
+                    feat_loc_parts.append(feat_loc_object)
+                # sort feature locations lexicographically internally first
+                # NOTE: If the feature location has an outer "complement" location
+                # operator, the sort needs to be in reverse order
+                # NOTE(review): only SO_REVERSE is checked here, while location orientations above also
+                # accept SBOL_REVERSE_COMPLEMENT - confirm whether both should reverse the sort.
+                if obj_feat.orientation == sbol3.SO_REVERSE:
+                    feat_loc_parts.sort(key=lambda loc: (loc.start, loc.end, loc.strand), reverse=True)
+                else:
+                    feat_loc_parts.sort(key=lambda loc: (loc.start, loc.end, loc.strand))
+                for location in feat_loc_parts:
+                    feat_loc_positions += [location.start, location.end]
+                if len(feat_loc_parts) > 1:
+                    feat_loc_object = CompoundLocation(parts=feat_loc_parts, operator="join")
+                elif len(feat_loc_parts) == 1:
+                    feat_loc_object = feat_loc_parts[0]
+                # action to perform if no location found? (feat_loc_object stays None in that case)
+                # else:
+
+                # FIXME: order of features not same as original genbank doc?
+                # Obtain sequence feature role from Sequence Ontology to GenBank role mappings
+                so_roles = list(filter(None, (tyto_normalize_term(tyto.SO, role) for role in obj_feat.roles)))
+                feat_role = self.DEFAULT_GB_TERM
+                if len(so_roles):
+                    if len(so_roles) > 1:
+                        logging.warning('Found multiple SequenceOntology roles %s for feature %s, using first '
+                                        'for mapping to GenBank term', str(so_roles), obj_feat.identity)
+                    if self.so2gb_map.get(so_roles[0]):
+                        feat_role = self.so2gb_map[so_roles[0]]
+                    else:
+                        logging.warning('Feature role %s (%s) for feature %s, has no corresponding ontology term for '
+                                        'GenBank, using the default GenBank term, %s', so_roles[0],
+                                        tyto.SO.get_term_by_uri(so_roles[0]), obj_feat.identity, self.DEFAULT_GB_TERM)
+                else:
+                    logging.warning('No SequenceOntology roles found for feature %s, using the default GenBank '
+                                    'term, %s', obj_feat.identity, self.DEFAULT_GB_TERM)
+                # create sequence feature object with label qualifier
+                # TODO: create issue for presence of genbank file with features without the "label" qualifier
+                # TODO: feat_strand value ambiguous in case of multiple locations?
+                feature = SeqFeature(location=feat_loc_object, type=feat_role)
+                # flat [start, end, start, end, ...] list, used only as the primary sort key below
+                feature.loc_positions = feat_loc_positions
+                if isinstance(obj_feat, self.FeatureGenBankExtension):
+                    # keys and values carry an "<index>:" prefix; sorting by it restores the original order
+                    keys = sorted(obj_feat.qualifier_key, key=lambda x: int(x.split(":", 1)[0]))
+                    values = sorted(obj_feat.qualifier_value, key=lambda x: int(x.split(":", 1)[0]))
+                    for qualifier_ind in range(len(keys)):
+                        feature.qualifiers[keys[qualifier_ind].split(":", 1)[1]] = \
+                            values[qualifier_ind].split(":", 1)[1]
+                if obj_feat.name:
+                    feature.qualifiers['label'] = obj_feat.name
+                seq_rec_features.append(feature)
+
+        # Sort features based on feature location start/end, lexicographically, and then by
+        # strand / number of qualifiers / type of feature string comparison
+        seq_rec_features.sort(key=lambda feat: (feat.loc_positions, feat.strand, len(feat.qualifiers), feat.type))
+        seq_rec.features = seq_rec_features
diff --git a/sbol_utilities/sbol3_sbol2_conversion.py b/sbol_utilities/sbol3_sbol2_conversion.py
new file mode 100644
index 00000000..ac44446f
--- /dev/null
+++ b/sbol_utilities/sbol3_sbol2_conversion.py
@@ -0,0 +1,571 @@
+import sbol3
+import sbol2
+from sbol2 import mapsto, model, sequenceconstraint
+
+# Namespaces
+from rdflib import URIRef
+
+# Namespace for the properties that this converter itself adds to converted objects
+BACKPORT_NAMESPACE = 'http://sboltools.org/backport#'
+# Property URI under which an object's SBOL2 version is kept on its SBOL3 counterpart
+BACKPORT2_VERSION = f'{BACKPORT_NAMESPACE}sbol2version'
+# Property URI under which an object's SBOL3 namespace is kept on its SBOL2 counterpart
+BACKPORT3_NAMESPACE = f'{BACKPORT_NAMESPACE}sbol3namespace'
+
+# A property whose URI starts with any of these prefixes is NOT copied over as an extension property
+NON_EXTENSION_PROPERTY_PREFIXES = {sbol3.SBOL3_NS, sbol3.SBOL2_NS,  # SBOL 2 & 3 namespaces
+                                   sbol3.RDF_NS, sbol3.PROV_NS, sbol3.OM_NS,  # Standard ontologies
+                                   BACKPORT_NAMESPACE}  # Information added by this converter
+# For SBOL2 -> SBOL3, the Dublin Core title and description properties are excluded as well
+SBOL2_NON_EXTENSION_PROPERTY_PREFIXES = NON_EXTENSION_PROPERTY_PREFIXES.union({
+    'http://purl.org/dc/terms/description', 'http://purl.org/dc/terms/title'})
+
+
+class SBOL3To2ConversionVisitor:
+ """This class is used to map every object in an SBOL3 document into an empty SBOL2 document"""
+
+ doc2: sbol2.Document
+
+    def __init__(self, doc3: sbol3.Document):
+        """Create an empty SBOL2 target document and immediately convert `doc3` into it.
+
+        :param doc3: SBOL3 document to convert; the result is left in `self.doc2`
+        """
+        # Create the target document
+        self.doc2 = sbol2.Document()
+        # # Immediately run the conversion
+        self._convert(doc3)
+
+    def _convert(self, doc3: sbol3.Document):
+        """Visit every object of `doc3`, building the SBOL2 equivalents in `self.doc2`.
+
+        While the visit runs, pySBOL2's compliant-URI option is switched off and its homespace is set to
+        the empty string; the previous values of both are restored afterwards, even if conversion raises.
+        :param doc3: SBOL3 document to convert
+        """
+        # Bind standard namespaces that aren't bound by default in pySBOL2
+        self.doc2.addNamespace(BACKPORT_NAMESPACE, 'backport')
+        self.doc2.addNamespace(sbol3.PROV_NS, 'prov')
+        self.doc2.addNamespace(sbol3.OM_NS, 'om')
+        self.doc2.addNamespace('http://purl.org/dc/terms/', 'dcterms')
+
+        # Override parameters that will otherwise interfere in conversion, saving old values
+        saved_compliance = sbol2.Config.getOption(sbol2.ConfigOptions.SBOL_COMPLIANT_URIS.value)
+        sbol2.Config.setOption(sbol2.ConfigOptions.SBOL_COMPLIANT_URIS.value, False)
+        saved_homespace = sbol2.getHomespace()
+        sbol2.setHomespace('')
+
+        # Try conversion, resetting saved parameter values afterward
+        try:
+            doc3.accept(self)
+            # TODO: make sure that complex extension objects (e.g., from SBOLFactory) are properly converted
+            # TODO: make sure that unhandled SBOL child objects / properties will throw errors
+            # TODO: check if we need to add post-creation fix-up of links, to ensure they point to objects
+        finally:
+            # These are process-wide pySBOL2 settings, so always put the caller's values back
+            sbol2.Config.setOption(sbol2.ConfigOptions.SBOL_COMPLIANT_URIS.value, saved_compliance)
+            sbol2.setHomespace(saved_homespace)
+
+    @staticmethod
+    def _convert_extension_properties(obj3: sbol3.Identified, obj2: sbol2.Identified):
+        """Copy every extension property of `obj3` onto `obj2`.
+
+        A property counts as an extension when its URI starts with none of the prefixes in
+        NON_EXTENSION_PROPERTY_PREFIXES. Each value list is copied, not shared.
+        """
+        for prop_uri in obj3.properties:
+            if any(prop_uri.startswith(known) for known in NON_EXTENSION_PROPERTY_PREFIXES):
+                continue  # standard SBOL / ontology / converter property, not an extension
+            # Can't use setPropertyValue because it may not be a string
+            obj2.properties[prop_uri] = obj3._properties[prop_uri].copy()
+
+    @staticmethod
+    def _value_or_property(obj3: sbol3.Identified, value, prop: str):
+        """Return `value`, falling back to raw property `prop` of `obj3` when `value` is falsy.
+
+        The fallback is used only if `obj3` stores exactly one value under `prop`; in every other
+        case `value` is returned unchanged.
+        """
+        if value:
+            return value
+        if prop not in obj3._properties:
+            return value
+        stored = obj3._properties[prop]
+        if len(stored) != 1:
+            return value
+        return stored[0]
+
+    def _convert_identified(self, obj3: sbol3.Identified, obj2: sbol2.Identified):
+        """Map over the other properties of an identified object
+
+        Copies extension properties, display id, name, description and provenance links from `obj3`
+        to `obj2`. Name and description fall back to the Dublin Core title/description properties.
+        :raises NotImplementedError: if `obj3` has any measures
+        """
+        self._convert_extension_properties(obj3, obj2)
+        # Map over equivalent properties
+        obj2.displayId = obj3.display_id
+        obj2.name = self._value_or_property(obj3, obj3.name, 'http://purl.org/dc/terms/title')
+        obj2.description = self._value_or_property(obj3, obj3.description, 'http://purl.org/dc/terms/description')
+        obj2.wasDerivedFrom = obj3.derived_from
+        obj2.wasGeneratedBy = obj3.generated_by
+        # Measures are not converted yet (planned: turn them into extension properties), so refuse them
+        if obj3.measures:
+            raise NotImplementedError('Conversion of measures from SBOL3 to SBOL2 not yet implemented')
+
+    def _convert_toplevel(self, obj3: sbol3.TopLevel, obj2: sbol2.TopLevel):
+        """Map over the other properties of a TopLevel object
+
+        Besides the Identified properties, this copies attachment identities and records the SBOL3
+        namespace of `obj3` on `obj2` under the BACKPORT3_NAMESPACE property.
+        """
+        self._convert_identified(obj3, obj2)
+        obj2.attachments = [a.identity for a in obj3.attachments]
+        # Stored as a one-element list; SBOL2To3ConversionVisitor._sbol3_namespace reads it back
+        obj2.properties[BACKPORT3_NAMESPACE] = [URIRef(obj3.namespace)]
+
+    @staticmethod
+    def _sbol2_version(obj: sbol3.Identified):
+        """Return the SBOL2 version recorded on `obj`, or '1' when none is recorded.
+
+        Side effect: if `obj` has no `sbol2_version` attribute yet, one is attached to it as an
+        optional TextProperty bound to BACKPORT2_VERSION.
+        """
+        if not hasattr(obj, 'sbol2_version'):
+            obj.sbol2_version = sbol3.TextProperty(obj, BACKPORT2_VERSION, 0, 1)
+        # TODO: since version is optional, if it's missing, should this be returning '1' or None?
+        return obj.sbol2_version or '1'
+
+    def visit_activity(self, act3: sbol3.Activity):
+        """Convert an SBOL3 Activity into an SBOL2 Activity and add it to `self.doc2`.
+
+        :param act3: SBOL3 Activity to convert
+        :raises NotImplementedError: if the activity has more than one type, or has any usage or association
+        """
+        # Make the Activity object and add it to the document
+        act2 = sbol2.Activity(act3.identity, version=self._sbol2_version(act3))
+        self.doc2.activities.add(act2)
+        if act3.types:
+            if len(act3.types) > 1:
+                # Separators added: adjacent literals concatenate with nothing between them
+                raise NotImplementedError('Conversion of multi-type Activities to SBOL2 not yet implemented: '
+                                          'pySBOL2 currently supports a maximum of one type per activity. '
+                                          'Bug: https://github.com/SynBioDex/pySBOL2/issues/428')
+            act2.types = act3.types[0]  # Take first type from list of length 1
+        act2.startedAtTime = act3.start_time
+        act2.endedAtTime = act3.end_time
+        if act3.usage or act3.association:
+            raise NotImplementedError('Conversion of Activity usage and association properties to SBOL2 '
+                                      'not yet implemented, due to visitors failing to return values. '
+                                      'Bug: https://github.com/SynBioDex/pySBOL3/issues/437')
+        # Only reached when both lists are empty (see the check above), so these assign empty lists
+        act2.usages = [usage.accept(self) for usage in act3.usage]
+        act2.associations = [assoc.accept(self) for assoc in act3.association]
+        # TODO: pySBOL3 is currently missing wasInformedBy (https://github.com/SynBioDex/pySBOL3/issues/436)
+        # act2.wasInformedBy = act3.informed_by
+        # Map over all other TopLevel properties and extensions not covered by the constructor
+        self._convert_toplevel(act3, act2)
+
+    # The four visitors below are unimplemented placeholders: each one unconditionally raises
+    # NotImplementedError when the SBOL3 -> SBOL2 visitor reaches an object of that type.
+    # NOTE(review): the `Priority: N` tags presumably rank implementation order - confirm with authors.
+    def visit_agent(self, a: sbol3.Agent):
+        # Priority: 3
+        raise NotImplementedError('Conversion of Agent from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_association(self, a: sbol3.Association):
+        # Priority: 3
+        raise NotImplementedError('Conversion of Association from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_attachment(self, a: sbol3.Attachment):
+        # Priority: 2
+        raise NotImplementedError('Conversion of Attachment from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_binary_prefix(self, a: sbol3.BinaryPrefix):
+        # Priority: 4
+        raise NotImplementedError('Conversion of BinaryPrefix from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_collection(self, coll3: sbol3.Collection):
+        """Convert an SBOL3 Collection into an SBOL2 Collection and add it to `self.doc2`.
+
+        :param coll3: SBOL3 Collection to convert
+        """
+        # Priority: 1
+        # Make the Collection object and add it to the document
+        # Pass the recorded SBOL2 version like the other visitors do, so that a version saved during
+        # SBOL2 -> SBOL3 conversion is not reset to the default on the way back
+        coll2 = sbol2.Collection(coll3.identity, version=self._sbol2_version(coll3))
+        coll2.members = coll3.members
+        self.doc2.addCollection(coll2)
+        # Map over all other TopLevel properties and extensions not covered by the constructor
+        self._convert_toplevel(coll3, coll2)
+
+    def visit_combinatorial_derivation(self, a: sbol3.CombinatorialDerivation):
+        # Unimplemented placeholder: always raises when the visitor reaches a CombinatorialDerivation
+        # Priority: 2
+        raise NotImplementedError('Conversion of CombinatorialDerivation from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_component(self, cp3: sbol3.Component):
+        """Convert an SBOL3 Component into an SBOL2 ComponentDefinition and add it to `self.doc2`.
+
+        Types listed in the map below are translated from SBO to BioPAX terms; other types pass through.
+        :param cp3: SBOL3 Component to convert
+        :raises NotImplementedError: if the component has features, interactions, constraints,
+            an interface, or models
+        """
+        # Remap type if it's one of the ones that needs remapping; otherwise pass through unchanged
+        type_map = {sbol3.SBO_DNA: sbol2.BIOPAX_DNA,  # TODO: distinguish BioPAX Dna from DnaRegion
+                    sbol3.SBO_RNA: sbol2.BIOPAX_RNA,  # TODO: distinguish BioPAX Rna from RnaRegion
+                    sbol3.SBO_PROTEIN: sbol2.BIOPAX_PROTEIN,
+                    sbol3.SBO_SIMPLE_CHEMICAL: sbol2.BIOPAX_SMALL_MOLECULE,
+                    sbol3.SBO_NON_COVALENT_COMPLEX: sbol2.BIOPAX_COMPLEX}
+        types2 = [type_map.get(t, t) for t in cp3.types]
+        # Make the Component object and add it to the document
+        cp2 = sbol2.ComponentDefinition(cp3.identity, types2, version=self._sbol2_version(cp3))
+        self.doc2.addComponentDefinition(cp2)
+        # Convert the Component properties not covered by the constructor
+        cp2.roles = cp3.roles
+        cp2.sequences = cp3.sequences
+        # Child objects are not supported yet: refuse them instead of dropping them silently
+        if cp3.features:
+            raise NotImplementedError('Conversion of Component features from SBOL3 to SBOL2 not yet implemented')
+        if cp3.interactions:
+            raise NotImplementedError('Conversion of Component interactions from SBOL3 to SBOL2 not yet implemented')
+        if cp3.constraints:
+            raise NotImplementedError('Conversion of Component constraints from SBOL3 to SBOL2 not yet implemented')
+        if cp3.interface:
+            raise NotImplementedError('Conversion of Component interface from SBOL3 to SBOL2 not yet implemented')
+        if cp3.models:
+            raise NotImplementedError('Conversion of Component models from SBOL3 to SBOL2 not yet implemented')
+        # Map over all other TopLevel properties and extensions not covered by the constructor
+        self._convert_toplevel(cp3, cp2)
+
+    # The three visitors below are unimplemented placeholders: each one unconditionally raises
+    # NotImplementedError when the SBOL3 -> SBOL2 visitor reaches an object of that type.
+    def visit_component_reference(self, a: sbol3.ComponentReference):
+        # Priority: 3
+        raise NotImplementedError('Conversion of ComponentReference from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_constraint(self, a: sbol3.Constraint):
+        # Priority: 2
+        raise NotImplementedError('Conversion of Constraint from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_cut(self, a: sbol3.Cut):
+        # Priority: 2
+        raise NotImplementedError('Conversion of Cut from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_document(self, doc3: sbol3.Document):
+        """Dispatch this visitor to every object in `doc3.objects`, in the order the document yields them."""
+        for obj in doc3.objects:
+            obj.accept(self)
+
+    # The four visitors below are unimplemented placeholders: each one unconditionally raises
+    # NotImplementedError when the SBOL3 -> SBOL2 visitor reaches an object of that type.
+    def visit_entire_sequence(self, a: sbol3.EntireSequence):
+        # Priority: 3
+        raise NotImplementedError('Conversion of EntireSequence from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_experiment(self, a: sbol3.Experiment):
+        # Priority: 3
+        raise NotImplementedError('Conversion of Experiment from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_experimental_data(self, a: sbol3.ExperimentalData):
+        # Priority: 3
+        raise NotImplementedError('Conversion of ExperimentalData from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_externally_defined(self, a: sbol3.ExternallyDefined):
+        # Priority: 3
+        raise NotImplementedError('Conversion of ExternallyDefined from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_implementation(self, imp3: sbol3.Implementation):
+        """Convert an SBOL3 Implementation into an SBOL2 Implementation and add it to `self.doc2`.
+
+        :param imp3: SBOL3 Implementation to convert; its `built` reference is carried over
+        """
+        # Priority: 1
+        # Make the Implement object and add it to the document
+        imp2 = sbol2.Implementation(imp3.identity, version=self._sbol2_version(imp3))
+        imp2.built = imp3.built
+        self.doc2.addImplementation(imp2)
+        # Map over all other TopLevel properties and extensions not covered by the constructor
+        self._convert_toplevel(imp3, imp2)
+
+    # The ten visitors below are unimplemented placeholders: each one unconditionally raises
+    # NotImplementedError when the SBOL3 -> SBOL2 visitor reaches an object of that type.
+    def visit_interaction(self, a: sbol3.Interaction):
+        # Priority: 2
+        raise NotImplementedError('Conversion of Interaction from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_interface(self, a: sbol3.Interface):
+        # Priority: 3
+        raise NotImplementedError('Conversion of Interface from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_local_sub_component(self, a: sbol3.LocalSubComponent):
+        # Priority: 2
+        raise NotImplementedError('Conversion of LocalSubComponent from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_measure(self, a: sbol3.Measure):
+        # Priority: 3
+        raise NotImplementedError('Conversion of Measure from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_model(self, a: sbol3.Model):
+        # Priority: 3
+        raise NotImplementedError('Conversion of Model from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_participation(self, a: sbol3.Participation):
+        # Priority: 2
+        raise NotImplementedError('Conversion of Participation from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_plan(self, a: sbol3.Plan):
+        # Priority: 3
+        raise NotImplementedError('Conversion of Plan from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_prefixed_unit(self, a: sbol3.PrefixedUnit):
+        # Priority: 4
+        raise NotImplementedError('Conversion of PrefixedUnit from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_range(self, a: sbol3.Range):
+        # Priority: 2
+        raise NotImplementedError('Conversion of Range from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_si_prefix(self, a: sbol3.SIPrefix):
+        # Priority: 4
+        raise NotImplementedError('Conversion of SIPrefix from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_sequence(self, seq3: sbol3.Sequence):
+        """Convert an SBOL3 Sequence into an SBOL2 Sequence and add it to `self.doc2`.
+
+        Encodings listed in the map below are translated to their SBOL2 constants; others pass through.
+        :param seq3: SBOL3 Sequence to convert
+        """
+        # Remap encoding if it's one of the ones that needs remapping; otherwise pass through unchanged
+        encoding_map = {sbol3.IUPAC_DNA_ENCODING: sbol2.SBOL_ENCODING_IUPAC,
+                        sbol3.IUPAC_PROTEIN_ENCODING: sbol2.SBOL_ENCODING_IUPAC_PROTEIN,
+                        sbol3.SMILES_ENCODING: sbol2.SBOL_ENCODING_SMILES}
+        encoding2 = encoding_map.get(seq3.encoding, seq3.encoding)
+        # Make the Sequence object and add it to the document
+        seq2 = sbol2.Sequence(seq3.identity, seq3.elements, encoding=encoding2, version=self._sbol2_version(seq3))
+        self.doc2.addSequence(seq2)
+        # Map over all other TopLevel properties and extensions not covered by the constructor
+        self._convert_toplevel(seq3, seq2)
+
+    # The eight visitors below are unimplemented placeholders: each one unconditionally raises
+    # NotImplementedError when the SBOL3 -> SBOL2 visitor reaches an object of that type.
+    def visit_sequence_feature(self, a: sbol3.SequenceFeature):
+        # Priority: 1
+        raise NotImplementedError('Conversion of SequenceFeature from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_singular_unit(self, a: sbol3.SingularUnit):
+        # Priority: 4
+        raise NotImplementedError('Conversion of SingularUnit from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_sub_component(self, a: sbol3.SubComponent):
+        # Priority: 1
+        raise NotImplementedError('Conversion of SubComponent from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_unit_division(self, a: sbol3.UnitDivision):
+        # Priority: 4
+        raise NotImplementedError('Conversion of UnitDivision from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_unit_exponentiation(self, a: sbol3.UnitExponentiation):
+        # Priority: 4
+        raise NotImplementedError('Conversion of UnitExponentiation from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_unit_multiplication(self, a: sbol3.UnitMultiplication):
+        # Priority: 4
+        raise NotImplementedError('Conversion of UnitMultiplication from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_usage(self, a: sbol3.Usage):
+        # Priority: 3
+        raise NotImplementedError('Conversion of Usage from SBOL3 to SBOL2 not yet implemented')
+
+    def visit_variable_feature(self, a: sbol3.VariableFeature):
+        # Priority: 2
+        raise NotImplementedError('Conversion of VariableFeature from SBOL3 to SBOL2 not yet implemented')
+
+
+class SBOL2To3ConversionVisitor:
+    """This class is used to map every object in an SBOL2 document into an empty SBOL3 document"""
+
+ doc3: sbol3.Document
+ namespaces: list
+
+    def __init__(self, doc2: sbol2.Document, namespaces: list):
+        """Create an empty SBOL3 target document and immediately convert `doc2` into it.
+
+        :param doc2: SBOL2 document to convert; the result is left in `self.doc3`
+        :param namespaces: candidate namespaces, matched as prefixes of object identities when an
+            object carries no explicit backport namespace
+        """
+        # Create the target document
+        self.doc3 = sbol3.Document()
+        self.namespaces = namespaces
+        # # Immediately run the conversion
+        self._convert(doc2)
+
+    def _convert(self, doc2: sbol2.Document):
+        """Run the SBOL2 -> SBOL3 conversion of `doc2` into `self.doc3` by visiting the document.
+
+        :param doc2: SBOL2 document to convert
+        """
+        # Note: namespaces don't need to be bound for SBOL3 documents, which don't usually use XML
+        # We can skip all the preliminaries and just go to conversion
+        self.visit_document(doc2)
+        # TODO: check if there is additional work needed for Annotation & GenericTopLevel conversion
+
+    @staticmethod
+    def _convert_extension_properties(obj2: sbol2.Identified, obj3: sbol3.Identified):
+        """Copy every extension property of `obj2` onto `obj3`.
+
+        A property counts as an extension when its URI starts with none of the prefixes in
+        SBOL2_NON_EXTENSION_PROPERTY_PREFIXES.
+        """
+        extension_properties = (p for p in obj2.properties
+                                if not any(p.startswith(prefix) for prefix in SBOL2_NON_EXTENSION_PROPERTY_PREFIXES))
+        for p in extension_properties:
+            # Copy the value list, as the SBOL3 -> SBOL2 counterpart does, so that later edits to one
+            # document's property values cannot leak into the other document
+            obj3._properties[p] = obj2.properties[p].copy()
+
+    def _convert_identified(self, obj2: sbol2.Identified, obj3: sbol3.Identified):
+        """Map over the other properties of an Identified object
+
+        Copies extension properties, name, description and provenance links from `obj2` to `obj3`,
+        and records the SBOL2 version (when set) under the BACKPORT2_VERSION property.
+        """
+        self._convert_extension_properties(obj2, obj3)
+        # Map over equivalent properties
+        # display_id and namespace are handled during creation
+        if obj2.version:  # Save version for unpacking later if needed
+            # First attach the property to obj3, then assign through it to store the value
+            obj3.sbol2_version = sbol3.TextProperty(obj3, BACKPORT2_VERSION, 0, 1)
+            obj3.sbol2_version = obj2.version
+        obj3.name = obj2.name
+        obj3.description = obj2.description
+        obj3.derived_from = obj2.wasDerivedFrom
+        obj3.generated_by = obj2.wasGeneratedBy
+        # TODO: unpack measures from extension properties
+
+    def _convert_toplevel(self, obj2: sbol2.TopLevel, obj3: sbol3.TopLevel):
+        """Map over the other properties of a TopLevel object
+
+        Besides the Identified properties, this copies the identities of the attachments of `obj2`.
+        """
+        self._convert_identified(obj2, obj3)
+        obj3.attachments = [a.identity for a in obj2.attachments]
+
+    def _sbol3_namespace(self, obj2: sbol2.TopLevel):
+        """Choose the SBOL3 namespace for the object converted from `obj2`.
+
+        An explicit backport namespace property wins; otherwise the first entry of `self.namespaces`
+        that is a prefix of the object's identity is used; otherwise None is returned.
+        :raises ValueError: if the backport namespace property does not hold exactly one value
+        """
+        # If a namespace is explicitly set, that takes priority
+        if BACKPORT3_NAMESPACE in obj2.properties:
+            namespaces = obj2.properties[BACKPORT3_NAMESPACE]
+            if len(namespaces) == 1:
+                return namespaces[0]
+            raise ValueError(f'Object {obj2.identity} backport namespace property should have precisely one value, '
+                             f'but was {namespaces}')
+        # Otherwise take the first provided namespace that prefixes the identity, or None for default behavior
+        return next((candidate for candidate in self.namespaces if obj2.identity.startswith(candidate)), None)
+
+    def visit_activity(self, act2: sbol2.Activity):
+        """Convert an SBOL2 Activity into an SBOL3 Activity and add it to `self.doc3`.
+
+        :param act2: SBOL2 Activity to convert
+        """
+        # Make the Activity object and add it to the document
+        act3 = sbol3.Activity(act2.identity, namespace=self._sbol3_namespace(act2),
+                              start_time=act2.startedAtTime, end_time=act2.endedAtTime)
+        self.doc3.add(act3)
+        # Convert child objects after adding to document
+        if act2.types:  # TODO: wrapping not needed after resolution of https://github.com/SynBioDex/pySBOL2/issues/428
+            act3.types = [act2.types]
+        # Dispatch through this visitor: the visit_* methods are defined on the visitor, not on the
+        # SBOL2 child objects, so `usage.visit_usage(self)` could not resolve.
+        # NOTE(review): visit_usage is assumed to be defined further down this class - confirm.
+        act3.usage = [self.visit_usage(usage) for usage in act2.usages]
+        act3.association = [self.visit_association(assoc) for assoc in act2.associations]
+        # TODO: pySBOL3 is currently missing wasInformedBy (https://github.com/SynBioDex/pySBOL3/issues/436)
+        # act3.informed_by = act2.wasInformedBy
+        # Map over all other TopLevel properties and extensions not covered by the constructor
+        self._convert_toplevel(act2, act3)
+
+    # The three visitors below are unimplemented placeholders: each one unconditionally raises
+    # NotImplementedError when called for an SBOL2 object of that type.
+    # NOTE(review): the `Priority: N` tags presumably rank implementation order - confirm with authors.
+    def visit_agent(self, a: sbol2.Agent):
+        # Priority: 3
+        raise NotImplementedError('Conversion of Agent from SBOL2 to SBOL3 not yet implemented')
+
+    def visit_association(self, a: sbol2.Association):
+        # Priority: 3
+        raise NotImplementedError('Conversion of Association from SBOL2 to SBOL3 not yet implemented')
+
+    def visit_attachment(self, a: sbol2.Attachment):
+        # Priority: 2
+        raise NotImplementedError('Conversion of Attachment from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_collection(self, coll2: sbol2.Collection):
+ # Priority: 1
+ # Make the Collection object and add it to the document
+ coll3 = sbol3.Collection(coll2.identity, members=coll2.members)
+ self.doc3.add(coll3)
+ # Map over all other TopLevel properties and extensions not covered by the constructor
+ self._convert_toplevel(coll2, coll3)
+
+ def visit_combinatorial_derivation(self, a: sbol2.CombinatorialDerivation):
+ """Placeholder for converting an SBOL2 CombinatorialDerivation to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 2
+ raise NotImplementedError('Conversion of CombinatorialDerivation from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_component_definition(self, cd2: sbol2.ComponentDefinition):
+ # Remap type if it's one of the ones that needs remapping; otherwise pass through unchanged
+ type_map = {sbol2.BIOPAX_DNA: sbol3.SBO_DNA,
+ 'http://www.biopax.org/release/biopax-level3.owl#Dna': sbol3.SBO_DNA, # TODO: make reversible
+ sbol2.BIOPAX_RNA: sbol3.SBO_RNA,
+ 'http://www.biopax.org/release/biopax-level3.owl#Rna': sbol3.SBO_RNA, # TODO: make reversible
+ sbol2.BIOPAX_PROTEIN: sbol3.SBO_PROTEIN,
+ sbol2.BIOPAX_SMALL_MOLECULE: sbol3.SBO_SIMPLE_CHEMICAL,
+ sbol2.BIOPAX_COMPLEX: sbol3.SBO_NON_COVALENT_COMPLEX}
+ types3 = [type_map.get(t, t) for t in cd2.types]
+ # Make the Component object and add it to the document
+ cp3 = sbol3.Component(cd2.identity, types3, namespace=self._sbol3_namespace(cd2),
+ roles=cd2.roles, sequences=cd2.sequences)
+ self.doc3.add(cp3)
+ # Convert the Component properties not covered by the constructor
+ if cd2.components:
+ raise NotImplementedError('Conversion of ComponentDefinition components '
+ 'from SBOL2 to SBOL3 not yet implemented')
+ if cd2.sequenceAnnotations:
+ raise NotImplementedError('Conversion of ComponentDefinition sequenceAnnotations '
+ 'from SBOL2 to SBOL3 not yet implemented')
+ if cd2.sequenceConstraints:
+ raise NotImplementedError('Conversion of ComponentDefinition sequenceConstraints '
+ 'from SBOL2 to SBOL3 not yet implemented')
+ # Map over all other TopLevel properties and extensions not covered by the constructor
+ self._convert_toplevel(cd2, cp3)
+
+ def visit_component(self, a: sbol2.Component):
+ """Placeholder for converting an SBOL2 Component to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 2
+ raise NotImplementedError('Conversion of Component from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_cut(self, a: sbol2.Cut):
+ """Placeholder for converting an SBOL2 Cut to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 2
+ raise NotImplementedError('Conversion of Cut from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_document(self, doc2: sbol2.Document):
+ for obj in doc2.componentDefinitions:
+ self.visit_component_definition(obj)
+ for obj in doc2.moduleDefinitions:
+ self.visit_module_definition(obj)
+ for obj in doc2.models:
+ self.visit_model(obj)
+ for obj in doc2.sequences:
+ self.visit_sequence(obj)
+ for obj in doc2.collections:
+ self.visit_collection(obj)
+ for obj in doc2.activities:
+ self.visit_activity(obj)
+ for obj in doc2.plans:
+ self.visit_plan(obj)
+ for obj in doc2.agents:
+ self.visit_agent(obj)
+ for obj in doc2.attachments:
+ self.visit_attachment(obj)
+ for obj in doc2.combinatorialderivations:
+ self.visit_combinatorial_derivation(obj)
+ for obj in doc2.implementations:
+ self.visit_implementation(obj)
+ for obj in doc2.experiments:
+ self.visit_experiment(obj)
+ for obj in doc2.experimentalData:
+ self.visit_experimental_data(obj)
+ # TODO: handle "standard extensions" in pySBOL2:
+ # designs, builds, tests, analyses, sampleRosters, citations, keywords
+
+ def visit_experiment(self, a: sbol2.Experiment):
+ """Placeholder for converting an SBOL2 Experiment to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 3
+ raise NotImplementedError('Conversion of Experiment from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_experimental_data(self, a: sbol2.ExperimentalData):
+ """Placeholder for converting SBOL2 ExperimentalData to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 3
+ raise NotImplementedError('Conversion of ExperimentalData from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_functional_component(self, a: sbol2.FunctionalComponent):
+ """Placeholder for converting an SBOL2 FunctionalComponent to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 3
+ raise NotImplementedError('Conversion of FunctionalComponent from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_generic_location(self, a: sbol2.GenericLocation):
+ """Placeholder for converting an SBOL2 GenericLocation to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 3
+ raise NotImplementedError('Conversion of GenericLocation from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_implementation(self, imp2: sbol2.Implementation):
+ # Priority: 1
+ # Make the Implementation object and add it to the document
+ imp3 = sbol3.Implementation(imp2.identity, namespace=self._sbol3_namespace(imp2), built=imp2.built)
+ self.doc3.add(imp3)
+ # Map over all other TopLevel properties and extensions not covered by the constructor
+ self._convert_toplevel(imp2, imp3)
+
+ def visit_interaction(self, a: sbol2.Interaction):
+ """Placeholder for converting an SBOL2 Interaction to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 2
+ raise NotImplementedError('Conversion of Interaction from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_maps_to(self, a: sbol2.mapsto.MapsTo):
+ """Placeholder for converting an SBOL2 MapsTo to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 3
+ raise NotImplementedError('Conversion of MapsTo from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_measure(self, a: sbol2.measurement.Measurement):
+ """Placeholder for converting an SBOL2 Measurement to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 3
+ raise NotImplementedError('Conversion of Measure from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_model(self, a: sbol2.model.Model):
+ """Placeholder for converting an SBOL2 Model to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 3
+ raise NotImplementedError('Conversion of Model from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_module(self, a: sbol2.Module):
+ """Placeholder for converting an SBOL2 Module to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 3
+ raise NotImplementedError('Conversion of Module from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_module_definition(self, a: sbol2.ModuleDefinition):
+ """Placeholder for converting an SBOL2 ModuleDefinition to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 3
+ raise NotImplementedError('Conversion of ModuleDefinition from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_participation(self, a: sbol2.Participation):
+ """Placeholder for converting an SBOL2 Participation to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 2
+ raise NotImplementedError('Conversion of Participation from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_plan(self, a: sbol2.Plan):
+ """Placeholder for converting an SBOL2 Plan to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 3
+ raise NotImplementedError('Conversion of Plan from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_range(self, a: sbol2.Range):
+ """Placeholder for converting an SBOL2 Range to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 2
+ raise NotImplementedError('Conversion of Range from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_sequence(self, seq2: sbol2.Sequence):
+ # Remap encoding if it's one of the ones that needs remapping; otherwise pass through unchanged
+ encoding_map = {sbol2.SBOL_ENCODING_IUPAC: sbol3.IUPAC_DNA_ENCODING,
+ sbol2.SBOL_ENCODING_IUPAC_PROTEIN: sbol3.IUPAC_PROTEIN_ENCODING,
+ sbol2.SBOL_ENCODING_SMILES: sbol3.SMILES_ENCODING}
+ encoding3 = encoding_map.get(seq2.encoding, seq2.encoding)
+ # Make the Sequence object and add it to the document
+ seq3 = sbol3.Sequence(seq2.identity, namespace=self._sbol3_namespace(seq2),
+ elements=seq2.elements, encoding=encoding3)
+ self.doc3.add(seq3)
+ # Map over all other TopLevel properties and extensions not covered by the constructor
+ self._convert_toplevel(seq2, seq3)
+
+ def visit_sequence_annotation(self, seq2: sbol2.SequenceAnnotation):
+ """Placeholder for converting an SBOL2 SequenceAnnotation to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 1
+ raise NotImplementedError('Conversion of SequenceAnnotation from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_sequence_constraint(self, seq2: sbol2.sequenceconstraint.SequenceConstraint):
+ """Placeholder for converting an SBOL2 SequenceConstraint to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 2
+ raise NotImplementedError('Conversion of SequenceConstraint from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_usage(self, a: sbol2.Usage):
+ """Placeholder for converting an SBOL2 Usage to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 3
+ raise NotImplementedError('Conversion of Usage from SBOL2 to SBOL3 not yet implemented')
+
+ def visit_variable_component(self, a: sbol2.VariableComponent):
+ """Placeholder for converting an SBOL2 VariableComponent to SBOL3; currently always raises NotImplementedError"""
+ # Priority: 2
+ raise NotImplementedError('Conversion of VariableComponent from SBOL2 to SBOL3 not yet implemented')
+
+
+def convert3to2(doc3: sbol3.Document) -> sbol2.Document:
+ """Convert an SBOL3 document to an SBOL2 document
+
+ :param doc3: SBOL3 document to convert
+ :returns: SBOL2 document
+ """
+ converter = SBOL3To2ConversionVisitor(doc3)
+ return converter.doc2
+
+
+def convert2to3(doc2: sbol2.Document, namespaces=None) -> sbol3.Document:
+ """Convert an SBOL2 document to an SBOL3 document
+
+ :param doc2: SBOL2 document to convert
+ :param namespaces: list of URI prefixes to treat as namespaces
+ :returns: SBOL3 document
+ """
+ converter = SBOL2To3ConversionVisitor(doc2, namespaces)
+ return converter.doc3
diff --git a/sbol_utilities/so2gb.csv b/sbol_utilities/so2gb.csv
new file mode 100644
index 00000000..0865ccb0
--- /dev/null
+++ b/sbol_utilities/so2gb.csv
@@ -0,0 +1,79 @@
+SO_Ontology,GenBank_Ontology
+https://identifiers.org/SO:0001023,allele
+https://identifiers.org/SO:0000730,assembly_gap
+https://identifiers.org/SO:0002174,assembly_gap
+https://identifiers.org/SO:0000140,attenuator
+https://identifiers.org/SO:0001834,C_region
+https://identifiers.org/SO:0000172,CAAT_signal
+https://identifiers.org/SO:0000316,CDS
+https://identifiers.org/SO:0000577,centromere
+https://identifiers.org/SO:0000297,D-loop
+https://identifiers.org/SO:0000458,D_segment
+https://identifiers.org/SO:0000165,enhancer
+https://identifiers.org/SO:0000147,exon
+https://identifiers.org/SO:0000704,gene
+https://identifiers.org/SO:0000173,GC_signal
+https://identifiers.org/SO:0000723,iDNA
+https://identifiers.org/SO:0000188,intron
+https://identifiers.org/SO:0000470,J_region
+https://identifiers.org/SO:0000286,LTR
+https://identifiers.org/SO:0000419,mat_peptide
+https://identifiers.org/SO:0000409,misc_binding
+https://identifiers.org/SO:0000413,misc_difference
+https://identifiers.org/SO:0000001,misc_feature
+https://identifiers.org/SO:0001411,misc_feature
+https://identifiers.org/SO:0001645,misc_marker
+https://identifiers.org/SO:0000298,misc_recomb
+https://identifiers.org/SO:0000233,misc_RNA
+https://identifiers.org/SO:0000673,misc_RNA
+https://identifiers.org/SO:0005836,regulatory
+https://identifiers.org/SO:0000002,misc_structure
+https://identifiers.org/SO:0001037,mobile_element
+https://identifiers.org/SO:0000305,modified_base
+https://identifiers.org/SO:0000234,mRNA
+https://identifiers.org/SO:0001835,N_region
+https://identifiers.org/SO:0000655,ncRNA
+https://identifiers.org/SO:0000178,operon
+https://identifiers.org/SO:0000724,oriT
+https://identifiers.org/SO:0000551,polyA_signal
+https://identifiers.org/SO:0000553,polyA_site
+https://identifiers.org/SO:0000185,precursor_RNA
+https://identifiers.org/SO:0000112,primer
+https://identifiers.org/SO:0005850,primer_bind
+https://identifiers.org/SO:0000167,promoter
+https://identifiers.org/SO:0001062,propeptide
+https://identifiers.org/SO:0000410,protein_bind
+https://identifiers.org/SO:0000139,RBS
+https://identifiers.org/SO:0000552,RBS
+https://identifiers.org/SO:0000296,rep_origin
+https://identifiers.org/SO:0000657,repeat_region
+https://identifiers.org/SO:0000726,repeat_unit
+https://identifiers.org/SO:0000252,rRNA
+https://identifiers.org/SO:0001836,S_region
+https://identifiers.org/SO:0000005,satellite
+https://identifiers.org/SO:0000013,scRNA
+https://identifiers.org/SO:0000418,sig_peptide
+https://identifiers.org/SO:0000274,snRNA
+https://identifiers.org/SO:0000149,source
+https://identifiers.org/SO:0002206,source
+https://identifiers.org/SO:0000019,stem_loop
+https://identifiers.org/SO:0000313,stem_loop
+https://identifiers.org/SO:0000331,STS
+https://identifiers.org/SO:0000174,TATA_signal
+https://identifiers.org/SO:0000624,telomere
+https://identifiers.org/SO:0000141,terminator
+https://identifiers.org/SO:0000584,tmRNA
+https://identifiers.org/SO:0000725,transit_peptide
+https://identifiers.org/SO:0001054,transposon
+https://identifiers.org/SO:0000253,tRNA
+https://identifiers.org/SO:0001086,unsure
+https://identifiers.org/SO:0001833,V_region
+https://identifiers.org/SO:0000109,variation
+https://identifiers.org/SO:0001060,variation
+https://identifiers.org/SO:0000466,V_segment
+https://identifiers.org/SO:0000175,-10_signal
+https://identifiers.org/SO:0000176,-35_signal
+https://identifiers.org/SO:0000557,3'clip
+https://identifiers.org/SO:0000205,3'UTR
+https://identifiers.org/SO:0000555,5'clip
+https://identifiers.org/SO:0000204,5'UTR
diff --git a/sbol_utilities/workarounds.py b/sbol_utilities/workarounds.py
index 5cdc8757..e0e083f0 100644
--- a/sbol_utilities/workarounds.py
+++ b/sbol_utilities/workarounds.py
@@ -21,6 +21,21 @@ def tyto_lookup_with_caching(term: str) -> str:
return tyto.SO.get_uri_by_term(term)
+# TODO: remove kludge after resolution of https://github.com/SynBioDex/tyto/issues/75
+def tyto_normalize_term(ontology: tyto.Ontology, uri: str) -> Optional[str]:
+ """Change an ontology term into the "standard" form returned by tyto.ontology.get_uri_by_term
+ Workaround for tyto issue https://github.com/SynBioDex/tyto/issues/75, which will be removed after that
+ issue is addressed.
+
+ :param ontology: Ontology containing term
+ :param uri: URI to be normalized
+ :return: normalized URI (or None if term is not in the ontology)
+ """
+ try:
+ return ontology.get_uri_by_term(ontology.get_term_by_uri(uri))
+ except LookupError:
+ return None
+
#########################
# This file contains workarounds for known issues in pySBOL3
# They will be removed when pySBOL3 upgrades fix the associated issues
diff --git a/setup.py b/setup.py
index 00a04cc5..7c355ed3 100644
--- a/setup.py
+++ b/setup.py
@@ -28,19 +28,21 @@
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
- 'Programming Language :: Python :: 3.10'
+ 'Programming Language :: Python :: 3.10',
+ 'Programming Language :: Python :: 3.11'
],
# What does your project relate to?
keywords='synthetic biology',
install_requires=[
- 'sbol3>=1.0b11',
+ 'sbol3>=1.1',
'sbol2>=1.4',
'rdflib>=6.2',
'biopython',
'graphviz',
- 'tyto>=1.2.1',
+ 'tyto>=1.4',
'openpyxl',
- 'sbol_factory>=1.0a11',
+ 'requests',
+ 'sbol_factory>=1.1',
'pydna'
],
extras_require={ # requirements for development
@@ -51,16 +53,23 @@
'graph-sbol=sbol_utilities.graph_sbol:main',
'sbol-expand-derivations=sbol_utilities.expand_combinatorial_derivations:main',
'sbol-calculate-sequences=sbol_utilities.calculate_sequences:main',
+ 'sbol-calculate-complexity=sbol_utilities.calculate_complexity_scores:main',
'sbol-converter=sbol_utilities.conversion:main',
- 'sbol2to3=sbol_utilities.conversion:sbol2to3',
- 'sbol3to2=sbol_utilities.conversion:sbol3to2',
- 'sbol2genbank=sbol_utilities.conversion:sbol2genbank',
- 'sbol2fasta=sbol_utilities.conversion:sbol2fasta',
- 'genbank2sbol=sbol_utilities.conversion:genbank2sbol',
- 'fasta2sbol=sbol_utilities.conversion:fasta2sbol',
+ 'sbol2-to-sbol3=sbol_utilities.conversion:sbol2to3',
+ 'sbol3-to-sbol2=sbol_utilities.conversion:sbol3to2',
+ 'sbol-to-genbank=sbol_utilities.conversion:sbol2genbank',
+ 'sbol-to-fasta=sbol_utilities.conversion:sbol2fasta',
+ 'genbank-to-sbol=sbol_utilities.conversion:genbank2sbol',
+ 'fasta-to-sbol=sbol_utilities.conversion:fasta2sbol',
'sbol-diff=sbol_utilities.sbol_diff:main']
},
packages=['sbol_utilities'],
- package_data={'sbol_utilities': ['sbolgraph-standalone.js']},
+ package_data={
+ 'sbol_utilities': [
+ 'gb2so.csv',
+ 'sbolgraph-standalone.js',
+ 'so2gb.csv',
+ ],
+ },
include_package_data=True
)
diff --git a/test/helpers.py b/test/helpers.py
index 32779306..c08c43d6 100644
--- a/test/helpers.py
+++ b/test/helpers.py
@@ -2,7 +2,7 @@
import tempfile
import os
from shutil import copy
-from typing import List, Dict
+from typing import List, Dict, Union
def copy_to_tmp(package: List[str] = None, renames: Dict[str, str] = None) -> str:
@@ -29,7 +29,7 @@ def copy_to_tmp(package: List[str] = None, renames: Dict[str, str] = None) -> st
return tmp_sub
-def assert_files_identical(file1: os.PathLike, file2: os.PathLike) -> None:
+def assert_files_identical(file1: Union[os.PathLike, str], file2: Union[os.PathLike, str]) -> None:
"""check if two files are identical; if not, report their diff
:param file1: path of first file to compare
:param file2: path of second file to compare
diff --git a/test/test_calculate_complexity_scores.py b/test/test_calculate_complexity_scores.py
new file mode 100644
index 00000000..70bddee8
--- /dev/null
+++ b/test/test_calculate_complexity_scores.py
@@ -0,0 +1,90 @@
+"""Tests for calculating sequence synthesis complexity scores via the IDT interface
+
+To run these tests, you will need IDT access credentials (see: https://www.idtdna.com/pages/tools/apidoc)
+The values of the IDT access credentials should be stored in a file in the top level directory called
+'test_secret_idt_credentials.json', with the contents of the form:
+{ "username": "username", "password": "password", "ClientID": "####", "ClientSecret": "XXXXXXXXXXXXXXXXXXX" }
+"""
+from pathlib import Path
+
+import json
+
+import unittest
+import sys
+import tempfile
+import sbol3
+from unittest.mock import patch
+from sbol_utilities.calculate_complexity_scores import IDTAccountAccessor, idt_calculate_complexity_scores, \
+ idt_calculate_sequence_complexity_scores, get_complexity_scores
+import sbol_utilities.sbol_diff
+
+# TODO: add to readme
+
+def same_except_timestamps(doc1: sbol3.Document, doc2: sbol3.Document) -> bool:
+ """Check that the only triple-level difference between two SBOL documents is their time-stamps
+
+ :param doc1: first document to compare
+ :param doc2: second document to compare
+ :returns: True if identical, false if not
+ """
+ _, first_graph, second_graph = sbol_utilities.sbol_diff._diff_graphs(doc1.graph(), doc2.graph())
+ replaced_subject = 'http://igem.org/IDT_complexity_score/Complexity_Report_20230516T194547Z_a2efceb0'
+ # Return true only if all differences are time-stamps or the activity name
+ ignored_predicates = {sbol3.PROV_ENDED_AT_TIME, sbol3.SBOL_DISPLAY_ID}
+ return all(p1 == p2 and (str(p1) in ignored_predicates or
+ (str(s1) == replaced_subject and o1 == o2) or
+ (s1 == s2 and str(o1) == replaced_subject))
+ for (s1, p1, o1), (s2, p2, o2) in zip(sorted(first_graph), sorted(second_graph)))
+
+
+class TestIDTCalculateComplexityScore(unittest.TestCase):
+
+ @unittest.skipIf(sys.platform == 'win32', reason='Not working on Windows https://github.com/SynBioDex/SBOL-utilities/issues/221')
+ def test_IDT_calculate_complexity_score(self):
+ """Test that a library-call invocation of complexity scoring works"""
+ test_dir = Path(__file__).parent
+ with open(test_dir.parent / 'test_secret_idt_credentials.json') as test_credentials:
+ idt_accessor = IDTAccountAccessor.from_json(json.load(test_credentials))
+
+ doc = sbol3.Document()
+ doc.read(test_dir / 'test_files' / 'BBa_J23101.nt')
+
+ # Check the scores - they should initially be all missing
+ sequences = [obj for obj in doc if isinstance(obj, sbol3.Sequence)]
+ scores = get_complexity_scores(sequences)
+ self.assertEqual(scores, dict())
+ # Compute sequences for
+ results = idt_calculate_sequence_complexity_scores(idt_accessor, sequences)
+ self.assertEqual(len(results), 1)
+ self.assertEqual(results[sequences[0]], 0) # score is zero because the sequence both short and easy
+ scores = get_complexity_scores(sequences)
+ self.assertEqual(scores, results)
+
+ # Compute results again: results should be blank, because the calculation is already made
+ results = idt_calculate_complexity_scores(idt_accessor, doc)
+ self.assertEqual(len(results), 0)
+ self.assertEqual(results, dict())
+ scores = get_complexity_scores(sequences)
+ self.assertEqual(scores, {sequences[0]: 0})
+
+ @unittest.skipIf(sys.platform == 'win32', reason='Not working on Windows https://github.com/SynBioDex/SBOL-utilities/issues/221')
+ def test_commandline(self):
+ """Test that a command-line invocation of complexity scoring works"""
+ test_dir = Path(__file__).parent
+ temp_name = tempfile.mkstemp(suffix='.nt')[1]
+ test_args = ['calculate_complexity_scores.py',
+ '--credentials', str(test_dir.parent / 'test_secret_idt_credentials.json'),
+ str(test_dir / 'test_files' / 'Test_file_Complexity_Scores.nt'), temp_name]
+ with patch.object(sys, 'argv', test_args):
+ sbol_utilities.calculate_complexity_scores.main()
+
+ # Compare expected results to actual output file
+ expected = sbol3.Document()
+ expected.read(test_dir / 'test_files' / 'Comparison_file_Complexity_Scores.nt')
+ generated = sbol3.Document()
+ generated.read(temp_name)
+ self.assertTrue(same_except_timestamps(expected, generated))
+
+
+# Allow this test module to be run directly as a script
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test_conversion.py b/test/test_conversion.py
index d16db9a7..2411eaab 100644
--- a/test/test_conversion.py
+++ b/test/test_conversion.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
import sys
import tempfile
import unittest
@@ -12,17 +14,19 @@
from sbol_utilities.conversion import convert2to3, convert3to2, convert_to_genbank, convert_to_fasta, \
convert_from_fasta, convert_from_genbank, \
main, sbol2fasta, sbol2genbank, sbol2to3, sbol3to2, fasta2sbol, genbank2sbol
-from helpers import copy_to_tmp
+from sbol_utilities.sbol3_genbank_conversion import GenBankSBOL3Converter
+from helpers import copy_to_tmp, assert_files_identical
from sbol_utilities.sbol_diff import doc_diff
# TODO: Add command-line utilities and test them too
+TEST_FILES = Path(__file__).parent / 'test_files'
+
class Test2To3Conversion(unittest.TestCase):
def test_convert_identities(self):
"""Test conversion of a complex file"""
test_dir = os.path.dirname(os.path.realpath(__file__))
- input_path = os.path.join(test_dir, 'test_files', 'sbol3-small-molecule.rdf')
- doc = convert2to3(input_path)
+ doc = convert2to3(str(TEST_FILES / 'sbol3-small-molecule.rdf'))
# check for issues in converted document
report = doc.validate()
assert len(report) == 0, "\n".join(str(issue) for issue in report)
@@ -59,7 +63,7 @@ def test_3to2_conversion(self):
sbol2.Config.setOption(sbol2.ConfigOptions.VALIDATE_ONLINE, validate_online)
assert len(doc2.componentDefinitions) == 1, f'Expected 1 CD, but found {len(doc2.componentDefinitions)}'
# TODO: bring this back after resolution of https://github.com/sboltools/sbolgraph/issues/15
- #assert len(doc2.activities) == 1, f'Expected 1 Activity, but found {len(doc2.activities)}'
+ # assert len(doc2.activities) == 1, f'Expected 1 Activity, but found {len(doc2.activities)}'
assert len(doc2.sequences) == 1, f'Expected 1 Sequence, but found {len(doc2.sequences)}'
assert doc2.componentDefinitions[0].identity == 'https://synbiohub.org/public/igem/BBa_J23101'
assert doc2.componentDefinitions[0].sequences[0] == 'https://synbiohub.org/public/igem/BBa_J23101_sequence'
@@ -148,16 +152,14 @@ def test_genbank_conversion(self):
# Convert to GenBank and check contents
outfile = os.path.join(tmp_sub, 'BBa_J23101.gb')
convert_to_genbank(doc3, outfile)
-
- test_dir = os.path.dirname(os.path.realpath(__file__))
- comparison_file = os.path.join(test_dir, 'test_files', 'BBa_J23101.gb')
- assert filecmp.cmp(outfile, comparison_file), f'Converted GenBank file {comparison_file} is not identical'
+ assert_files_identical(outfile, TEST_FILES / 'BBa_J23101.gb')
def test_conversion_from_genbank(self):
"""Test ability to convert from GenBank to SBOL3"""
# Get the GenBank test document and convert
tmp_sub = copy_to_tmp(package=['BBa_J23101.gb'])
- doc3 = convert_from_genbank(os.path.join(tmp_sub, 'BBa_J23101.gb'), 'https://synbiohub.org/public/igem')
+ doc3 = convert_from_genbank(os.path.join(tmp_sub, 'BBa_J23101.gb'), 'https://synbiohub.org/public/igem',
+ force_new_converter=False)
# Note: cannot directly round-trip because converter is a) lossy, and b) inserts extra materials
test_dir = os.path.dirname(os.path.realpath(__file__))
@@ -166,6 +168,35 @@ def test_conversion_from_genbank(self):
comparison_doc.read(comparison_file)
assert not doc_diff(doc3, comparison_doc), f'Converted GenBank file not identical to {comparison_file}'
+ def test_genbank_conversion_new_converter(self):
+ """Test ability to convert from SBOL3 to GenBank using new converter
+ by specifying the `--force-new-converter` flag """
+ # Get the SBOL3 test document
+ tmp_sub = copy_to_tmp(package=['sbol3_genbank_conversion/BBa_J23101_from_genbank_to_sbol3_direct.nt'])
+ doc3 = sbol3.Document()
+ doc3.read(os.path.join(tmp_sub, 'BBa_J23101_from_genbank_to_sbol3_direct.nt'))
+ # Convert to GenBank and check contents
+ outfile = os.path.join(tmp_sub, 'BBa_J23101.gb')
+ convert_to_genbank(doc3=doc3, path=outfile, allow_genbank_online=False, force_new_converter=True)
+ assert_files_identical(outfile, TEST_FILES / 'sbol3_genbank_conversion' / 'BBa_J23101_from_sbol3_direct.gb')
+
+ def test_conversion_from_genbank_new_converter(self):
+ """Test ability to convert from GenBank to SBOL3 using new converter
+ by specifying the `--force-new-converter` flag """
+ # Get the GenBank test document and convert
+ tmp_sub = copy_to_tmp(package=['BBa_J23101.gb'])
+ doc3 = convert_from_genbank(path=os.path.join(tmp_sub, 'BBa_J23101.gb'),
+ namespace=GenBankSBOL3Converter.TEST_NAMESPACE,
+ allow_genbank_online=False,
+ force_new_converter=True)
+
+ # Note: cannot directly round-trip because converter is a) lossy, and b) inserts extra materials
+ test_dir = os.path.dirname(os.path.realpath(__file__))
+ comparison_file = os.path.join(test_dir, 'test_files', 'sbol3_genbank_conversion', 'BBa_J23101_from_genbank_to_sbol3_direct.nt')
+ comparison_doc = sbol3.Document()
+ comparison_doc.read(comparison_file)
+ assert not doc_diff(doc3, comparison_doc), f'Converted SBOL3 file not identical to {comparison_file}'
+
def test_genbank_multi_conversion(self):
"""Test ability to convert from SBOL3 to GenBank"""
# Get the SBOL3 test document
@@ -175,11 +206,8 @@ def test_genbank_multi_conversion(self):
# Convert to GenBank and check contents
outfile = os.path.join(tmp_sub, 'iGEM_SBOL2_imports.gb')
- convert_to_genbank(doc3, outfile)
-
- test_dir = os.path.dirname(os.path.realpath(__file__))
- comparison_file = os.path.join(test_dir, 'test_files', 'iGEM_SBOL2_imports.gb')
- assert filecmp.cmp(outfile, comparison_file), f'Converted GenBank file {comparison_file} is not identical'
+ convert_to_genbank(doc3, outfile, force_new_converter=False)
+ assert_files_identical(outfile, TEST_FILES / 'iGEM_SBOL2_imports.gb')
def test_fasta_conversion(self):
"""Test ability to convert from SBOL3 to FASTA"""
@@ -250,33 +278,33 @@ def test_commandline(self):
assert filecmp.cmp(temp_name, test_file['sbol3']), f'Converted file {temp_name} is not identical'
# Run the other six tests
- test_args = ['fasta2sbol', '-o', temp_name, '-n', 'https://synbiohub.org/public/igem', test_file['fasta']]
+ test_args = ['fasta-to-sbol', '-o', temp_name, '-n', 'https://synbiohub.org/public/igem', test_file['fasta']]
with patch.object(sys, 'argv', test_args):
fasta2sbol()
assert filecmp.cmp(temp_name, test_file['from_fasta']), f'Converted file {temp_name} is not identical'
# genbank conversion should succeed the same way when not online if not given an online argument
- test_args = ['genbank2sbol', '-o', temp_name, '-n', 'https://synbiohub.org/public/igem', test_file['genbank']]
+ test_args = ['genbank-to-sbol', '-o', temp_name, '-n', 'https://synbiohub.org/public/igem', test_file['genbank']]
with patch.object(sys, 'argv', test_args):
genbank2sbol()
assert filecmp.cmp(temp_name, test_file['from_genbank']), f'Converted file {temp_name} is not identical'
- test_args = ['sbol2fasta', '-o', temp_name, test_file['sbol3']]
+ test_args = ['sbol-to-fasta', '-o', temp_name, test_file['sbol3']]
with patch.object(sys, 'argv', test_args):
sbol2fasta()
assert filecmp.cmp(temp_name, test_file['fasta']), f'Converted file {temp_name} is not identical'
- test_args = ['sbol2genbank', '-o', temp_name, test_file['sbol3']]
+ test_args = ['sbol-to-genbank', '-o', temp_name, test_file['sbol3']]
with patch.object(sys, 'argv', test_args):
sbol2genbank()
assert filecmp.cmp(temp_name, test_file['genbank']), f'Converted file {temp_name} is not identical'
# SBOL2 serialization is not stable, so test via round-trip instead
- test_args = ['sbol3to2', '-o', temp_name, test_file['sbol3']]
+ test_args = ['sbol3-to-sbol2', '-o', temp_name, test_file['sbol3']]
with patch.object(sys, 'argv', test_args):
sbol3to2()
temp_name_2 = tempfile.mkstemp()[1]
- test_args = ['sbol2to3', '-o', temp_name_2, temp_name]
+ test_args = ['sbol2-to-sbol3', '-o', temp_name_2, temp_name]
with patch.object(sys, 'argv', test_args):
sbol2to3()
assert filecmp.cmp(temp_name_2, test_file['sbol323']), f'Converted file {temp_name} is not identical'
@@ -290,7 +318,7 @@ def test_online_conversion(self):
'from_genbank': os.path.join(test_files, 'BBa_J23101_from_genbank.nt'),
}
- test_args = ['genbank2sbol', '-o', temp_name, '-n', 'https://synbiohub.org/public/igem', test_file['genbank'],
+ test_args = ['genbank-to-sbol', '-o', temp_name, '-n', 'https://synbiohub.org/public/igem', test_file['genbank'],
'--allow-genbank-online']
with patch.object(sys, 'argv', test_args):
genbank2sbol()
diff --git a/test/test_files/BBa_J23101.xml b/test/test_files/BBa_J23101.xml
new file mode 100644
index 00000000..62c8c727
--- /dev/null
+++ b/test/test_files/BBa_J23101.xml
@@ -0,0 +1,64 @@
+
+
+ later
+
+
+ 1
+ N/A
+ In stock
+ BBa_J23101
+ Released HQ 2013
+ true
+ later
+
+ true
+
+ true
+
+
+
+ false
+ 2015-08-31T04:08:40Z
+
+ 2006-08-03T11:00:00Z
+
+ BBa_J23101
+ constitutive promoter family member
+
+
+ 483
+ 95
+ _52_
+
+ John Anderson
+ 0
+
+
+
+ BBa_J23101_sequence
+
+
+
+
+
+
+
+ tttacagctagctcagtcctaggtattatgctagc
+
+ 1
+
+
+ Chris J. Myers
+
+ 2017-03-06T15:00:00+00:00
+ 1
+
+
+ James Alastair McLaughlin
+
+
+ igem2sbol
+ iGEM to SBOL conversion
+ Conversion of the iGEM parts registry to SBOL2.1
+
+
diff --git a/test/test_files/BBa_J23101_patched.nt b/test/test_files/BBa_J23101_patched.nt
new file mode 100644
index 00000000..08763480
--- /dev/null
+++ b/test/test_files/BBa_J23101_patched.nt
@@ -0,0 +1,57 @@
+ "John Anderson" .
+ "2006-08-03T11:00:00Z" .
+ "2015-08-31T04:08:40Z" .
+ "constitutive promoter family member" .
+ "BBa_J23101" .
+ .
+ .
+ "BBa_J23101" .
+ .
+ .
+ .
+ "1" .
+ "false" .
+ "true" .
+ .
+ "_52_" .
+ "0" .
+ "483" .
+ "95" .
+ "Released HQ 2013" .
+ "In stock" .
+ .
+ "true" .
+ "later" .
+ "N/A" .
+ "later" .
+ .
+ .
+ "true" .
+ .
+ .
+ .
+ .
+ "BBa_J23101_sequence" .
+ "tttacagctagctcagtcctaggtattatgctagc" .
+ .
+ .
+ "1" .
+ .
+ .
+ .
+ .
+ .
+ .
+ "Chris J. Myers" .
+ "James Alastair McLaughlin" .
+ "Conversion of the iGEM parts registry to SBOL2.1" .
+ "iGEM to SBOL conversion" .
+ "igem2sbol" .
+ .
+ "1" .
+ .
+ .
+ .
+ .
+ .
+ "2017-03-06T15:00:00+00:00"^^ .
diff --git a/test/test_files/BBa_J23101_v2.gb b/test/test_files/BBa_J23101_v2.gb
new file mode 100644
index 00000000..9fc3c5a4
--- /dev/null
+++ b/test/test_files/BBa_J23101_v2.gb
@@ -0,0 +1,12 @@
+LOCUS BBa_J23101 35 bp DNA linear UNK 01-JAN-1980
+DEFINITION constitutive promoter family member.
+ACCESSION BBa_J23101
+VERSION BBa_J23101.2
+KEYWORDS .
+SOURCE .
+ ORGANISM .
+ .
+FEATURES Location/Qualifiers
+ORIGIN
+ 1 tttacagcta gctcagtcct aggtattatg ctagc
+//
diff --git a/test/test_files/Comparison_file_Complexity_Scores.nt b/test/test_files/Comparison_file_Complexity_Scores.nt
new file mode 100644
index 00000000..5e8acc14
--- /dev/null
+++ b/test/test_files/Comparison_file_Complexity_Scores.nt
@@ -0,0 +1,58 @@
+ "Complexity_Report_20230516T194547Z_a2efceb0" .
+ .
+ .
+ .
+ .
+ "2023-05-16T19:45:47+00:00"^^ .
+ "Measure1" .
+ .
+ "13.2"^^ .
+ .
+ .
+ .
+ .
+ "X2018_Interlab_Devices_BBa_I20270" .
+ "ttgatggctagctcagtcctaggtacaatgctagctactagagtcacacaggaaagtactagatgcgtaaaggagaagaacttttcactggagttgtcccaattcttgttgaattagatggtgatgttaatgggcacaaattttctgtcagtggagagggtgaaggtgatgcaacatacggaaaacttacccttaaatttatttgcactactggaaaactacctgttccatggccaacacttgtcactactttcggttatggtgttcaatgctttgcgagatacccagatcatatgaaacagcatgactttttcaagagtgccatgcccgaaggttatgtacaggaaagaactatatttttcaaagatgacgggaactacaagacacgtgctgaagtcaagtttgaaggtgatacccttgttaatagaatcgagttaaaaggtattgattttaaagaagatggaaacattcttggacacaaattggaatacaactataactcacacaatgtatacatcatggcagacaaacaaaagaatggaatcaaagttaacttcaaaattagacacaacattgaagatggaagcgttcaactagcagaccattatcaacaaaatactccaattggcgatggccctgtccttttaccagacaaccattacctgtccacacaatctgccctttcgaaagatcccaacgaaaagagagaccacatggtccttcttgagtttgtaacagctgctgggattacacatggcatggatgaactatacaaataataatactagagccaggcatcaaataaaacgaaaggctcagtcgaaagactgggcctttcgttttatctgttgtttgtcggtgaacgctctctactagagtcacactggctcaccttcgggtgggcctttctgcgtttata" .
+ .
+ .
+ .
+ .
+ "Measure1" .
+ .
+ "54.0"^^ .
+ .
+ .
+ .
+ .
+ "X2018_Interlab_Devices_BBa_R0040" .
+ "tccctatcagtgatagagattgacatccctatcagtgatagagatactgagcac" .
+ .
+ .
+ .
+ .
+ "Measure1" .
+ .
+ "9.9"^^ .
+ .
+ .
+ .
+ .
+ "X2018_Interlab_Devices_J364000" .
+