diff --git a/news/701-bugfix.md b/news/701-bugfix.md new file mode 100644 index 000000000..1d84e84b3 --- /dev/null +++ b/news/701-bugfix.md @@ -0,0 +1 @@ +Several improvements to Python code for handling unusually-formatted SR documents. diff --git a/src/applications/SRAnonTool/CTP_DicomToText.py b/src/applications/SRAnonTool/CTP_DicomToText.py index 1c3a88f44..a227d353b 100755 --- a/src/applications/SRAnonTool/CTP_DicomToText.py +++ b/src/applications/SRAnonTool/CTP_DicomToText.py @@ -18,11 +18,8 @@ import yaml import pydicom import re -import xml.etree.ElementTree # untangle and xmltodict not available in NSH from deepmerge import Merger # for deep merging dictionaries from SmiServices import Mongo -from SmiServices import Rabbit -from SmiServices import Dicom from SmiServices import DicomText from SmiServices import StructuredReport as SR @@ -37,6 +34,7 @@ def extract_mongojson(mongojson, output): if os.path.isdir(output): filename = mongojson['SOPInstanceUID'] + '.txt' output = os.path.join(output, filename) + logging.info('Parse %s' % mongojson.get('header',{}).get('DicomFilePath','')) with open(output, 'w') as fd: SR.SR_parse(mongojson, filename, fd) logging.info(f'Wrote {output}') @@ -98,6 +96,7 @@ def extract_file(input, output): parser.add_argument('-y', dest='yamlfile', action="append", help='path to yaml config file (can be used more than once)') parser.add_argument('-i', dest='input', action="store", help='SOPInstanceUID or path to raw DICOM file from which text will be redacted') parser.add_argument('-o', dest='output_dir', action="store", help='path to directory where extracted text will be written') + parser.add_argument('--semehr-unique', dest='semehr_unique', action="store_true", help='only extra from MongoDB/dicom if not already in MongoDB/semehr') args = parser.parse_args() if not args.input: parser.print_help() @@ -113,14 +112,18 @@ def extract_file(input, output): # Merge all the yaml dicts into one cfg_dict = Merger([(list, ["append"]),(dict, ["merge"])],["override"],["override"]).merge(cfg_dict, yaml.safe_load(fd)) - mongo_host = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('HostName',{}) - mongo_user = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('UserName',{}) - mongo_pass = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('Password',{}) - mongo_db = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('DatabaseName',{}) + mongo_dicom_host = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('HostName',{}) + mongo_dicom_user = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('UserName',{}) + mongo_dicom_pass = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('Password',{}) + mongo_dicom_db = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('DatabaseName',{}) + + mongo_semehr_host = cfg_dict.get('MongoDatabases', {}).get('SemEHRStoreOptions',{}).get('HostName',{}) + mongo_semehr_user = cfg_dict.get('MongoDatabases', {}).get('SemEHRStoreOptions',{}).get('UserName',{}) + mongo_semehr_pass = cfg_dict.get('MongoDatabases', {}).get('SemEHRStoreOptions',{}).get('Password',{}) + mongo_semehr_db = cfg_dict.get('MongoDatabases', {}).get('SemEHRStoreOptions',{}).get('DatabaseName',{}) log_dir = cfg_dict['LoggingOptions']['LogsRoot'] root_dir = cfg_dict['FileSystemOptions']['FileSystemRoot'] - extract_dir = cfg_dict['FileSystemOptions']['ExtractRoot'] # --------------------------------------------------------------------- # Now we know the LogsRoot we can set up logging @@ -141,18 +144,22 @@ def extract_file(input, output): for root, dirs, files in os.walk(args.input, topdown=False): for name in files: extract_file(os.path.join(root, name), args.output_dir) - elif mongo_db != {}: + elif mongo_dicom_db != {}: # Only DicomFilePath and StudyDate are indexed in MongoDB. # Passing a SOPInstanceUID would be handy but no point if not indexed. - mongodb = Mongo.SmiPyMongoCollection(mongo_host, mongo_user, mongo_host) - mongodb.setImageCollection('SR') - # If it looks like a date YYYY/MM/DD or YYYYMMDD: - if re.match('^\\s*\\d+/\\d+/\\d+\\s*$|^\\s*\\d{6}\\s*$', args.input): - for mongojson in mongodb.StudyDateToJSONList(args.input): - extract_mongojson(mongojson, args.output_dir) - # Otherwise assume a DICOM file path + mongodb_in = Mongo.SmiPyMongoCollection(mongo_dicom_host, mongo_dicom_user, mongo_dicom_pass) + mongodb_in.setImageCollection('SR') + mongodb_out = Mongo.SmiPyMongoCollection(mongo_semehr_host, mongo_semehr_user, mongo_semehr_pass) + mongodb_out.setSemEHRCollection('semehr_results') + # If it looks like a date YYYY/MM/DD or YYYYMMDD extract all on that day: + if re.match('^\\s*\\d+/\\d+/\\d+\\s*$|^\\s*\\d{8}\\s*$', args.input): + for mongojson in mongodb_in.StudyDateToJSONList(args.input): + # If it's already in the annotation database then don't bother extracting. + if not args.semehr_unique or not mongodb_out.findSOPInstanceUID(mongojson['SOPInstanceUID']): + extract_mongojson(mongojson, args.output_dir) + # Otherwise assume a DICOM file path which can be retrieved from MongoDB else: - mongojson = mongodb.DicomFilePathToJSON(args.input) + mongojson = mongodb_in.DicomFilePathToJSON(args.input) extract_mongojson(mongojson, args.output_dir) else: logging.error(f'Cannot find {args.input} as file and MongoDB not configured') diff --git a/src/applications/SRAnonTool/CTP_SRAnonTool.sh b/src/applications/SRAnonTool/CTP_SRAnonTool.sh index 5f929f5da..6ae594f64 100755 --- a/src/applications/SRAnonTool/CTP_SRAnonTool.sh +++ b/src/applications/SRAnonTool/CTP_SRAnonTool.sh @@ -58,7 +58,7 @@ tidy_exit() # Default executable PATHs and Python libraries export PATH=${PATH}:${SMI_ROOT}/bin:${SMI_ROOT}/scripts:${progdir} -export PYTHONPATH=${SMI_ROOT}/lib/python3:${SMI_ROOT}/lib/python3/virtualenvs/semehr/$(hostname -s)/lib/python3.6/site-packages +export PYTHONPATH=${SMI_ROOT}/lib/python3:${SMI_ROOT}/lib/python3/virtualenvs/semehr/$(hostname -s)/lib/python3.6/site-packages:${SMI_ROOT}/lib/python3/virtualenvs/semehr/$(hostname -s)/lib64/python3.6/site-packages # Command line arguments while getopts ${options} var; do diff --git a/src/applications/SRAnonTool/CTP_XMLToDicom.py b/src/applications/SRAnonTool/CTP_XMLToDicom.py index 2300529d1..5339bbb6a 100755 --- a/src/applications/SRAnonTool/CTP_XMLToDicom.py +++ b/src/applications/SRAnonTool/CTP_XMLToDicom.py @@ -80,7 +80,7 @@ xmlroot = xml.etree.ElementTree.parse(args.input_xml).getroot() xmldictlist = Knowtator.annotation_xml_to_dict(xmlroot) if xmldictlist == []: - print('WARNING: empty document in {}'.format(xmlfilename)) + print('WARNING: empty document in {}'.format(args.input_xml)) #for annot in xmldictlist: # print('REMOVE {} from DICOM at {}'.format(annot['text'], annot['start_char'])) diff --git a/src/applications/SRAnonTool/test/CTP_SRAnonTool_test.py b/src/applications/SRAnonTool/test/CTP_SRAnonTool_test.py index 08528a741..b131ec234 100755 --- a/src/applications/SRAnonTool/test/CTP_SRAnonTool_test.py +++ b/src/applications/SRAnonTool/test/CTP_SRAnonTool_test.py @@ -15,7 +15,7 @@ import re import shutil import sys -from SmiServices import DicomText, Knowtator +from SmiServices import DicomText # Configurable: semehr_dir = '/opt/semehr' diff --git a/src/common/Smi_Common_Python/SmiServices/Dicom.py b/src/common/Smi_Common_Python/SmiServices/Dicom.py index fbcdfc17b..0c9e11c5b 100644 --- a/src/common/Smi_Common_Python/SmiServices/Dicom.py +++ b/src/common/Smi_Common_Python/SmiServices/Dicom.py @@ -171,6 +171,24 @@ def test_sr_decode_ConceptNameCodeSequence(): assert sr_decode_ConceptNameCodeSequence( [ { 'CodeMeaning': 'cm', 'CodeValue': 'cv' } ] ) == 'cm' +# --------------------------------------------------------------------- +# Decode a SourceImageSequence by returning a string consisting of the +# UID of the referenced image. +# XXX ignores the ReferencedSOPClassUID. +# XXX this is the same as ReferencedSOPSequence above. + +def sr_decode_SourceImageSequence(sis): + assert isinstance(sis, list) + for sis_item in sis: + if has_tag(sis_item, 'ReferencedSOPInstanceUID'): + return tag_val(sis_item, 'ReferencedSOPInstanceUID') + return '' + +def test_sr_decode_SourceImageSequence(): + assert sr_decode_SourceImageSequence([]) == '' + assert sr_decode_SourceImageSequence( [ {'ReferencedSOPClassUID':'1.2', 'ReferencedSOPInstanceUID':'1.2.3.4.5'} ] ) == '1.2.3.4.5' + + # --------------------------------------------------------------------- # Decode a MeasuredValueSequence by returning a string consisting of the # NumericValue inside, and having the short form of the units appended @@ -204,10 +222,14 @@ def sr_decode_MeasuredValueSequence(mvs): num_str = tag_val(mvs_item, 'NumericValue') if has_tag(mvs_item, 'MeasurementUnitsCodeSequence'): units_str = sr_decode_MeasurementUnitsCodeSequence(tag_val(mvs_item, 'MeasurementUnitsCodeSequence')) - return num_str+' '+units_str + # Have to use str() because sometimes the value is missing, i.e. None(!) + return str(num_str)+' '+str(units_str) def test_sr_decode_MeasuredValueSequence(): assert sr_decode_MeasuredValueSequence(None) == '' + assert sr_decode_MeasuredValueSequence( [ { 'NumericValue': None, 'MeasurementUnitsCodeSequence': [ { 'CodeMeaning': 'cm', 'CodeValue': 'mm' } ] } ] ) == 'None mm' + assert sr_decode_MeasuredValueSequence( [ { 'NumericValue': { 'vr':'DS' }, 'MeasurementUnitsCodeSequence': [ { 'CodeMeaning': 'cm', 'CodeValue': 'mm' } ] } ] ) == ' mm' + assert sr_decode_MeasuredValueSequence( [ { 'NumericValue': { 'vr':'DS', 'Value': '23' }, 'MeasurementUnitsCodeSequence': [ { 'CodeMeaning': 'cm', 'CodeValue': 'mm' } ] } ] ) == '23 mm' assert sr_decode_MeasuredValueSequence( [ { 'NumericValue': '23', 'MeasurementUnitsCodeSequence': [ { 'CodeMeaning': 'cm', 'CodeValue': 'mm' } ] } ] ) == '23 mm' diff --git a/src/common/Smi_Common_Python/SmiServices/Mongo.py b/src/common/Smi_Common_Python/SmiServices/Mongo.py index 763943a6d..3bb6a2cec 100644 --- a/src/common/Smi_Common_Python/SmiServices/Mongo.py +++ b/src/common/Smi_Common_Python/SmiServices/Mongo.py @@ -19,12 +19,16 @@ def __init__(self, hostname, username = None, password = None): self.mongoConnection = MongoClient(host=hostname) + def setSemEHRCollection(self, collection_name): + """ After initialisation set the desired collection using the two-letter modality, eg. SR selects dicom.image_SR """ + + self.mongoCollection = self.mongoConnection['semehr'][collection_name] + def setImageCollection(self, modality): """ After initialisation set the desired collection using the two-letter modality, eg. SR selects dicom.image_SR """ self.mongoCollection = self.mongoConnection['dicom']['image_'+modality] - def DicomFilePathToJSON(self, DicomFilePath): """ After setting a collection(modality) you can extract a document given a DICOM path (can be absolute, as everything upto PACS stripped off, or relative to root of collection)""" @@ -41,4 +45,11 @@ def StudyDateToJSONList(self, StudyDate): StudyDate = re.sub('[/ ]*', '', StudyDate) assert(len(StudyDate) == 8) - return self.mongoCollection.find( { "StudyDate" : StudyDate } ) \ No newline at end of file + return self.mongoCollection.find( { "StudyDate" : StudyDate } ) + + def findSOPInstanceUID(self, sopinstanceuid): + """ This is intended to check for the existance of a document having the + given SOPInstanceUID but since it also returns that document is can be + used as a query """ + + return self.mongoCollection.find_one( { 'SOPInstanceUID': sopinstanceuid } ) \ No newline at end of file diff --git a/src/common/Smi_Common_Python/SmiServices/StructuredReport.py b/src/common/Smi_Common_Python/SmiServices/StructuredReport.py index 23d808fe3..3d451c2ce 100644 --- a/src/common/Smi_Common_Python/SmiServices/StructuredReport.py +++ b/src/common/Smi_Common_Python/SmiServices/StructuredReport.py @@ -256,6 +256,7 @@ def sr_key_can_be_ignored(keystr): if keystr == sr_extract_dict['tag']: return True # We cannot definitively decode private tags, BUT some contain information which is not anywhere else, even person names! + # XXX TODO: add the ones we know (study description, hospital name, etc) if '-PrivateCreator' in keystr: return True if '-Unknown' in keystr: @@ -305,6 +306,8 @@ def _SR_output_string(keystr, valstr, fp): def _SR_parse_key(json_dict, json_key, fp): if tag_is(json_key, 'ConceptNameCodeSequence'): _SR_output_string('', Dicom.sr_decode_ConceptNameCodeSequence(tag_val(json_dict, json_key)), fp) + elif tag_is(json_key, 'SourceImageSequence'): + _SR_output_string('SourceImage', Dicom.sr_decode_SourceImageSequence(tag_val(json_dict, json_key)), fp) elif tag_is(json_key, 'ContentSequence'): for cs_item in tag_val(json_dict, json_key): if has_tag(cs_item, 'RelationshipType') and has_tag(cs_item, 'ValueType'): @@ -326,19 +329,31 @@ def _SR_parse_key(json_dict, json_key, fp): elif value_type == 'UIDREF' or value_type == ['UIDREF']: _SR_output_string(Dicom.sr_decode_ConceptNameCodeSequence(tag_val(cs_item, 'ConceptNameCodeSequence')), tag_val(cs_item, 'UID'), fp) elif value_type == 'IMAGE' or value_type == ['IMAGE']: - _SR_output_string(Dicom.sr_decode_ConceptNameCodeSequence(tag_val(cs_item, 'ConceptNameCodeSequence')), Dicom.sr_decode_ReferencedSOPSequence(tag_val(cs_item, 'ReferencedSOPSequence')), fp) + _SR_output_string(Dicom.sr_decode_ConceptNameCodeSequence(tag_val(cs_item, 'ReferencedSOPSequence')), Dicom.sr_decode_ReferencedSOPSequence(tag_val(cs_item, 'ReferencedSOPSequence')), fp) elif value_type == 'CONTAINER' or value_type == ['CONTAINER']: # Sometimes it has no ContentSequence or is 'null' if has_tag(cs_item, 'ContentSequence') and tag_val(cs_item, 'ContentSequence') != None: - _SR_output_string('', Dicom.sr_decode_ConceptNameCodeSequence(tag_val(cs_item, 'ConceptNameCodeSequence')), fp) + if has_tag(cs_item, 'ConceptNameCodeSequence'): + _SR_output_string('', Dicom.sr_decode_ConceptNameCodeSequence(tag_val(cs_item, 'ConceptNameCodeSequence')), fp) _SR_parse_key(cs_item, 'ContentSequence', fp) + # explicitly ignore TIME, SCOORD, TCOORD, COMPOSITE, IMAGE, WAVEFORM + # as they have no useful text to return + elif value_type == 'SCOORD' or value_type == ['SCOORD']: + pass + elif value_type == 'TCOORD' or value_type == ['TCOORD']: + pass + elif value_type == 'COMPOSITE' or value_type == ['COMPOSITE']: + pass + elif value_type == 'TIME' or value_type == ['TIME']: + pass + elif value_type == 'WAVEFORM' or value_type == ['WAVEFORM']: + pass else: print('UNEXPECTED ITEM OF TYPE %s = %s' % (value_type, cs_item), file=sys.stderr) #print('ITEM %s' % cs_item) else: if not sr_key_can_be_ignored(json_key): - print('UNEXPECTED KEY %s' % json_key, file=sys.stderr) - print(json_dict[json_key]) + print('UNEXPECTED KEY %s = %s' % (json_key, json_dict[json_key]), file=sys.stderr) # ---------------------------------------------------------------------