Skip to content

Commit

Permalink
Merge pull request #701 from SMI/bugfix/python
Browse files Browse the repository at this point in the history
Bugfix/python
  • Loading branch information
rkm authored Apr 7, 2021
2 parents c439259 + bde334e commit f2a18f6
Show file tree
Hide file tree
Showing 8 changed files with 83 additions and 27 deletions.
1 change: 1 addition & 0 deletions news/701-bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Several improvements to Python code for handling unusually-formatted SR documents.
41 changes: 24 additions & 17 deletions src/applications/SRAnonTool/CTP_DicomToText.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,8 @@
import yaml
import pydicom
import re
import xml.etree.ElementTree # untangle and xmltodict not available in NSH
from deepmerge import Merger # for deep merging dictionaries
from SmiServices import Mongo
from SmiServices import Rabbit
from SmiServices import Dicom
from SmiServices import DicomText
from SmiServices import StructuredReport as SR

Expand All @@ -37,6 +34,7 @@ def extract_mongojson(mongojson, output):
if os.path.isdir(output):
filename = mongojson['SOPInstanceUID'] + '.txt'
output = os.path.join(output, filename)
logging.info('Parse %s' % mongojson.get('header',{}).get('DicomFilePath','<NoFilePath?>'))
with open(output, 'w') as fd:
SR.SR_parse(mongojson, filename, fd)
logging.info(f'Wrote {output}')
Expand Down Expand Up @@ -98,6 +96,7 @@ def extract_file(input, output):
parser.add_argument('-y', dest='yamlfile', action="append", help='path to yaml config file (can be used more than once)')
parser.add_argument('-i', dest='input', action="store", help='SOPInstanceUID or path to raw DICOM file from which text will be redacted')
parser.add_argument('-o', dest='output_dir', action="store", help='path to directory where extracted text will be written')
parser.add_argument('--semehr-unique', dest='semehr_unique', action="store_true", help='only extra from MongoDB/dicom if not already in MongoDB/semehr')
args = parser.parse_args()
if not args.input:
parser.print_help()
Expand All @@ -113,14 +112,18 @@ def extract_file(input, output):
# Merge all the yaml dicts into one
cfg_dict = Merger([(list, ["append"]),(dict, ["merge"])],["override"],["override"]).merge(cfg_dict, yaml.safe_load(fd))

mongo_host = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('HostName',{})
mongo_user = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('UserName',{})
mongo_pass = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('Password',{})
mongo_db = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('DatabaseName',{})
mongo_dicom_host = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('HostName',{})
mongo_dicom_user = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('UserName',{})
mongo_dicom_pass = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('Password',{})
mongo_dicom_db = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('DatabaseName',{})

mongo_semehr_host = cfg_dict.get('MongoDatabases', {}).get('SemEHRStoreOptions',{}).get('HostName',{})
mongo_semehr_user = cfg_dict.get('MongoDatabases', {}).get('SemEHRStoreOptions',{}).get('UserName',{})
mongo_semehr_pass = cfg_dict.get('MongoDatabases', {}).get('SemEHRStoreOptions',{}).get('Password',{})
mongo_semehr_db = cfg_dict.get('MongoDatabases', {}).get('SemEHRStoreOptions',{}).get('DatabaseName',{})

log_dir = cfg_dict['LoggingOptions']['LogsRoot']
root_dir = cfg_dict['FileSystemOptions']['FileSystemRoot']
extract_dir = cfg_dict['FileSystemOptions']['ExtractRoot']

# ---------------------------------------------------------------------
# Now we know the LogsRoot we can set up logging
Expand All @@ -141,18 +144,22 @@ def extract_file(input, output):
for root, dirs, files in os.walk(args.input, topdown=False):
for name in files:
extract_file(os.path.join(root, name), args.output_dir)
elif mongo_db != {}:
elif mongo_dicom_db != {}:
# Only DicomFilePath and StudyDate are indexed in MongoDB.
# Passing a SOPInstanceUID would be handy but no point if not indexed.
mongodb = Mongo.SmiPyMongoCollection(mongo_host, mongo_user, mongo_host)
mongodb.setImageCollection('SR')
# If it looks like a date YYYY/MM/DD or YYYYMMDD:
if re.match('^\\s*\\d+/\\d+/\\d+\\s*$|^\\s*\\d{6}\\s*$', args.input):
for mongojson in mongodb.StudyDateToJSONList(args.input):
extract_mongojson(mongojson, args.output_dir)
# Otherwise assume a DICOM file path
mongodb_in = Mongo.SmiPyMongoCollection(mongo_dicom_host, mongo_dicom_user, mongo_dicom_pass)
mongodb_in.setImageCollection('SR')
mongodb_out = Mongo.SmiPyMongoCollection(mongo_semehr_host, mongo_semehr_user, mongo_semehr_pass)
mongodb_out.setSemEHRCollection('semehr_results')
# If it looks like a date YYYY/MM/DD or YYYYMMDD extract all on that day:
if re.match('^\\s*\\d+/\\d+/\\d+\\s*$|^\\s*\\d{8}\\s*$', args.input):
for mongojson in mongodb_in.StudyDateToJSONList(args.input):
# If it's already in the annotation database then don't bother extracting.
if not args.semehr_unique or not mongodb_out.findSOPInstanceUID(mongojson['SOPInstanceUID']):
extract_mongojson(mongojson, args.output_dir)
# Otherwise assume a DICOM file path which can be retrieved from MongoDB
else:
mongojson = mongodb.DicomFilePathToJSON(args.input)
mongojson = mongodb_in.DicomFilePathToJSON(args.input)
extract_mongojson(mongojson, args.output_dir)
else:
logging.error(f'Cannot find {args.input} as file and MongoDB not configured')
Expand Down
2 changes: 1 addition & 1 deletion src/applications/SRAnonTool/CTP_SRAnonTool.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ tidy_exit()

# Default executable PATHs and Python libraries
export PATH=${PATH}:${SMI_ROOT}/bin:${SMI_ROOT}/scripts:${progdir}
export PYTHONPATH=${SMI_ROOT}/lib/python3:${SMI_ROOT}/lib/python3/virtualenvs/semehr/$(hostname -s)/lib/python3.6/site-packages
export PYTHONPATH=${SMI_ROOT}/lib/python3:${SMI_ROOT}/lib/python3/virtualenvs/semehr/$(hostname -s)/lib/python3.6/site-packages:${SMI_ROOT}/lib/python3/virtualenvs/semehr/$(hostname -s)/lib64/python3.6/site-packages

# Command line arguments
while getopts ${options} var; do
Expand Down
2 changes: 1 addition & 1 deletion src/applications/SRAnonTool/CTP_XMLToDicom.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
xmlroot = xml.etree.ElementTree.parse(args.input_xml).getroot()
xmldictlist = Knowtator.annotation_xml_to_dict(xmlroot)
if xmldictlist == []:
print('WARNING: empty document in {}'.format(xmlfilename))
print('WARNING: empty document in {}'.format(args.input_xml))
#for annot in xmldictlist:
# print('REMOVE {} from DICOM at {}'.format(annot['text'], annot['start_char']))

Expand Down
2 changes: 1 addition & 1 deletion src/applications/SRAnonTool/test/CTP_SRAnonTool_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import re
import shutil
import sys
from SmiServices import DicomText, Knowtator
from SmiServices import DicomText

# Configurable:
semehr_dir = '/opt/semehr'
Expand Down
24 changes: 23 additions & 1 deletion src/common/Smi_Common_Python/SmiServices/Dicom.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,24 @@ def test_sr_decode_ConceptNameCodeSequence():
assert sr_decode_ConceptNameCodeSequence( [ { 'CodeMeaning': 'cm', 'CodeValue': 'cv' } ] ) == 'cm'


# ---------------------------------------------------------------------
# Decode a SourceImageSequence by returning a string consisting of the
# UID of the referenced image.
# XXX ignores the ReferencedSOPClassUID.
# XXX this is the same as ReferencedSOPSequence above.

def sr_decode_SourceImageSequence(sis):
assert isinstance(sis, list)
for sis_item in sis:
if has_tag(sis_item, 'ReferencedSOPInstanceUID'):
return tag_val(sis_item, 'ReferencedSOPInstanceUID')
return ''

def test_sr_decode_SourceImageSequence():
assert sr_decode_SourceImageSequence([]) == ''
assert sr_decode_SourceImageSequence( [ {'ReferencedSOPClassUID':'1.2', 'ReferencedSOPInstanceUID':'1.2.3.4.5'} ] ) == '1.2.3.4.5'


# ---------------------------------------------------------------------
# Decode a MeasuredValueSequence by returning a string consisting of the
# NumericValue inside, and having the short form of the units appended
Expand Down Expand Up @@ -204,10 +222,14 @@ def sr_decode_MeasuredValueSequence(mvs):
num_str = tag_val(mvs_item, 'NumericValue')
if has_tag(mvs_item, 'MeasurementUnitsCodeSequence'):
units_str = sr_decode_MeasurementUnitsCodeSequence(tag_val(mvs_item, 'MeasurementUnitsCodeSequence'))
return num_str+' '+units_str
# Have to use str() because sometimes the value is missing, i.e. None(!)
return str(num_str)+' '+str(units_str)

def test_sr_decode_MeasuredValueSequence():
assert sr_decode_MeasuredValueSequence(None) == ''
assert sr_decode_MeasuredValueSequence( [ { 'NumericValue': None, 'MeasurementUnitsCodeSequence': [ { 'CodeMeaning': 'cm', 'CodeValue': 'mm' } ] } ] ) == 'None mm'
assert sr_decode_MeasuredValueSequence( [ { 'NumericValue': { 'vr':'DS' }, 'MeasurementUnitsCodeSequence': [ { 'CodeMeaning': 'cm', 'CodeValue': 'mm' } ] } ] ) == ' mm'
assert sr_decode_MeasuredValueSequence( [ { 'NumericValue': { 'vr':'DS', 'Value': '23' }, 'MeasurementUnitsCodeSequence': [ { 'CodeMeaning': 'cm', 'CodeValue': 'mm' } ] } ] ) == '23 mm'
assert sr_decode_MeasuredValueSequence( [ { 'NumericValue': '23', 'MeasurementUnitsCodeSequence': [ { 'CodeMeaning': 'cm', 'CodeValue': 'mm' } ] } ] ) == '23 mm'


Expand Down
15 changes: 13 additions & 2 deletions src/common/Smi_Common_Python/SmiServices/Mongo.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,16 @@ def __init__(self, hostname, username = None, password = None):
self.mongoConnection = MongoClient(host=hostname)


def setSemEHRCollection(self, collection_name):
""" After initialisation set the desired collection using the two-letter modality, eg. SR selects dicom.image_SR """

self.mongoCollection = self.mongoConnection['semehr'][collection_name]

def setImageCollection(self, modality):
""" After initialisation set the desired collection using the two-letter modality, eg. SR selects dicom.image_SR """

self.mongoCollection = self.mongoConnection['dicom']['image_'+modality]


def DicomFilePathToJSON(self, DicomFilePath):
""" After setting a collection(modality) you can extract a document given a DICOM path (can be absolute, as everything upto PACS stripped off, or relative to root of collection)"""

Expand All @@ -41,4 +45,11 @@ def StudyDateToJSONList(self, StudyDate):
StudyDate = re.sub('[/ ]*', '', StudyDate)
assert(len(StudyDate) == 8)

return self.mongoCollection.find( { "StudyDate" : StudyDate } )
return self.mongoCollection.find( { "StudyDate" : StudyDate } )

def findSOPInstanceUID(self, sopinstanceuid):
""" This is intended to check for the existance of a document having the
given SOPInstanceUID but since it also returns that document is can be
used as a query """

return self.mongoCollection.find_one( { 'SOPInstanceUID': sopinstanceuid } )
23 changes: 19 additions & 4 deletions src/common/Smi_Common_Python/SmiServices/StructuredReport.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ def sr_key_can_be_ignored(keystr):
if keystr == sr_extract_dict['tag']:
return True
# We cannot definitively decode private tags, BUT some contain information which is not anywhere else, even person names!
# XXX TODO: add the ones we know (study description, hospital name, etc)
if '-PrivateCreator' in keystr:
return True
if '-Unknown' in keystr:
Expand Down Expand Up @@ -305,6 +306,8 @@ def _SR_output_string(keystr, valstr, fp):
def _SR_parse_key(json_dict, json_key, fp):
if tag_is(json_key, 'ConceptNameCodeSequence'):
_SR_output_string('', Dicom.sr_decode_ConceptNameCodeSequence(tag_val(json_dict, json_key)), fp)
elif tag_is(json_key, 'SourceImageSequence'):
_SR_output_string('SourceImage', Dicom.sr_decode_SourceImageSequence(tag_val(json_dict, json_key)), fp)
elif tag_is(json_key, 'ContentSequence'):
for cs_item in tag_val(json_dict, json_key):
if has_tag(cs_item, 'RelationshipType') and has_tag(cs_item, 'ValueType'):
Expand All @@ -326,19 +329,31 @@ def _SR_parse_key(json_dict, json_key, fp):
elif value_type == 'UIDREF' or value_type == ['UIDREF']:
_SR_output_string(Dicom.sr_decode_ConceptNameCodeSequence(tag_val(cs_item, 'ConceptNameCodeSequence')), tag_val(cs_item, 'UID'), fp)
elif value_type == 'IMAGE' or value_type == ['IMAGE']:
_SR_output_string(Dicom.sr_decode_ConceptNameCodeSequence(tag_val(cs_item, 'ConceptNameCodeSequence')), Dicom.sr_decode_ReferencedSOPSequence(tag_val(cs_item, 'ReferencedSOPSequence')), fp)
_SR_output_string(Dicom.sr_decode_ConceptNameCodeSequence(tag_val(cs_item, 'ReferencedSOPSequence')), Dicom.sr_decode_ReferencedSOPSequence(tag_val(cs_item, 'ReferencedSOPSequence')), fp)
elif value_type == 'CONTAINER' or value_type == ['CONTAINER']:
# Sometimes it has no ContentSequence or is 'null'
if has_tag(cs_item, 'ContentSequence') and tag_val(cs_item, 'ContentSequence') != None:
_SR_output_string('', Dicom.sr_decode_ConceptNameCodeSequence(tag_val(cs_item, 'ConceptNameCodeSequence')), fp)
if has_tag(cs_item, 'ConceptNameCodeSequence'):
_SR_output_string('', Dicom.sr_decode_ConceptNameCodeSequence(tag_val(cs_item, 'ConceptNameCodeSequence')), fp)
_SR_parse_key(cs_item, 'ContentSequence', fp)
# explicitly ignore TIME, SCOORD, TCOORD, COMPOSITE, IMAGE, WAVEFORM
# as they have no useful text to return
elif value_type == 'SCOORD' or value_type == ['SCOORD']:
pass
elif value_type == 'TCOORD' or value_type == ['TCOORD']:
pass
elif value_type == 'COMPOSITE' or value_type == ['COMPOSITE']:
pass
elif value_type == 'TIME' or value_type == ['TIME']:
pass
elif value_type == 'WAVEFORM' or value_type == ['WAVEFORM']:
pass
else:
print('UNEXPECTED ITEM OF TYPE %s = %s' % (value_type, cs_item), file=sys.stderr)
#print('ITEM %s' % cs_item)
else:
if not sr_key_can_be_ignored(json_key):
print('UNEXPECTED KEY %s' % json_key, file=sys.stderr)
print(json_dict[json_key])
print('UNEXPECTED KEY %s = %s' % (json_key, json_dict[json_key]), file=sys.stderr)


# ---------------------------------------------------------------------
Expand Down

0 comments on commit f2a18f6

Please sign in to comment.