runKerasTensorflowClassifierOnATLASImages.py

#!/usr/bin/env python
"""Run the Keras/Tensorflow classifier.

Usage:
  %s <configFile> [<candidate>...] [--hkoclassifier=<hkoclassifier>] [--mloclassifier=<mloclassifier>] [--ps1classifier=<ps1classifier>] [--outputcsv=<outputcsv>] [--listid=<listid>] [--imageroot=<imageroot>] [--update]
  %s (-h | --help)
  %s --version

Options:
  -h --help                          Show this screen.
  --version                          Show version.
  --listid=<listid>                  List ID [default: 4].
  --hkoclassifier=<hkoclassifier>    HKO Classifier file.
  --mloclassifier=<mloclassifier>    MLO Classifier file.
  --ps1classifier=<ps1classifier>    PS1 Classifier file. This option will cause the HKO and MLO classifiers to be ignored.
  --outputcsv=<outputcsv>            Output file.
  --imageroot=<imageroot>            Root location of the actual images [default: /psdb3/images/].
  --update                           Update the database.

Example:
  python %s ~/config.pso3.gw.warp.yaml --ps1classifier=/data/db4data1/scratch/kws/training/ps1/20190115/ps1_20190115_400000_1200000.best.hdf5 --listid=4 --outputcsv=/tmp/pso3_list_4.csv
  python %s ../ps13pi/config/config.yaml --ps1classifier=/data/db4data1/scratch/kws/training/ps1/20190115/ps1_20190115_400000_1200000.best.hdf5 --listid=4 --outputcsv=/tmp/ps13pi_list_4.csv

"""
import sys
# Fill the five %s placeholders in the usage text above (three in "Usage",
# two in "Example") with the name this script was invoked as, so that the
# text docopt parses and prints shows the real command.
__doc__ = __doc__ % (sys.argv[0], sys.argv[0], sys.argv[0], sys.argv[0], sys.argv[0])
from docopt import docopt
from gkutils import Struct, cleanOptions, readGenericDataFile, dbConnect
import sys, csv, os
# TargetImage supplies the image loader used by getRBValues().
from TargetImage import *
import numpy as np
from kerasTensorflowClassifier import create_model, load_data
from collections import defaultdict, OrderedDict

# 2019-05-05 KWS Limit the number of CPUs to 4 for each process. Should still overuse the CPUs
#                but should get away with this because of I/O.
#from keras import backend as K
#K.set_session(K.tf.Session(config=K.tf.ConfigProto(intra_op_parallelism_threads=16, inter_op_parallelism_threads=16)))

def getObjectsByList(conn, dbName, listId = 4, imageRoot='/psdb3/images/', ps1Data = False):
    """Return the IDs of the not-yet-scored objects in a detection list.

    Args:
        conn: Open MySQLdb connection.
        dbName: Unused here; kept so the signature matches existing callers.
        listId: Value matched against ``detection_list_id``.
        imageRoot: Unused here; kept so the signature matches existing callers.
        ps1Data: When true, query ``tcs_transient_objects`` for rows whose
            ``confidence_factor`` is null (Pan-STARRS). Otherwise query
            ``atlas_diff_objects`` for rows whose ``zooniverse_score`` is
            null (ATLAS).

    Returns:
        The rows returned by ``fetchall()`` (each exposing an ``id`` column),
        or an empty list if the query failed. Database errors are printed,
        not raised.
    """
    # First get the candidates
    import MySQLdb

    # Start with "no candidates" so that a database error produces an empty
    # result rather than an UnboundLocalError at the return statement.
    resultSet = []

    try:
        cursor = conn.cursor (MySQLdb.cursors.DictCursor)

        if ps1Data:
            cursor.execute ("""
                select id
                  from tcs_transient_objects
                 where detection_list_id = %s
                   and confidence_factor is null
                   and tcs_images_id is not null
              order by followup_id desc
            """, (listId,))
        else:
            cursor.execute ("""
                select id
                  from atlas_diff_objects
                 where detection_list_id = %s
                   and zooniverse_score is null
                   and images_id is not null
            """, (listId,))
        resultSet = cursor.fetchall ()
        cursor.close ()

    except MySQLdb.Error as e:
        print("Error %d: %s" % (e.args[0], e.args[1]))

    return resultSet

# 2019-05-02 KWS Separated out the acquisition of images so that we can do
#                this multithreaded. Also so we can pass a user defined
#                list of objects to the processing.

def getImages(conn, dbName, objectList, imageRoot='/psdb3/images/'):
    """Look up the diff postage-stamp files that exist on disk for each object.

    For every entry in ``objectList`` the ``tcs_postage_stamp_images`` table
    is searched for 'diff' stamps whose ``image_filename`` starts with the
    object's id, have ``pss_error_code = 0`` and a non-null ``mjd_obs``. The
    full path is built in SQL as
    ``imageRoot + dbName + '/' + truncate(mjd_obs, 0) + '/' + image_filename + '.fits'``.

    Args:
        conn: Open MySQLdb connection.
        dbName: Database name, used as the directory below ``imageRoot``.
        objectList: Iterable of mappings, each with an ``id`` entry.
        imageRoot: Leading part of every image path.

    Returns:
        List of result rows (each with a ``filename`` entry) restricted to
        files that actually exist. A database error for one object is printed
        and that object is skipped.
    """
    import MySQLdb

    images = []

    # Now, for each candidate, get the image
    for candidate in objectList:
        try:
            stampCursor = conn.cursor(MySQLdb.cursors.DictCursor)
            stampCursor.execute("""
            select concat(%s ,%s,'/',truncate(mjd_obs,0), '/', image_filename,'.fits') filename from tcs_postage_stamp_images
             where image_filename like concat(%s, '%%')
               and image_filename not like concat(%s, '%%4300000000%%')
               and image_type = 'diff'
               and image_filename is not null
               and pss_error_code = 0
               and mjd_obs is not null
            """, (imageRoot, dbName, candidate['id'], candidate['id']))
            stampRows = stampCursor.fetchall()
            stampCursor.close()

            # Only keep images that actually exist!
            images.extend(stamp for stamp in stampRows
                          if os.path.exists(stamp['filename']))

        except MySQLdb.Error as e:
            print("Error %d: %s" % (e.args[0], e.args[1]))

    return images

# Update the database.
def updateTransientRBValue(conn, objectId, realBogusValue, ps1Data = False):
    """Write a real/bogus score onto a single object row.

    Args:
        conn: Open MySQLdb connection.
        objectId: Value matched against the ``id`` column.
        realBogusValue: Score to store.
        ps1Data: When true, set ``confidence_factor`` in
            ``tcs_transient_objects`` (Pan-STARRS). Otherwise set
            ``zooniverse_score`` in ``atlas_diff_objects`` (ATLAS).

    Returns:
        The cursor's ``rowcount`` after the update, or 0 if the update raised
        a database error (which is printed, not raised). A warning is printed
        when no row was updated.
    """
    import MySQLdb

    if ps1Data:
        # It's Pan-STARRS data
        updateStatement = """
                 update tcs_transient_objects
                 set confidence_factor = %s
                 where id = %s
            """
    else:
        # It's ATLAS data
        updateStatement = """
                 update atlas_diff_objects
                 set zooniverse_score = %s
                 where id = %s
            """

    rowsUpdated = 0

    try:
        dbCursor = conn.cursor(MySQLdb.cursors.DictCursor)
        dbCursor.execute(updateStatement, (realBogusValue, objectId))
        rowsUpdated = dbCursor.rowcount

        # Did we update any transient object rows? If not issue a warning.
        if rowsUpdated == 0:
            print ("WARNING: No transient object entries were updated.")

        dbCursor.close()

    except MySQLdb.Error as e:
        print ("Error %d: %s" % (e.args[0], e.args[1]))

    return rowsUpdated


def getRBValues(imageFilenames, classifier, extension = 0):
    """Run one classifier over a set of images and group scores by candidate.

    Each file is loaded through ``TargetImage(...).signPreserveNorm()``, NaNs
    are replaced via ``np.nan_to_num`` and the result is reshaped
    (column-major) into a 20x20 single-channel image. A two-class model is
    built with ``create_model``, its weights are loaded from ``classifier``,
    and column 1 of the prediction is kept for every image (presumably the
    "real" class probability - confirm against the training code).

    Args:
        imageFilenames: Sequence of image file paths. The candidate name is
            the part of each file's basename before the first underscore.
        classifier: Path of the weights file passed to ``load_weights``.
        extension: Passed through to ``TargetImage``.

    Returns:
        ``defaultdict(list)`` mapping candidate name to the list of its
        per-image predictions. Empty when ``imageFilenames`` is empty.
    """
    num_classes = 2
    image_dim = 20

    # Collect the predictions from all the files, but aggregate into objects
    objectDict = defaultdict(list)

    # Nothing to classify: return straight away instead of building a model,
    # loading weights (the classifier may not even be set for a telescope that
    # has no images) and predicting on a zero-length batch.
    if len(imageFilenames) == 0:
        return objectDict

    images = np.zeros((len(imageFilenames), image_dim, image_dim, 1))

    # loop through and fill the above matrix, remembering to correctly scale the
    # raw pixels for the specified sparse filter.
    for j, imageFilename in enumerate(imageFilenames):
        vector = np.nan_to_num(TargetImage(imageFilename, extension=extension).signPreserveNorm())
        images[j, :, :, 0] = np.reshape(vector, (image_dim, image_dim), order="F")

    model = create_model(num_classes, image_dim)
    model.load_weights(classifier)

    pred = model.predict(images, verbose=0)

    for imageFilename, prediction in zip(imageFilenames, pred):
        candidate = os.path.basename(imageFilename).split('_')[0]
        # Each candidate will end up with a list of predictions.
        objectDict[candidate].append(prediction[1])

    return objectDict


def _csvOutputPath(outputcsv, processNumber = None):
    """Return the CSV path to write, tagged with ``_NN`` when a process number is given."""
    # os.path.splitext only splits off the final extension of the last path
    # component, so dots in directory names ('../out/scores.csv') or extra
    # dots in the file name ('scores.v2.csv') are preserved.
    prefix, suffix = os.path.splitext(outputcsv)

    processSuffix = ''
    if processNumber is not None:
        processSuffix = '_%02d' % processNumber

    return '%s%s%s' % (prefix, processSuffix, suffix)


def _combineAtlasScores(objectDictHKO, objectDictMLO):
    """Reduce the per-telescope prediction lists to one median score per object."""
    finalScores = {}

    # HKO objects first, then objects only seen by MLO.
    candidates = list(objectDictHKO) + [c for c in objectDictMLO if c not in objectDictHKO]

    for candidate in candidates:
        hkoScores = objectDictHKO.get(candidate)
        mloScores = objectDictMLO.get(candidate)

        # Some objects will have data from two telescopes, some only one.
        # If we have data from two telescopes, choose the median value of the
        # longest length list.
        if hkoScores is not None and mloScores is not None:
            # Only if MLO is larger than HKO, use MLO. Otherwise use HKO.
            chosen = mloScores if len(mloScores) > len(hkoScores) else hkoScores
        elif hkoScores is not None:
            chosen = hkoScores
        else:
            chosen = mloScores

        finalScores[candidate] = np.median(np.array(chosen))

    return finalScores


def _scoreImages(imageFilenames, options, ps1Data):
    """Classify the image rows and return a dict of candidate -> median score."""
    filenames = [row['filename'] for row in imageFilenames]

    if ps1Data:
        if not filenames:
            return {}
        objectDictPS1 = getRBValues(filenames, options.ps1classifier, extension = 1)
        return dict((k, np.median(np.array(v))) for k, v in objectDictPS1.items())

    # Split the images into HKO and MLO data so we can apply the HKO and MLO
    # machines separately.
    hkoFilenames = [f for f in filenames if '02a' in f]
    mloFilenames = [f for f in filenames if '01a' in f]

    # Skip a telescope altogether when it contributed no images, so that its
    # classifier is never loaded (or required) for nothing.
    objectDictHKO = getRBValues(hkoFilenames, options.hkoclassifier) if hkoFilenames else {}
    objectDictMLO = getRBValues(mloFilenames, options.mloclassifier) if mloFilenames else {}

    # Now we have two dictionaries. Combine them.
    return _combineAtlasScores(objectDictHKO, objectDictMLO)


def runKerasTensorflowClassifier(opts, processNumber = None):
    """Score the candidate objects and optionally write a CSV / update the database.

    Args:
        opts: Either a dict of options (converted with ``Struct``) or an object
            exposing the same names as attributes: ``configFile``,
            ``candidate``, ``listid``, ``hkoclassifier``, ``mloclassifier``,
            ``ps1classifier``, ``outputcsv``, ``imageroot`` and ``update``.
        processNumber: ``None`` in single threaded mode. When set, objects are
            not collected by list ID, the CSV file name gets a ``_NN`` suffix
            and the database is not updated here.

    Returns:
        A list of ``(candidate, score)`` tuples sorted by ascending score;
        an empty list when objects were found but none had images on disk;
        ``1`` when the database connection failed or the detection list is
        outside 0-8.

    Raises:
        SystemExit: If ``listid`` cannot be parsed as an integer.
    """
    # Use utils.Struct to convert the dict into an object for compatibility with old optparse code.
    if isinstance(opts, dict):
        options = Struct(**opts)
    else:
        options = opts

    import yaml
    with open(options.configFile) as yaml_file:
        # yaml.load() without an explicit Loader can construct arbitrary
        # objects and is rejected by PyYAML >= 6. The config only holds plain
        # mappings and scalars here, so the safe loader is used instead.
        config = yaml.safe_load(yaml_file)

    localDatabase = config['databases']['local']
    username = localDatabase['username']
    password = localDatabase['password']
    database = localDatabase['database']
    hostname = localDatabase['hostname']

    conn = dbConnect(hostname, username, password, database)
    if not conn:
        print("Cannot connect to the database")
        return 1

    # From here on every exit path (early return, sys.exit or an exception)
    # must release the connection.
    try:
        # 2018-07-31 KWS We have PS1 data. Don't bother with the HKO/MLO ATLAS data.
        ps1Data = bool(options.ps1classifier)

        # Same default as the command line and getObjectsByList().
        detectionList = 4
        if options.listid is not None:
            try:
                detectionList = int(options.listid)
            except ValueError:
                sys.exit("Detection list must be an integer")
            if detectionList < 0 or detectionList > 8:
                print ("Detection list must be between 0 and 8")
                return 1

        objectList = []
        imageFilenames = []

        # if candidates are specified in the options, then override the list.
        if options.candidate:
            objectList = [{'id': int(candidate)} for candidate in options.candidate]
        elif processNumber is None:
            # Only collect by the list ID if we are running in single threaded mode
            objectList = getObjectsByList(conn, database, listId = detectionList, ps1Data = ps1Data)

        if len(objectList) > 0:
            imageFilenames = getImages(conn, database, objectList, imageRoot=options.imageroot)
            if len(imageFilenames) == 0:
                print("NO IMAGES")
                return []

        finalScores = _scoreImages(imageFilenames, options, ps1Data)
        finalScoresSorted = OrderedDict(sorted(finalScores.items(), key=lambda t: t[1]))

        if options.outputcsv is not None:
            # Write one "candidate,score" line per object, lowest score first.
            with open(_csvOutputPath(options.outputcsv, processNumber), 'w') as f:
                for k, v in finalScoresSorted.items():
                    print(k, v)
                    f.write('%s,%f\n' % (k, v))

        scores = list(finalScoresSorted.items())

        if options.update and processNumber is None:
            # Only allow database updates in single threaded mode. Otherwise multithreaded code
            # does the updates at the end of processing. (Minimises table locks.)
            for objectId, score in scores:
                updateTransientRBValue(conn, objectId, score, ps1Data = ps1Data)

        return scores
    finally:
        conn.close()


def main():
    """Parse the command line with docopt and run the classifier in single threaded mode."""
    arguments = cleanOptions(docopt(__doc__, version='0.1'))

    # Use utils.Struct to convert the dict into an object for compatibility with old optparse code.
    runKerasTensorflowClassifier(Struct(**arguments))

# Run the classifier only when this file is executed as a script, so that the
# functions above can be imported without side effects.
if __name__=='__main__':
    main()