ebmdatalab · chadmiller · May 18, 2019 · May 19, 2019 · May 20, 2019 · May 20, 2019
diff --git a/README.md b/README.md
@@ -19,16 +19,13 @@ reporting easier.
 
 Operational overview:
 
-1. Python script `load_data.py`:
+1. Django management command `load_data`:
  * downloads a zip clinical trials registry data from ClinicalTrials.gov
- * converts the XML to JSON
- * uploads it to BigQuery
- * runs SQL to transform it to tabular format including fields to
-   indentify ACTs and their lateness
- * downloads SQL as a CSV file
+ * transforms XML into a CSV file
+ * all of #2, `process_data`
 
 2. Django management command `process_data`:
-  * imports CSV file into Django models
+  * imports existing CSV file into Django models
   * precomputes aggregate statistics and turns these into rankings
   * handles other metadata (in particular, hiding trials that are no
     longer ACTs)
@@ -42,12 +39,9 @@ loaded into a staging database / website.
 A separate command copies new data from staging to production
 (following moderation).
 
-Much complex logic has been expressed in SQL, which makes it hard to read
-and test.  This is a legacy of splitting the development between
-academics with the domain expertise (and who could use SQL to
-prototype) and software engineers.  Now the project has been running
-for a while and new development interations are less frequent, a useful
-project would be as much of this logic to Python.
+In the past, importing processes computed and filtered in SQL through
+Bigtable service and some JSON processing, but that is largely gone.
+You may still see scars.
 
 Similarly, the only reason step (1) exists is to create a CSV which
 can be imported to the database.  That CSV is useful in its own right
@@ -56,12 +50,6 @@ intermediate formats that could legitimately be dropped in a
 refactored solution (and the CSV could be generated directly from the
 database).
 
-The historic reason for the XML -> JSON route is because BigQuery
-includes a number of useful JSON functions which can be manipulated by
-people competent in SQL. At the time of writing, there
-is [an open issue](https://github.com/ebmdatalab/clinicaltrials-act-tracker/issues/121) with
-some ideas about refactoring this process.
-
 Static Pages
 ============
 

diff --git a/clinicaltrials/common/utils.py b/clinicaltrials/common/utils.py
@@ -2,17 +2,19 @@
 import django.core.exceptions
 
 
-# Originally taken from openprescribing
-def get_env_setting(setting, default=None):
+Unset = object()  # Explicit default value None is not the same as lacking a default.
+
+
+def get_env_setting(setting, default=Unset):
     """ Get the environment setting.
 
     Return the default, or raise an exception if none supplied
     """
     try:
         return os.environ[setting]
     except KeyError:
-        if default:
-            return default
-        else:
+        if default is Unset:
             error_msg = "Set the %s env variable" % setting
             raise django.core.exceptions.ImproperlyConfigured(error_msg)
+
+        return default
diff --git a/clinicaltrials/frontend/management/commands/load_data.py b/clinicaltrials/frontend/management/commands/load_data.py
@@ -58,7 +58,7 @@ def download_and_extract():
 
     # download and extract
     container = tempfile.mkdtemp(
-        prefix=settings.STORAGE_PREFIX.rstrip(os.sep), dir=settings.WORKING_VOLUME)
+        prefix=settings.STORAGE_PREFIX, dir=settings.WORKING_VOLUME)
     try:
         data_file = os.path.join(container, "data.zip")
         wget_file(data_file, url)
@@ -76,7 +76,7 @@ def upload_to_cloud():
     client = StorageClient()
     bucket = client.get_bucket()
     blob = bucket.blob(
-        "{}{}".format(settings.STORAGE_PREFIX, raw_json_name()),
+        "{}/{}".format(settings.STORAGE_PREFIX, raw_json_name()),
         chunk_size=1024*1024
     )
     with open(os.path.join(settings.WORKING_DIR, raw_json_name()), 'rb') as f:
@@ -89,7 +89,11 @@ def notify_slack(message):
     # Set the webhook_url to the one provided by Slack when you create
     # the webhook at
     # https://my.slack.com/services/new/incoming-webhook/
-    webhook_url = os.environ['SLACK_GENERAL_POST_KEY']
+    webhook_url = settings.SLACK_GENERAL_POST_KEY
+
+    if webhook_url is None:
+        return
+
     slack_data = {'text': message}
 
     response = requests.post(webhook_url, json=slack_data)
@@ -164,14 +168,16 @@ def convert_and_download():
         wait_for_job(job)
 
 
-    t1_exporter = TableExporter(tmp_table, settings.STORAGE_PREFIX + 'test_table-')
+    t1_exporter = TableExporter(tmp_table, settings.STORAGE_PREFIX + '/' + 'test_table-')
     t1_exporter.export_to_storage()
 
     with open(settings.INTERMEDIATE_CSV_PATH, 'w') as f:
         t1_exporter.download_from_storage_and_unzip(f)
 
 
 def get_env(path):
+    """Terrible hack to bridge using env vars to having settings files."""
+    if not path: return {}
     env = os.environ.copy()
     with open(path) as e:
         for k, v in re.findall(r"^export ([A-Z][A-Z0-9_]*)=(\S*)", e.read(), re.MULTILINE):
@@ -184,7 +190,7 @@ def process_data():
     try:
         subprocess.check_output(
             [
-                "{}python".format(settings.PROCESSING_VENV_BIN),
+                shutil.which("python"),
                 "{}/manage.py".format(settings.BASE_DIR),
                 "process_data",
                 "--input-csv={}".format(settings.INTERMEDIATE_CSV_PATH),

diff --git a/clinicaltrials/frontend/management/commands/process_data.py b/clinicaltrials/frontend/management/commands/process_data.py
@@ -109,22 +109,21 @@ def set_qa_metadata(trial):
 
 
 def _compute_ranks():
-    sql = ("WITH ranked AS (SELECT date, ranking.id, RANK() OVER ("
-           "  PARTITION BY date "
-           "ORDER BY percentage DESC"
-           ") AS computed_rank "
-           "FROM frontend_ranking ranking WHERE percentage IS NOT NULL "
-           "AND date = %s"
-           ") ")
-
-    sql += ("UPDATE "
-            " frontend_ranking "
-            "SET "
-            " rank = ranked.computed_rank "
-            "FROM ranked "
-            "WHERE ranked.id = frontend_ranking.id AND ranked.date = frontend_ranking.date")
+    sql = """
+            UPDATE frontend_ranking
+            SET rank = (
+                SELECT
+                    count(*)
+                FROM
+                    frontend_ranking AS f
+                WHERE
+                    f.percentage<frontend_ranking.percentage AND f.date=frontend_ranking
+            )
+            WHERE percentage IS NOT NULL AND date=%s
+    """
     on_date = Sponsor.objects.latest('updated_date').updated_date
     with connection.cursor() as c:
+            c.execute("UPDATE frontend_ranking SET rank=NULL WHERE date=%s", [on_date])
             c.execute(sql, [on_date])
 
 

diff --git a/clinicaltrials/frontend/migrations/0029_update_status_from_no_longer_on_website.py b/clinicaltrials/frontend/migrations/0029_update_status_from_no_longer_on_website.py
@@ -13,10 +13,10 @@ class Migration(migrations.Migration):
     operations = [
         migrations.RunSQL(
             [
-                ("UPDATE frontend_trial SET status = %s WHERE no_longer_on_website is true",
+                ("UPDATE frontend_trial SET status = %s WHERE no_longer_on_website = (1=1)",
                  [Trial.STATUS_NO_LONGER_ACT])],
             [
-                ("UPDATE frontend_trial SET no_longer_on_website = true WHERE status = %s",
+                ("UPDATE frontend_trial SET no_longer_on_website = (1=1) WHERE status = %s",
                  [Trial.STATUS_NO_LONGER_ACT])],
         )
     ]
diff --git a/clinicaltrials/frontend/migrations/0033_manual_pk_fix.py b/clinicaltrials/frontend/migrations/0033_manual_pk_fix.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 
-from django.db import migrations
+from django.db import migrations, models
 
 
 class Migration(migrations.Migration):
@@ -10,5 +10,5 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        migrations.RunSQL('ALTER TABLE frontend_trial ALTER sponsor_id TYPE varchar(200);'),
+        migrations.AlterField('trial', 'sponsor_id', models.CharField(max_length=200)),
     ]
diff --git a/clinicaltrials/frontend/migrations/0035_manual_pk_fix.py b/clinicaltrials/frontend/migrations/0035_manual_pk_fix.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 
-from django.db import migrations
+from django.db import migrations, models
 
 
 class Migration(migrations.Migration):
@@ -10,5 +10,5 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        migrations.RunSQL('ALTER TABLE frontend_ranking ALTER sponsor_id TYPE varchar(200);'),
+        migrations.AlterField('ranking', 'sponsor_id', models.CharField(max_length=200)),
     ]
diff --git a/clinicaltrials/frontend/models.py b/clinicaltrials/frontend/models.py
@@ -36,11 +36,8 @@ def current_rank(self):
     def status_choices(self):
         """A list of tuples representing valid choices for trial statuses
         """
-        statuses = [x[0] for x in
-                    self.trial_set.visible().order_by(
-                        'status').values_list(
-                            'status').distinct(
-                                'status')]
+
+        statuses = set(x[0] for x in self.trial_set.visible().values_list('status'))
         return [x for x in Trial.STATUS_CHOICES if x[0] in statuses]
 
     class Meta:
@@ -51,11 +48,7 @@ class TrialManager(models.Manager):
     def status_choices(self):
         """A list of tuples representing valid choices for trial statuses
         """
-        statuses = [x[0] for x in
-                    Trial.objects.visible().order_by(
-                        'status').values_list(
-                            'status').distinct(
-                                'status')]
+        statuses = set(x[0] for x in Trial.objects.visible().values_list('status'))
         return [x for x in Trial.STATUS_CHOICES if x[0] in statuses]
 
 

diff --git a/clinicaltrials/frontend/settings.py b/clinicaltrials/frontend/settings.py
@@ -13,7 +13,7 @@
 import os
 import datetime
 
-import common.utils
+from common.utils import Unset, get_env_setting
 
 import custom_logging
 
@@ -28,21 +28,36 @@
 # See https://docs.djangoproject.com/en/1.10/howto/deployment/checklist/
 
 # SECURITY WARNING: keep the secret key used in production secret!
-SECRET_KEY = common.utils.get_env_setting('CLINICALTRIALS_SECRET_KEY')
+SECRET_KEY = get_env_setting('CLINICALTRIALS_SECRET_KEY',
+        'cae7Ip2Utah7voochee9mahnoachingahra>faiv%ua8Uep)ai-zi!thee2xee8a')
 
 # SECURITY WARNING: don't run with debug turned on in production!
-CLINICALTRIALS_DEBUG = common.utils.get_env_setting('CLINICALTRIALS_DEBUG')
+CLINICALTRIALS_DEBUG = get_env_setting('CLINICALTRIALS_DEBUG', 'yes')
 assert CLINICALTRIALS_DEBUG in ['yes', 'no'], "CLINICALTRIALS_DEBUG was '{}'".format(CLINICALTRIALS_DEBUG)
 DEBUG = CLINICALTRIALS_DEBUG == 'yes'
 
-ALLOWED_HOSTS = [
-    'staging-fdaaa.ebmdatalab.net', '127.0.0.1', '192.168.0.55', 'localhost',
-    'fdaaa.trialstracker.net']
 
+ALLOWED_HOSTS = ['fdaaa.trialstracker.net']
+if DEBUG:
+    ALLOWED_HOSTS.extend(['staging-fdaaa.ebmdatalab.net', '127.0.0.1', '192.168.0.55', 'localhost'])
+
+if DEBUG:
+    DEFAULT_DB_NAME = "storage.sqlite3"
+    DEFAULT_DB_USER = None
+    DEFAULT_DB_PASSWORD = None
+    DEFAULT_DB_HOST = None
+    DEFAULT_DB_ENGINE = 'django.db.backends.sqlite3'
+else:
+    DEFAULT_DB_NAME = Unset
+    DEFAULT_DB_USER = Unset
+    DEFAULT_DB_PASSWORD = Unset
+    DEFAULT_DB_HOST = 'localhost'
+    DEFAULT_DB_ENGINE = 'django.db.backends.postgresql_psycopg2',
 
 # Parameters
 
-GOOGLE_TRACKING_ID = common.utils.get_env_setting('CLINICALTRIALS_GOOGLE_TRACKING_ID')
+GOOGLE_TRACKING_ID = get_env_setting('CLINICALTRIALS_GOOGLE_TRACKING_ID', None if DEBUG else Unset)
+SLACK_GENERAL_POST_KEY = get_env_setting('SLACK_GENERAL_POST_KEY', None if DEBUG else Unset)
 
 
 # Application definition
@@ -181,16 +196,15 @@
 
 WSGI_APPLICATION = 'frontend.wsgi.application'
 
-
 # Database
 # https://docs.djangoproject.com/en/1.10/ref/settings/#databases
 DATABASES = {
     'default': {
-        'ENGINE': 'django.db.backends.postgresql_psycopg2',
-        'NAME': common.utils.get_env_setting('CLINICALTRIALS_DB'),
-        'USER': common.utils.get_env_setting('CLINICALTRIALS_DB_NAME'),
-        'PASSWORD': common.utils.get_env_setting('CLINICALTRIALS_DB_PASS'),
-        'HOST': 'localhost',
+        'ENGINE': get_env_setting('CLINICALTRIALS_DB', DEFAULT_DB_ENGINE),
+        'NAME': get_env_setting('CLINICALTRIALS_DB', DEFAULT_DB_NAME),
+        'USER': get_env_setting('CLINICALTRIALS_DB_NAME', DEFAULT_DB_USER),
+        'PASSWORD': get_env_setting('CLINICALTRIALS_DB_PASS', DEFAULT_DB_PASSWORD),
+        'HOST': get_env_setting('CLINICALTRIALS_DB_PASS', DEFAULT_DB_HOST),
         'CONN_MAX_AGE': 0  # Must be zero, see api/view_utils#db_timeout
     },
 }
@@ -263,16 +277,15 @@
 
 # Twitter
 
-TWITTER_CONSUMER_SECRET = common.utils.get_env_setting('TWITTER_CONSUMER_SECRET')
-TWITTER_ACCESS_TOKEN_SECRET = common.utils.get_env_setting('TWITTER_ACCESS_TOKEN_SECRET')
+TWITTER_CONSUMER_SECRET = get_env_setting('TWITTER_CONSUMER_SECRET', None if DEBUG else Unset)
+TWITTER_ACCESS_TOKEN_SECRET = get_env_setting('TWITTER_ACCESS_TOKEN_SECRET', None if DEBUG else Unset)
 
 # Path to shell script of lines `export FOO=bar`. See environment-example for a sample.
-PROCESSING_ENV_PATH = '/etc/profile.d/fdaaa_staging.sh'
-PROCESSING_VENV_BIN = '/var/www/fdaaa_staging/venv/bin/'
+PROCESSING_ENV_PATH = '/etc/profile.d/fdaaa_staging.sh' if not DEBUG else None
 PROCESSING_STORAGE_TABLE_NAME = 'current_raw_json'
 
 # Bucket in GCS to store data
-STORAGE_PREFIX = 'clinicaltrials/'
-WORKING_VOLUME = '/mnt/volume-lon1-01/'   # should have at least 10GB space
+STORAGE_PREFIX = 'trialdata'
+WORKING_VOLUME = get_env_setting('WORKDIR', '/mnt/volume-lon1-01/' if not DEBUG else os.curdir)   # should have at least 10GB space
 WORKING_DIR = os.path.join(WORKING_VOLUME, STORAGE_PREFIX)
 INTERMEDIATE_CSV_PATH = os.path.join(WORKING_VOLUME, STORAGE_PREFIX, 'clinical_trials.csv')