From f61de11ca740a721722a25edd0b1ee2222d82846 Mon Sep 17 00:00:00 2001
From: Richard Jones <richard@cottagelabs.com>
Date: Thu, 5 Oct 2023 13:12:39 +0100
Subject: [PATCH] partial implementation of csv validator

---
 portality/bll/services/application.py         | 162 +++++++++++++++++-
 .../templates/publisher/journal_csv.html      |  64 +++++++
 portality/templates/publisher/nav.html        |   4 +-
 portality/view/publisher.py                   |  23 +++
 4 files changed, 248 insertions(+), 5 deletions(-)
 create mode 100644 portality/templates/publisher/journal_csv.html

diff --git a/portality/bll/services/application.py b/portality/bll/services/application.py
index 3e68447ce6..7cf2ec2158 100644
--- a/portality/bll/services/application.py
+++ b/portality/bll/services/application.py
@@ -1,4 +1,6 @@
 import logging
+import csv
+import json
 
 from portality.lib.argvalidate import argvalidate
 from portality.lib import dates
@@ -9,6 +11,9 @@
 from portality import lock
 from portality.bll.doaj import DOAJ
 from portality.ui.messages import Messages
+from portality.crosswalks.journal_questions import Journal2QuestionXwalk
+from portality.bll.exceptions import AuthoriseException
+from portality.forms.application_forms import ApplicationFormFactory
 
 class ApplicationService(object):
     """
@@ -281,7 +286,7 @@ def reject_update_request_of_journals(self, ids, account):
                 ur_ids.append(ur.id)
         return ur_ids
 
-    def update_request_for_journal(self, journal_id, account=None, lock_timeout=None):
+    def update_request_for_journal(self, journal_id, account=None, lock_timeout=None, lock_records=True):
         """
         Obtain an update request application object for the journal with the given journal_id
 
@@ -316,7 +321,7 @@ def update_request_for_journal(self, journal_id, account=None, lock_timeout=None
         # first retrieve the journal, and return empty if there isn't one.
         # We don't attempt to obtain a lock at this stage, as we want to check that the user is authorised first
         journal_lock = None
-        journal, _ = journalService.journal(journal_id)
+        journal, _ = journalService.journal(journal_id, lock_journal=lock_records, lock_timeout=lock_timeout)
         if journal is None:
             app.logger.info("Request for journal {x} did not find anything in the database".format(x=journal_id))
             return None, None, None
@@ -346,8 +351,9 @@ def update_request_for_journal(self, journal_id, account=None, lock_timeout=None
         elif account is not None:
             try:
                 authService.can_edit_application(account, application)
-                application_lock = lock.lock("suggestion", application.id, account.id)
-                journal_lock = lock.lock("journal", journal_id, account.id)
+                if lock_records:
+                    application_lock = lock.lock("suggestion", application.id, account.id)
+                    journal_lock = lock.lock("journal", journal_id, account.id)
             except lock.Locked as e:
                 if application_lock is not None: application_lock.delete()
                 if journal_lock is not None: journal_lock.delete()
@@ -561,3 +567,151 @@ def delete_application(self, application_id, account):
             if rjlock is not None: rjlock.delete()
 
         return
+
+    def validate_update_csv(self, file):
+        """
+        Validate a journal update CSV and report any problems found; no changes are saved
+
+        :param file: the CSV as a text or binary file-like object (e.g. an upload), or an iterable of text lines
+        :return: a CSVValidationReport carrying header, cell and general messages plus a processing log
+        """
+        import io  # local import, needed only to re-wrap the decoded file content
+
+        validation = CSVValidationReport()
+
+        # Uploads arrive as byte streams, which the csv module cannot parse. Decode with utf-8-sig so that the
+        # Byte Order Mark written by Windows tools is dropped instead of being glued onto the first header.
+        if hasattr(file, "read"):
+            content = file.read()
+            if isinstance(content, bytes):
+                try:
+                    content = content.decode("utf-8-sig")
+                except UnicodeDecodeError:
+                    validation.general(validation.ERROR, "File could not be read as UTF-8 encoded text")
+                    return validation
+            file = io.StringIO(content, newline="")
+
+        reader = csv.DictReader(file)
+        header_row = reader.fieldnames or []  # fieldnames is None when the file has no rows at all
+
+        # verify header row with current CSV headers, report errors
+        expected_headers = Journal2QuestionXwalk.question_list()
+        # must be a real collection: a map() iterator would be exhausted by the first membership test
+        lower_case_expected_headers = {h.lower() for h in expected_headers}
+
+        # Always perform a match check on supplied headers, not counting order
+        for i, h in enumerate(header_row):
+            if h and h not in expected_headers:
+                if h.lower() in lower_case_expected_headers:
+                    validation.header(validation.WARN, i, f'"{h}" has mismatching case to expected header.')
+                else:
+                    validation.header(validation.ERROR, i, f'"{h}" is not a valid header.')
+
+        # Rows are matched to journals by ISSN and checked against the title, so those columns are required
+        title_field = Journal2QuestionXwalk.q("title")
+        issn_fields = [Journal2QuestionXwalk.q("pissn"), Journal2QuestionXwalk.q("eissn")]
+        issn_columns = [header_row.index(f) for f in issn_fields if f in header_row]
+        if title_field not in header_row or not issn_columns:
+            validation.general(validation.ERROR, "CSV must contain the journal title column and at least one ISSN column")
+            return validation
+        title_pos = header_row.index(title_field)
+        issn_pos = issn_columns[0]  # column against which ISSN lookup failures are reported
+
+        # ~~ ->$JournalUpdateByCSV:Feature ~~
+        # Talking about spreadsheets: the header is row 1, so the data rows are numbered from 2
+        for row_ix, row in enumerate(reader, start=2):
+            validation.log(f'CSV row {row_ix}')
+
+            # Skip empty rows
+            if not any(row.values()):
+                validation.log("Skipping empty row.")
+                continue
+
+            # Find the journal by its ISSN(s), then require the supplied title to match the record
+            issns = [row.get(f) for f in issn_fields]
+            try:
+                j = models.Journal.find_by_issn(issns, in_doaj=True, max=1).pop(0)
+            except IndexError:
+                # cells missing from a short row come through as None, which str.join would reject
+                supplied = ", ".join(issn for issn in issns if issn)
+                validation.value(validation.ERROR, row_ix, issn_pos, "Could not find journal record for ISSN(s) {0}".format(supplied))
+                continue
+
+            if j.bibjson().title != row[title_field]:
+                validation.value(validation.ERROR, row_ix, title_pos, f'"{row[title_field]}" does not match journal title "{j.bibjson().title}"')
+                continue
+
+            validation.log('Validating update for journal with ID ' + j.id)
+
+            # Load remaining rows into application form as an update
+            # ~~ ^->JournalQuestions:Crosswalk ~~
+            update_form, updates = Journal2QuestionXwalk.question2form(j, row)
+
+            if len(updates) == 0:
+                validation.log("No updates to do")
+                continue
+
+            for upd in updates:
+                validation.log(upd)
+
+            # Create an update request for this journal
+            try:
+                # ~~ ^->UpdateRequest:Feature ~~
+                update_req, _, _ = self.update_request_for_journal(j.id, account=j.owner_account, lock_records=False)
+            except AuthoriseException as e:
+                validation.general(validation.ERROR, 'Could not create update request: {0}'.format(e.reason))
+                continue
+
+            # If we don't have a UR, we can't continue
+            if update_req is None:
+                validation.log('Journal not in DOAJ - missing or not public')
+                continue
+
+            # validate update_form - portality.forms.application_processors.PublisherUpdateRequest
+            # ~~ ^->UpdateRequest:FormContext ~~
+            formulaic_context = ApplicationFormFactory.context("update_request")
+            fc = formulaic_context.processor(
+                formdata=update_form,
+                source=update_req
+            )
+
+            if not fc.validate():
+                validation.general(validation.ERROR, 'Failed validation - {0}'.format(fc.form.errors))
+                # todo: ignore validation on fields that aren't supplied
+
+        return validation
+
+
+class CSVValidationReport:
+    """Accumulates the messages produced while validating a CSV and serialises them to JSON."""
+
+    WARN = "warn"
+    ERROR = "error"
+
+    def __init__(self):
+        self._log = []
+        self._values = {}
+        self._headers = {}
+        self._general = []
+
+    def general(self, error_type, msg):
+        """Record a message that carries no row or column position."""
+        self._general.append((error_type, msg))
+
+    def header(self, error_type, pos, msg):
+        """Record a message against the header at column pos, replacing any earlier one."""
+        self._headers[pos] = (error_type, msg)
+
+    def value(self, error_type, row, pos, msg):
+        """Record a message against the cell at (row, pos), replacing any earlier one."""
+        self._values.setdefault(row, {})[pos] = (error_type, msg)
+
+    def log(self, msg):
+        """Append a line to the processing log."""
+        self._log.append(msg)
+
+    def json(self):
+        """Return the whole report as a JSON string."""
+        report = dict(general=self._general, headers=self._headers,
+                      values=self._values, log=self._log)
+        return json.dumps(report)
diff --git a/portality/templates/publisher/journal_csv.html b/portality/templates/publisher/journal_csv.html
new file mode 100644
index 0000000000..a96f0dae5b
--- /dev/null
+++ b/portality/templates/publisher/journal_csv.html
@@ -0,0 +1,64 @@
+<!--~~JournalCSV:Page~~-->
+<!--~~->JournalCSV:Feature~~-->
+
+{% extends "publisher/publisher_base.html" %}
+
+{% block page_title %}Validate your Journal CSV{% endblock %}
+
+{% block publisher_content %}
+    <div class="row">
+        <header class="col-md-4">
+            <h2>Validate your Journal CSV</h2>
+
+            <div class="alert">
+                <p>Uploaded files must be in the same format in which they were supplied to you, and follow
+                    the data entry rules <a href="#">LINK TO THE RULES HERE</a></p>
+            </div>
+        </header>
+
+        <form id="upload_form" class="col-md-8 form form--compact" method="POST" action="{{ url_for("publisher.journal_csv_validate") }}" enctype="multipart/form-data">
+            <fieldset>
+                <h3 class="form__header">Journal CSV</h3>
+                <div class="form__question">
+                    <label for="upload-csv-file">Select a file</label>
+                    <input type="file" id="upload-csv-file" name="file">
+                    <p><small>Must be less than 1MB (TBC).</small></p>
+                    {% if error %}
+                        <p><small class="error">You must specify the file or upload from a link</small></p>
+                    {% endif %}
+                </div>
+            </fieldset>
+            <button type="submit" id="upload" class="button button--primary">Validate</button>
+        </form>
+    </div>
+
+    <div id="validation-results"></div>
+{% endblock %}
+
+{% block extra_js_bottom %}
+    <script type="text/javascript">
+        // Send the chosen CSV to the validation endpoint by AJAX instead of a full-page form post
+        $(document).ready(function() {
+            $("#upload").on("click", function(event) {
+                event.preventDefault();
+                let fd = new FormData();
+                let file = $('#upload-csv-file')[0].files[0];
+                fd.append("journal_csv", file);
+
+                $.ajax({
+                   // resolve the route through Flask, as the form's action attribute does, rather than hard-coding the path
+                   url: "{{ url_for('publisher.journal_csv_validate') }}",
+                   type: "POST",
+                   data: fd,
+                   processData: false,
+                   contentType: false,
+                   success: function(response) {
+                       alert("success");
+                   },
+                   error: function(jqXHR, textStatus, errorMessage) {
+                       console.log(errorMessage); // Optional
+                   }
+                });
+            })
+        })
+{% endblock %}
diff --git a/portality/templates/publisher/nav.html b/portality/templates/publisher/nav.html
index fce8c5a32b..48aea94426 100644
--- a/portality/templates/publisher/nav.html
+++ b/portality/templates/publisher/nav.html
@@ -5,6 +5,7 @@
 {% set metadata = url_for('publisher.metadata') %}
 {% set preservation = url_for('publisher.preservation') %}
 {% set help = url_for('publisher.help') %}
+{% set csv = url_for('publisher.journal_csv') %}
 
 {% set tabs = [
     (index, "My drafts", 0),
@@ -13,7 +14,8 @@
     (xml, "Upload article XML", 3),
     (metadata, "Enter article metadata", 4),
     (preservation, "Upload preservation file", 5),
-    (help, "Help", 6),
+    (csv, "Upload journal CSV", 6),
+    (help, "Help", 7),
     ]
 %}
 
diff --git a/portality/view/publisher.py b/portality/view/publisher.py
index 2410a8323a..29ed3ed1e7 100644
--- a/portality/view/publisher.py
+++ b/portality/view/publisher.py
@@ -365,6 +365,29 @@ def metadata():
 
         return fc.render_template(validated=validated)
 
+@blueprint.route("/journal-csv", methods=["GET"])
+@login_required
+@ssl_required
+@write_required()
+def journal_csv():
+    return render_template('publisher/journal_csv.html')  # page with the journal CSV upload form
+
+@blueprint.route("/journal-csv/validate", methods=["POST"])
+@login_required
+@ssl_required
+@write_required()
+def journal_csv_validate():
+    """Validate the uploaded journal CSV and return the report as JSON; 400 if it is missing or too large."""
+    file = request.files.get("journal_csv")
+    limit = app.config.get("JOURNAL_CSV_UPLOAD__MAX_FILE_SIZE", 1000000)
+    if file is None or len(file.stream.read(limit + 1)) > limit:  # FileStorage has no size(); bounded read instead
+        abort(400)
+    file.stream.seek(0)  # rewind so the validator reads the file from the start
+    report = DOAJ.applicationService().validate_update_csv(file)
+    resp = make_response(report.json())
+    resp.mimetype = "application/json"
+    return resp
+
 
 @blueprint.route("/help")
 @login_required