Skip to content

Commit

Permalink
partial implementation of csv validator
Browse files Browse the repository at this point in the history
  • Loading branch information
richard-jones committed Oct 5, 2023
1 parent 29e50df commit f61de11
Show file tree
Hide file tree
Showing 4 changed files with 248 additions and 5 deletions.
162 changes: 158 additions & 4 deletions portality/bll/services/application.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import logging
import csv
import json

from portality.lib.argvalidate import argvalidate
from portality.lib import dates
Expand All @@ -9,6 +11,9 @@
from portality import lock
from portality.bll.doaj import DOAJ
from portality.ui.messages import Messages
from portality.crosswalks.journal_questions import Journal2QuestionXwalk
from portality.bll.exceptions import AuthoriseException
from portality.forms.application_forms import ApplicationFormFactory

class ApplicationService(object):
"""
Expand Down Expand Up @@ -281,7 +286,7 @@ def reject_update_request_of_journals(self, ids, account):
ur_ids.append(ur.id)
return ur_ids

def update_request_for_journal(self, journal_id, account=None, lock_timeout=None):
def update_request_for_journal(self, journal_id, account=None, lock_timeout=None, lock_records=True):
"""
Obtain an update request application object for the journal with the given journal_id
Expand Down Expand Up @@ -316,7 +321,7 @@ def update_request_for_journal(self, journal_id, account=None, lock_timeout=None
# first retrieve the journal, and return empty if there isn't one.
# We don't attempt to obtain a lock at this stage, as we want to check that the user is authorised first
journal_lock = None
journal, _ = journalService.journal(journal_id)
journal, _ = journalService.journal(journal_id, lock_journal=lock_records, lock_timeout=lock_timeout)
if journal is None:
app.logger.info("Request for journal {x} did not find anything in the database".format(x=journal_id))
return None, None, None
Expand Down Expand Up @@ -346,8 +351,9 @@ def update_request_for_journal(self, journal_id, account=None, lock_timeout=None
elif account is not None:
try:
authService.can_edit_application(account, application)
application_lock = lock.lock("suggestion", application.id, account.id)
journal_lock = lock.lock("journal", journal_id, account.id)
if lock_records:
application_lock = lock.lock("suggestion", application.id, account.id)
journal_lock = lock.lock("journal", journal_id, account.id)
except lock.Locked as e:
if application_lock is not None: application_lock.delete()
if journal_lock is not None: journal_lock.delete()
Expand Down Expand Up @@ -561,3 +567,151 @@ def delete_application(self, application_id, account):
if rjlock is not None: rjlock.delete()

return

def validate_update_csv(self, file):
    """
    Validate a journal-update CSV and report the problems found, row by row.

    The header row is compared against the questions known to Journal2QuestionXwalk, then
    each data row is matched to a journal by ISSN, crosswalked to an update form, and run
    through the "update_request" form processor's validation.  Update requests are obtained
    with lock_records=False, so this method does not take record locks itself.

    :param file: the CSV to validate, as an iterable of lines.  Lines may be str, or UTF-8
        encoded bytes (as yielded by an uploaded file); a leading Byte Order Mark is ignored.
    :return: a CSVValidationReport holding general, header and per-cell issues plus a log
    """
    validation = CSVValidationReport()

    def _text_lines(stream):
        # csv needs str lines, but uploads arrive as bytes, and files produced on Windows
        # commonly start with a Byte Order Mark that would otherwise corrupt the first header.
        for line_no, line in enumerate(stream, start=1):
            if isinstance(line, bytes):
                try:
                    line = line.decode("utf-8")
                except UnicodeDecodeError:
                    validation.general(
                        validation.ERROR,
                        f"File is not valid UTF-8 (first problem at line {line_no}); "
                        "validation stopped at that point."
                    )
                    return
            if line_no == 1 and line.startswith("\ufeff"):
                line = line[1:]
            yield line

    reader = csv.DictReader(_text_lines(file))
    header_row = reader.fieldnames
    if not header_row:
        validation.general(validation.ERROR, "No header row could be read from the CSV.")
        return validation

    # verify header row with current CSV headers, report errors
    expected_headers = Journal2QuestionXwalk.question_list()
    # A set, not map(): a map object is exhausted by its first membership test
    lower_case_expected_headers = {expected.lower() for expected in expected_headers}

    # Always perform a match check on supplied headers, not counting order
    for i, h in enumerate(header_row):
        if not h or h in expected_headers:
            continue
        if h.lower() in lower_case_expected_headers:
            validation.header(validation.WARN, i, f'"{h}" has mismatching case to expected header.')
        else:
            validation.header(validation.ERROR, i, f'"{h}" is not a valid header.')

    # TODO: port the strict "same columns in the same order as the export" check from the
    # command-line script; the old version depended on script arguments that do not exist here.

    # The title and print ISSN columns are used both to identify the journal and to position
    # error reports, so without them no row can be checked.
    title_field = Journal2QuestionXwalk.q("title")
    pissn_field = Journal2QuestionXwalk.q("pissn")
    eissn_field = Journal2QuestionXwalk.q("eissn")
    missing_columns = [field for field in (title_field, pissn_field) if field not in header_row]
    if missing_columns:
        for field in missing_columns:
            validation.general(validation.ERROR, f'Required column "{field}" is missing from the CSV.')
        return validation
    title_pos = header_row.index(title_field)
    pissn_pos = header_row.index(pissn_field)

    # ~~ ->$JournalUpdateByCSV:Feature ~~
    # Talking about spreadsheets, so the header is row 1 and the first data row is row 2
    for row_ix, row in enumerate(reader, start=2):
        validation.log(f'CSV row {row_ix}')

        # Skip empty rows
        if not any(row.values()):
            validation.log("Skipping empty row.")
            continue

        # Identify the journal by whichever ISSNs the row supplies
        issns = [issn for issn in (row.get(pissn_field), row.get(eissn_field)) if issn]
        if not issns:
            validation.value(validation.ERROR, row_ix, pissn_pos, "No ISSN supplied; cannot identify the journal")
            continue

        matches = models.Journal.find_by_issn(issns, in_doaj=True, max=1)
        if not matches:
            validation.value(validation.ERROR, row_ix, pissn_pos, "Could not find journal record for ISSN(s) {0}".format(", ".join(issns)))
            continue
        j = matches[0]

        # The title acts as a cross-check that the ISSN resolved to the intended journal
        supplied_title = row.get(title_field)
        if j.bibjson().title != supplied_title:
            validation.value(validation.ERROR, row_ix, title_pos, f'"{supplied_title}" does not match journal title "{j.bibjson().title}"')
            continue

        validation.log('Validating update for journal with ID ' + j.id)

        # Load remaining rows into application form as an update
        # ~~ ^->JournalQuestions:Crosswalk ~~
        update_form, updates = Journal2QuestionXwalk.question2form(j, row)

        if not updates:
            validation.log("No updates to do")
            continue

        for upd in updates:
            validation.log(upd)

        # Create an update request for this journal (without locking: nothing is saved here)
        try:
            # ~~ ^->UpdateRequest:Feature ~~
            update_req, _jlock, _alock = self.update_request_for_journal(j.id, account=j.owner_account, lock_records=False)
        except AuthoriseException as e:
            validation.general(validation.ERROR, 'Could not create update request: {0}'.format(e.reason))
            continue

        # If we don't have a UR, we can't continue
        if update_req is None:
            validation.log('Journal not in DOAJ - missing or not public')
            continue

        # validate update_form - portality.forms.application_processors.PublisherUpdateRequest
        # ~~ ^->UpdateRequest:FormContext ~~
        formulaic_context = ApplicationFormFactory.context("update_request")
        fc = formulaic_context.processor(
            formdata=update_form,
            source=update_req
        )

        if not fc.validate():
            # general() requires the severity as its first argument
            validation.general(validation.ERROR, 'Failed validation - {0}'.format(fc.form.errors))
            # todo: ignore validation on fields that aren't supplied

    return validation


class CSVValidationReport:
    """
    Collects the outcome of validating a CSV.

    Four kinds of entry are kept: general issues (a list), header issues keyed by column
    position, cell issues keyed by row and then column position, and a plain message log.
    Recording a second issue for the same header position or the same cell replaces the first.
    """

    # Severity labels passed as ``error_type``
    WARN = "warn"
    ERROR = "error"

    def __init__(self):
        self._log = []
        self._general = []
        self._headers = {}
        self._values = {}

    def general(self, error_type, msg):
        """Record an issue that is not tied to a particular header or cell."""
        entry = (error_type, msg)
        self._general.append(entry)

    def header(self, error_type, pos, msg):
        """Record an issue against the header at column position ``pos``."""
        entry = (error_type, msg)
        self._headers[pos] = entry

    def value(self, error_type, row, pos, msg):
        """Record an issue against the cell at ``row`` / column position ``pos``."""
        row_issues = self._values.setdefault(row, {})
        row_issues[pos] = (error_type, msg)

    def log(self, msg):
        """Append a message to the log."""
        self._log.append(msg)

    def json(self):
        """Return the whole report serialised as a JSON string."""
        return json.dumps({
            "general": self._general,
            "headers": self._headers,
            "values": self._values,
            "log": self._log
        })
64 changes: 64 additions & 0 deletions portality/templates/publisher/journal_csv.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
<!--~~JournalCSV:Page~~-->
<!--~~->JournalCSV:Feature~~-->

{% extends "publisher/publisher_base.html" %}

{% block page_title %}Validate your Journal CSV{% endblock %}

{% block publisher_content %}
{# Upload form for validating a journal CSV; results are rendered into #validation-results #}
<div class="row">
    <header class="col-md-4">
        <h2>Validate your Journal CSV</h2>

        <div class="alert">
            <p>Uploaded files must be in the same format in which they were supplied to you, and follow
                the data entry rules <a href="#">LINK TO THE RULES HERE</a></p>
        </div>
    </header>

    <form id="upload_form" class="col-md-8 form form--compact" method="POST" action="{{ url_for('publisher.journal_csv_validate') }}" enctype="multipart/form-data">
        <fieldset>
            <h3 class="form__header">Journal CSV</h3>
            <div class="form__question">
                <label for="upload-csv-file">Select a file</label>
                {# the field name must match the key the validate endpoint reads from request.files #}
                <input type="file" id="upload-csv-file" name="journal_csv">
                <p><small>Must be less than 1MB (TBC).</small></p>
                {% if error %}
                    <p><small class="error">You must select a file to upload</small></p>
                {% endif %}
            </div>
        </fieldset>
        <button type="submit" id="upload" class="button button--primary">Validate</button>
    </form>
</div>

<div id="validation-results"></div>
{% endblock %}

{% block extra_js_bottom %}
<script type="text/javascript">
    // Submit the chosen CSV to the validation endpoint via AJAX instead of a full page post.
    $(document).ready(function() {
        $("#upload").on("click", function(event) {
            event.preventDefault();

            // Without this guard FormData would send the literal string "undefined"
            let file = $('#upload-csv-file')[0].files[0];
            if (!file) {
                $("#validation-results").text("Please select a file to validate.");
                return;
            }

            let fd = new FormData();
            fd.append("journal_csv", file);

            $.ajax({
                // Reuse the form's action (built with url_for) rather than a second, hard-coded copy of the route
                url: $("#upload_form").attr("action"),
                type: "POST",
                data: fd,
                // Let the browser build the multipart body and its boundary header
                processData: false,
                contentType: false,
                success: function(response) {
                    alert("success");
                },
                error: function(jqXHR, textStatus, errorMessage) {
                    console.log(errorMessage); // Optional
                }
            });
        })
    })
</script>
{% endblock %}
4 changes: 3 additions & 1 deletion portality/templates/publisher/nav.html
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
{% set metadata = url_for('publisher.metadata') %}
{% set preservation = url_for('publisher.preservation') %}
{% set help = url_for('publisher.help') %}
{% set csv = url_for('publisher.journal_csv') %}

{% set tabs = [
(index, "My drafts", 0),
Expand All @@ -13,7 +14,8 @@
(xml, "Upload article XML", 3),
(metadata, "Enter article metadata", 4),
(preservation, "Upload preservation file", 5),
(help, "Help", 6),
(csv, "Upload journal CSV", 6),
(help, "Help", 7),
]
%}

Expand Down
23 changes: 23 additions & 0 deletions portality/view/publisher.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,29 @@ def metadata():

return fc.render_template(validated=validated)

@blueprint.route("/journal-csv", methods=["GET"])
@login_required
@ssl_required
@write_required()
def journal_csv():
    """Serve the publisher page from which a journal CSV can be submitted for validation."""
    template_name = 'publisher/journal_csv.html'
    return render_template(template_name)

@blueprint.route("/journal-csv/validate", methods=["POST"])
@login_required
@ssl_required
@write_required()
def journal_csv_validate():
    """
    Validate an uploaded journal CSV and return the validation report as JSON.

    Expects a multipart upload under the field name "journal_csv".  Responds with 400 if the
    file is absent or larger than the JOURNAL_CSV_UPLOAD__MAX_FILE_SIZE setting (default
    1000000 bytes).
    """
    if "journal_csv" not in request.files:
        abort(400)
    file = request.files["journal_csv"]

    # The uploaded file object has no size() method, so measure the underlying stream
    # directly and rewind it so that validation reads from the start.
    max_size = app.config.get("JOURNAL_CSV_UPLOAD__MAX_FILE_SIZE", 1000000)
    file.stream.seek(0, 2)  # 2 == os.SEEK_END
    size = file.stream.tell()
    file.stream.seek(0)
    if size > max_size:
        abort(400)

    report = DOAJ.applicationService().validate_update_csv(file)
    resp = make_response(report.json())
    resp.mimetype = "application/json"
    return resp


@blueprint.route("/help")
@login_required
Expand Down

0 comments on commit f61de11

Please sign in to comment.