From 8c1cf2a6309edeba5c9241878408e2b8e1ceded0 Mon Sep 17 00:00:00 2001 From: Paul Beaudoin Date: Tue, 12 Jan 2016 14:26:24 -0500 Subject: [PATCH 01/23] rough impl of bot interface, final data export, dupe classification prevention --- app/controllers/admin/data_controller.rb | 8 +- app/controllers/classifications_controller.rb | 62 ++++++++++--- app/models/classification.rb | 37 +++++++- app/models/subject.rb | 50 +++++++++- app/models/subject_generation_method.rb | 4 + .../collect_unique.rb | 13 ++- app/models/user.rb | 14 ++- app/models/workflow.rb | 2 +- .../final_data_subject_serializer.rb | 93 ++++++++++++++++++- .../final_data_subject_set_serializer.rb | 2 +- app/views/admin/dashboard/index.html.erb | 10 +- .../register_project_static_routes.rb | 12 ++- lib/tasks/project.rake | 62 +++++++++++++ 13 files changed, 329 insertions(+), 40 deletions(-) diff --git a/app/controllers/admin/data_controller.rb b/app/controllers/admin/data_controller.rb index 5509f670..5a8f51d5 100644 --- a/app/controllers/admin/data_controller.rb +++ b/app/controllers/admin/data_controller.rb @@ -17,8 +17,14 @@ def download format.json {render json: CompleteSubjectsSerializer.new(@subjects)} end + elsif params[:download_status] == 'flat' + @subjects = Subject.all.skip(100).limit(1).first.child_subjects.where(workflow_id: nil) + respond_to do |format| + format.json {render json: @subjects.map { |s| FinalDataSubjectSerializer.new(s, root: false) }} + end + else - @sets = SubjectSet.all + @sets = SubjectSet.all.skip(101).limit 1 respond_to do |format| format.json {render json: FinalDataSerializer.new(@sets)} end diff --git a/app/controllers/classifications_controller.rb b/app/controllers/classifications_controller.rb index 6aa85298..b6b4532b 100644 --- a/app/controllers/classifications_controller.rb +++ b/app/controllers/classifications_controller.rb @@ -4,29 +4,65 @@ class ClassificationsController < ApplicationController def create - user = require_user! + # Is it a bot? 
+ user = nil + if request.headers["HTTP_ROBOT_AUTH"] + user = User.bot_user_by_auth request.headers["HTTP_ROBOT_AUTH"] + end - workflow_id = BSON::ObjectId.from_string params["classifications"]["workflow_id"] + user = require_user! if user.nil? + + workflow_id = params["classifications"]["workflow_id"] ? params["classifications"]["workflow_id"] : nil task_key = params["classifications"]["task_key"] annotation = params["classifications"]["annotation"] annotation = {} if annotation.nil? - started_at = params["classifications"]["metadata"]["started_at"] - finished_at = params["classifications"]["metadata"]["finished_at"] + + started_at = nil + finished_at = nil + if params["classifications"]["metadata"] + started_at = params["classifications"]["metadata"]["started_at"] + finished_at = params["classifications"]["metadata"]["finished_at"] + + else + started_at = finished_at = Time.new.strftime("%Y%m%dT%H%M%S%z") + end + subject_id = params["classifications"]["subject_id"] user_agent = request.headers["HTTP_USER_AGENT"] - @result = Classification.create( - workflow_id: workflow_id, - subject_id: subject_id, - location: location, + if workflow_id.nil? && ! params["workflow"].nil? + workflow = Workflow.find_by name: params["workflow"]["name"] + workflow_id = workflow.id + end + + workflow_id = BSON::ObjectId.from_string workflow_id if ! workflow_id.nil? + + if subject_id.nil? && (standard_url = params["subject"]["location"]["standard"]) + subject_id = Subject.find_or_create_root_by_standard_url(standard_url).id + end + + h = { annotation: annotation, - started_at: started_at, - finished_at: finished_at, - user_agent: user_agent, + location: location, + subject_id: subject_id, task_key: task_key, - user: user - ) + workflow_id: workflow_id, + user_id: user.id + } + if (@result = Classification.find_by_props(h)).nil? 
+ @result = Classification.create( + workflow_id: workflow_id, + subject_id: subject_id, + location: location, + annotation: annotation, + started_at: started_at, + finished_at: finished_at, + user_agent: user_agent, + task_key: task_key, + user: user + ) + end render json: @result end diff --git a/app/models/classification.rb b/app/models/classification.rb index 99f91f1f..4d59b819 100644 --- a/app/models/classification.rb +++ b/app/models/classification.rb @@ -11,6 +11,11 @@ class Classification field :finished_at field :user_agent + field :data_md5 + + before_create :generate_data_md5 + + belongs_to :workflow, :foreign_key => "workflow_id" belongs_to :user belongs_to :subject, foreign_key: "subject_id", inverse_of: :classifications @@ -86,9 +91,15 @@ def increment_subject_classification_count end if self.task_key == "flag_bad_subject_task" - subject.increment_flagged_bad_count_by_one - # Push user_id onto Subject.deleting_user_ids if appropriate - Subject.where({id: subject.id}).find_and_modify({"$addToSet" => {deleting_user_ids: user_id.to_s}}) + # If deleting user is creator (or a robot), immediately change status to bad + if subject.created_solely_by?(user) || subject.created_by_robot? + subject.bad! + + else + subject.increment_flagged_bad_count_by_one + # Push user_id onto Subject.deleting_user_ids if appropriate + Subject.where({id: subject.id}).find_and_modify({"$addToSet" => {deleting_user_ids: user_id.to_s}}) + end end if self.task_key == "flag_illegible_subject_task" @@ -110,6 +121,26 @@ def to_s "#{workflow_name} Classification (#{ ann.blank? ? 
task_key : ann})" end + def generate_data_md5 + props = { + annotation: annotation, + location: location, + subject_id: subject_id, + task_key: task_key, + workflow_id: workflow_id + } + self.data_md5 = self.class.data_md5_for_props(props) + end + + def self.find_by_props(props) + find_by data_md5: data_md5_for_props(props) + end + + def self.data_md5_for_props(props) + Digest::MD5.hexdigest(props.to_query) + end + + # Returns hash mapping distinct values for given field to matching count: def self.group_by_hour(match={}) agg = [] diff --git a/app/models/subject.rb b/app/models/subject.rb index f309300b..b3cd0759 100644 --- a/app/models/subject.rb +++ b/app/models/subject.rb @@ -25,7 +25,7 @@ class Subject field :type, type: String, default: "root" #options: "root", "secondary" field :status, type: String, default: "active" #options: "active", "inactive", "bad", "retired", "complete", "contentious" - field :meta_data, type: Hash + field :meta_data, type: Hash, default: {} field :classification_count, type: Integer, default: 0 field :random_no, type: Float field :secondary_subject_count, type: Integer, default: 0 @@ -75,6 +75,15 @@ class Subject # Index for fetching child subjects for a parent subject, optionally filtering by region NOT NULL index({parent_subject_id: 1, status: 1, region: 1}) + def created_by_robot? + created_solely_by? User.robot + end + + def created_solely_by?(user) + created_by = created_by_user_id == user.id.to_s + created_by ||= creating_user_ids.size == 1 && creating_user_ids.first == user.id.to_s + created_by + end def thumbnail location['thumbnail'].nil? ? location['standard'] : location['thumbnail'] @@ -131,7 +140,7 @@ def check_flagged_bad_count # calculate the percetage vote for retirement (pvr) # if pvr is equal or greater than retire_limit, set self.status == retired. 
def check_retire_by_vote - assesment_classifications = classifications.where(task_key: "completion_assessment_task").count + assesment_classifications = number_of_completion_assessments if assesment_classifications > 2 percentage_for_retire = retire_count / assesment_classifications.to_f if percentage_for_retire >= workflow.retire_limit @@ -141,6 +150,10 @@ def check_retire_by_vote end end + def number_of_completion_assessments + classifications.where(task_key: "completion_assessment_task").count || 0 + end + def bad! status! 'bad' @@ -150,7 +163,6 @@ def bad! def retire! return if status == "bad" - return if classifying_user_ids.length < workflow.retire_limit status! 'retired' subject_set.subject_completed_on_workflow(workflow) if ! workflow.nil? @@ -174,6 +186,10 @@ def calculate_most_popular_parent_classification buckets.map { |(k,v)| {ann: k, percentage: v.to_f / parent_classifications.count } }.first end + def parent_workflow + parent_classifications.limit(1).first.workflow + end + def to_s "#{status != 'active' ? "[#{status.capitalize}] " : ''}#{workflow.nil? ? 'Final' : workflow.name.capitalize} Subject (#{type})" @@ -204,6 +220,34 @@ def self.group_by_field_for_group(group, field, match={}) end + def self.find_or_create_root_by_standard_url(standard_url) + subject = Subject.find_by type: 'root', "location.standard" => standard_url + if subject.nil? + subject = Subject.create_root_for_url standard_url + end + subject + end + + def self.create_root_for_url(standard_url) + + require 'fastimage' + width, height = FastImage.size(standard_url,:raise_on_failure=>false, :timeout=>10.0) + + subject = Subject.create({ + type: 'root', + subject_set: SubjectSet.create({project: Project.current, group: Project.current.groups.first, state: 'active'}), + location: { + standard: standard_url + }, + width: width, + height: height + }) + subject.workflow = Workflow.find_by name: 'mark' + subject.activate! 
+ subject + end + + private def status!(status) diff --git a/app/models/subject_generation_method.rb b/app/models/subject_generation_method.rb index 9faabbbc..caf5d9dc 100644 --- a/app/models/subject_generation_method.rb +++ b/app/models/subject_generation_method.rb @@ -47,6 +47,10 @@ def subject_attributes_from_classification(classification) if (label = task.tool_label(classification)) region[:label] = label end + # If region.color not passed from client, derive it from workflow_task tool config: + if ! region[:color] && task.sub_tool_config(classification) + region[:color] = task.sub_tool_config(classification)[:color] + end { parent_subject: classification.subject, diff --git a/app/models/subject_generation_methods/collect_unique.rb b/app/models/subject_generation_methods/collect_unique.rb index b0c685f8..ebfc029d 100644 --- a/app/models/subject_generation_methods/collect_unique.rb +++ b/app/models/subject_generation_methods/collect_unique.rb @@ -34,7 +34,8 @@ def process_classification(classification) if num_parent_classifications >= classification.workflow.generates_subjects_after # Get number of distinct classifications: - num_vals = classification.child_subject.data['values'].nil? ? -1 : classification.child_subject.data['values'].size + # num_vals = classification.child_subject.data['values'].nil? ? -1 : classification.child_subject.data['values'].size + num_vals = atts[:data]['values'].size # Where will this generated subject appear, if anywhere? next_workflow = classification.child_subject.workflow @@ -49,13 +50,13 @@ def process_classification(classification) verify_method = next_workflow.generates_subjects_method # If next workflow's generation method is most-popular and everyone transcribed the same thing, auto upgrade to 'complete': - if num_vals == 1 && verify_method == 'most-popular' + # (but only if num_parent_classifications > 1) + if num_vals == 1 && verify_method == 'most-popular' && num_parent_classifications > 1 atts[:status] = 'complete' # .. 
Otherwise, activate the generated subject into the next workflow: else - classification.child_subject.activate! - atts.delete :status + atts[:status] = 'active' end end end @@ -68,9 +69,11 @@ def process_classification(classification) atts[:creating_user_ids] ||= [] classification.child_subject.creating_user_ids.push classification.user_id - # puts "Saving atts to classification: #{atts.inspect}" classification.child_subject.update_attributes atts + # Now that child subj is saved (with a parent subject_set) Fire activate hooks if activating: + classification.child_subject.activate! if atts[:status] == 'active' + classification.child_subject end diff --git a/app/models/user.rb b/app/models/user.rb index 00258c5f..4d9a026c 100644 --- a/app/models/user.rb +++ b/app/models/user.rb @@ -36,7 +36,7 @@ class User field :profile_url, :type => String # URI of user profile, if any field :status, :type => String, :default => 'active' - field :role, :type => String, :default => 'user' # user, admin, team + field :role, :type => String, :default => 'user' # user, admin, team, robot field :guest, :type => Boolean, :default => false field :tutorial_complete, :type => Boolean, :default => false @@ -222,4 +222,16 @@ def self.group_by_hour(match={}) end end + def self.robot + @robot_user ||= ( + find_by(role: 'robot') + ) + end + + def self.bot_user_by_auth(auth) + user = User.find_or_create_by name: 'Robot', role: 'robot' + user.save! 
validate: false + user + end + end diff --git a/app/models/workflow.rb b/app/models/workflow.rb index 7a380337..bc0d9220 100644 --- a/app/models/workflow.rb +++ b/app/models/workflow.rb @@ -6,7 +6,7 @@ class Workflow field :key, type: String field :label, type: String field :first_task, type: String - field :retire_limit, type: Integer, default: 3 + field :retire_limit, type: Float, default: 0.75 field :subject_fetch_limit, type: Integer, default: 10 field :generates_subjects, type: Boolean, default: true field :generates_subjects_after, type: Integer, default: 0 diff --git a/app/serializers/final_data_subject_serializer.rb b/app/serializers/final_data_subject_serializer.rb index a2573cfa..80aad4d7 100644 --- a/app/serializers/final_data_subject_serializer.rb +++ b/app/serializers/final_data_subject_serializer.rb @@ -2,11 +2,20 @@ class FinalDataSubjectSerializer < ActiveModel::MongoidSerializer attributes :id, :type, :location, :region, :width, :height, :meta_data attributes :data # , :task - attributes :classification_count + attributes :status + # attributes :classification_count attributes :generated_in_workflow - attributes :child_subjects + # attributes :child_subjects attributes :transcription_classifications + attributes :assertions_breakdown + attributes :classifications_breakdown + attributes :assertions + + # attributes :flagged_bad + # ttributes :flagged_for_retirement + attributes :flags + def attributes data = super @@ -27,17 +36,93 @@ def attributes if data[:generated_in_workflow] == 'mark' # Mark subjects have roughly same info in :data and :region so keep :region - data.delete :data + data.delete :data if data[:region] else # .. For all other child subjects, delete :region since it's avail in parent data.delete :region end data.delete :transcription_classifications if data[:transcription_classifications].empty? - data.delete :child_subjects if data[:child_subjects].empty? + # data.delete :child_subjects if data[:child_subjects].empty? 
data end + def assertions + @assertions ||= flattened_subjects(object.child_subjects) + @assertions + end + + def classifications_breakdown + all_classifications = [] + @all_subjects.each do |s| + all_classifications += s.classifications + end + ret = all_classifications.inject({}) { |h, c| h[c.task_key] ||= 0; h[c.task_key] += 1; h } + ret[:total] = object.classifications.count + ret + end + + def assertions_breakdown + assertions.inject({}) do |h, a| + h[:all_workflows] ||= {} + h[:all_workflows][:total] ||= 0 + h[:all_workflows][:total] += 1 + h[:all_workflows][a.status] ||= 0 + h[:all_workflows][a.status] += 1 + + h[a.created_in_workflow] ||= {} + + h[a.created_in_workflow][:total] ||= 0 + h[a.created_in_workflow][:total] += 1 + + h[a.created_in_workflow][a.status] ||= 0 + h[a.created_in_workflow][a.status] += 1 + + h + end + end + + def flattened_subjects(subjects, parents = []) + @all_subjects ||= [] + @all_subjects += subjects + + ret = [] + subjects.each do |s| + next if s.parent_classifications.limit(1).first.task_key == 'completion_assessment_task' + + if s.child_subjects.size > 0 + ret += flattened_subjects(s.child_subjects, parents + [s]) + + else + ret << FinalSubjectAssertionSerializer.new(subject: s, parents: parents) + end + end + ret + end + + def flags + { + complete: flagged_for_retirement, + bad: { + votes_in_favor: object.flagged_bad_count || 0 + } + } + end + + def flagged_for_retirement + votes = object.number_of_completion_assessments + h = { + votes_in_favor: object.retire_count || 0, + total_votes: votes, + } + h[:percentage_in_favor] = object.retire_count / votes.to_f if ! object.retire_count.nil? && votes > 0 + h + end + + def status + object.status + end + def generated_in_workflow return nil if object.parent_subject.nil? 
puts "parent subj: #{object}" diff --git a/app/serializers/final_data_subject_set_serializer.rb b/app/serializers/final_data_subject_set_serializer.rb index ebc04924..183ac974 100644 --- a/app/serializers/final_data_subject_set_serializer.rb +++ b/app/serializers/final_data_subject_set_serializer.rb @@ -3,7 +3,7 @@ class FinalDataSubjectSetSerializer < ActiveModel::MongoidSerializer attributes :id attributes :name attributes :meta_data - attributes :classification_count + # attributes :classification_count attributes :subjects def subjects diff --git a/app/views/admin/dashboard/index.html.erb b/app/views/admin/dashboard/index.html.erb index 0b434ca4..5944a649 100644 --- a/app/views/admin/dashboard/index.html.erb +++ b/app/views/admin/dashboard/index.html.erb @@ -106,7 +106,7 @@
-

Verify

+

Transcriptions Being Verified

0 Total @@ -117,11 +117,13 @@
Active:
-
Verify subjects actively being voted upon
+
Transcriptions actively being voted upon
Inactive:
-
Verify subjects waiting for one or more transcriptions before being activated
+
Transcriptions awaiting one or more additional transcriptions before being voted upon
Complete:
-
Verify subjects that have received sufficient votes to choose one best transcription
+
Transcriptions that skipped voting because transcriptions were identical
+
Retired:
+
Transcriptions taken out of voting because voting has ended
diff --git a/config/initializers/register_project_static_routes.rb b/config/initializers/register_project_static_routes.rb index 05cb4a14..9c61b0ad 100644 --- a/config/initializers/register_project_static_routes.rb +++ b/config/initializers/register_project_static_routes.rb @@ -1,7 +1,11 @@ API::Application.configure do - if Project.current - project_assets_path = "./project/#{Project.current.key}/assets" - puts "Routing static assets from #{project_assets_path}" - Rails.application.config.middleware.insert_after ActionDispatch::Static, ActionDispatch::Static, project_assets_path + begin + if Project.current + project_assets_path = "./project/#{Project.current.key}/assets" + puts "Routing static assets from #{project_assets_path}" + Rails.application.config.middleware.insert_after ActionDispatch::Static, ActionDispatch::Static, project_assets_path + end + rescue + puts "FAILED to register static routing" end end diff --git a/lib/tasks/project.rake b/lib/tasks/project.rake index 6bf1f6ed..9de3e0dd 100644 --- a/lib/tasks/project.rake +++ b/lib/tasks/project.rake @@ -400,6 +400,68 @@ namespace :project do end + task :export, [:project_key, :rebuild] => :environment do |task, args| + args.with_defaults rebuild: true + rebuild = args[:rebuild] != 'false' + + project = Project.find_by key: args[:project_key] + + puts "Rebuild? #{rebuild}" + + export_base = "tmp/export/#{project.key}" + Dir.mkdir(export_base) unless File.exists?(export_base) + + start = Time.now + count = project.subject_sets.count + limit = 100 + built = 0 + (0..count).step(limit).each do |offset| + sets = project.subject_sets.offset(offset).limit(limit).each_with_index do |set, i| + path = "#{export_base}/#{set.id}.json" + next if File.exist?(path) && ! rebuild + + content = nil + begin + content = FinalDataSubjectSetSerializer.new(set).to_json + rescue + puts "Error building #{set.id}" + end + + if ! content.nil? 
+ File.open path, "w" do |f| + f << content + end + built += 1 + end + + # puts "Wrote #{i+1} of #{count}: #{content.size}b to #{path}" + ellapsed = Time.now - start + per_set = ellapsed / built + remaining = per_set * (count - (offset + i+1)) / 60 / 60 + complete = (offset + i+1).to_f / count * 100 + # puts "Est time remaining: #{ellapsed} (#{per_set}) #{remaining}h" + $stderr.print "\r#{'%.8f' % complete}% complete. #{'%.1f' % remaining}h remaining. Built #{offset +i+1} of #{count}" + + end + end + + `zip -r public/exports.zip tmp/export` + puts "Finished building exports. Download at: /exports.zip" + end + + task :import_assertions, [:project_key] => :environment do |task, args| + project_key = args[:project_key] + + FinalSubjectSet.destroy_all + + Dir.glob("tmp/export/#{project_key}/*.json").each do |file| + h = JSON.parse File.read(file) + h = h['final_data_subject_set'] + set = FinalSubjectSet.find_or_initialize_by id: h['id'] + set.update_attributes h + puts "Saved #{h['id']}" + end + end From 7ff58fcbcb9aedf6d1c64ec5511db0fa941eed30 Mon Sep 17 00:00:00 2001 From: Paul Beaudoin Date: Fri, 29 Jan 2016 17:55:38 -0500 Subject: [PATCH 02/23] Adding a mess of new stuff including: final_* models and serializers to support manually-built data exports, which may be made public to be discoverable by ATOM, downloaded as zips, browsed by keyword; BotUser support administered by rake, which allow programatically generated classifications to be organized under a single user and with special ability to create missing subjects by image url (includes bot-example.rb in emigrant proj folder) --- Gemfile | 4 +- Gemfile.lock | 2 +- .../javascripts/components/app-router.cjsx | 16 ++ .../components/final-subject-set-browser.cjsx | 156 ++++++++++++++++ .../components/final-subject-set-page.cjsx | 79 ++++++++ app/assets/stylesheets/application.styl | 2 + .../final-subject-set-browser.styl | 111 ++++++++++++ app/controllers/admin/data_controller.rb | 36 +--- 
app/controllers/application_controller.rb | 4 + app/controllers/classifications_controller.rb | 9 +- .../final_data_exports_controller.rb | 23 +++ .../final_subject_sets_controller.rb | 21 +++ app/controllers/projects_controller.rb | 9 - app/models/bot_user.rb | 47 +++++ app/models/classification.rb | 4 +- app/models/concerns/group_by_field.rb | 19 ++ app/models/final_data_export.rb | 11 ++ app/models/final_subject.rb | 117 ++++++++++++ app/models/final_subject_assertion.rb | 122 +++++++++++++ app/models/final_subject_set.rb | 82 +++++++++ app/models/project.rb | 29 ++- app/models/subject.rb | 13 +- app/models/user.rb | 15 +- app/models/workflow_task.rb | 1 + app/serializers/final_data_serializer.rb | 19 -- .../final_data_subject_serializer.rb | 170 ------------------ .../final_data_subject_set_serializer.rb | 17 -- .../final_subject_assertion_serializer.rb | 19 ++ app/serializers/final_subject_serializer.rb | 11 ++ .../final_subject_set_serializer.rb | 14 ++ app/serializers/generic_result_serializer.rb | 42 +++++ app/serializers/project_serializer.rb | 16 +- app/views/admin/data/index.html.erb | 54 +++--- .../final_data_exports/index.atom.builder | 11 ++ config/routes.rb | 6 + lib/tasks/bot.rake | 48 +++++ lib/tasks/project.rake | 118 ++++++------ project/emigrant/bot-example.rb | 151 ++++++++++++++++ project/emigrant/workflows/transcribe.json | 33 ++-- 39 files changed, 1283 insertions(+), 378 deletions(-) create mode 100644 app/assets/javascripts/components/final-subject-set-browser.cjsx create mode 100644 app/assets/javascripts/components/final-subject-set-page.cjsx create mode 100644 app/assets/stylesheets/final-subject-set-browser.styl create mode 100644 app/controllers/final_data_exports_controller.rb create mode 100644 app/controllers/final_subject_sets_controller.rb create mode 100644 app/models/bot_user.rb create mode 100644 app/models/concerns/group_by_field.rb create mode 100644 app/models/final_data_export.rb create mode 100644 
app/models/final_subject.rb create mode 100644 app/models/final_subject_assertion.rb create mode 100644 app/models/final_subject_set.rb delete mode 100644 app/serializers/final_data_serializer.rb delete mode 100644 app/serializers/final_data_subject_serializer.rb delete mode 100644 app/serializers/final_data_subject_set_serializer.rb create mode 100644 app/serializers/final_subject_assertion_serializer.rb create mode 100644 app/serializers/final_subject_serializer.rb create mode 100644 app/serializers/final_subject_set_serializer.rb create mode 100644 app/serializers/generic_result_serializer.rb create mode 100644 app/views/final_data_exports/index.atom.builder create mode 100644 lib/tasks/bot.rake create mode 100644 project/emigrant/bot-example.rb diff --git a/Gemfile b/Gemfile index 70b8a49f..6c79e565 100644 --- a/Gemfile +++ b/Gemfile @@ -14,7 +14,7 @@ gem 'omniauth-facebook' gem "omniauth-google-oauth2" gem 'omniauth-zooniverse', '~> 0.0.3' -gem 'mongoid', '~> 4.0.2' +gem 'mongoid' # , '~> 4.0.2' gem 'active_model_serializers' gem 'mongoid-serializer' gem 'rack-cors', :require => 'rack/cors' @@ -38,6 +38,8 @@ gem 'puma', '~> 2.14.0' gem 'logstasher', '~> 0.6' +# gem 'mongoid_fulltext' + group :development do gem 'dotenv-rails' end diff --git a/Gemfile.lock b/Gemfile.lock index 1e5a1a39..c4875856 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -322,7 +322,7 @@ DEPENDENCIES kaminari launchy logstasher (~> 0.6) - mongoid (~> 4.0.2) + mongoid mongoid-rspec (>= 1.6.0)! 
mongoid-serializer moped diff --git a/app/assets/javascripts/components/app-router.cjsx b/app/assets/javascripts/components/app-router.cjsx index ba99b6ce..2ee12f02 100644 --- a/app/assets/javascripts/components/app-router.cjsx +++ b/app/assets/javascripts/components/app-router.cjsx @@ -11,6 +11,8 @@ Verify = require './verify' # TODO Group routes currently not implemented GroupPage = require './group-page' GroupBrowser = require './group-browser' +FinalSubjectSetBrowser = require './final-subject-set-browser' +FinalSubjectSetPage = require './final-subject-set-page' Project = require 'models/project.coffee' @@ -74,6 +76,20 @@ class AppRouter name={workflow.name + '_entire_page'} /> } + + + { if project.downloadable_data + + } + { # Project-configured pages: project.pages?.map (page, key) => + entered_keyword: @props.query.keyword + searched_keyword: null + fetching_keyword: null + current_page: 1 + more_pages: false + results: [] + project: null + + componentDidMount: -> + @checkKeyword() + + API.type('projects').get().then (result)=> + @setState project: new Project(result[0]) + + componentWillReceiveProps: (new_props) -> + @checkKeyword new_props + + checkKeyword: (props = @props) -> + if props.query.keyword + @fetch props.query.keyword + + fetch: (keyword, page = 1) -> + return if ! 
@isMounted() + + if keyword != @state.fetching_keyword + + results = @state.results + results = [] if @state.searched_keyword != keyword + @setState fetching_keyword: keyword, fetching_page: page, results: results, () => + per_page = 20 + params = + keyword: keyword + per_page: per_page + page: @state.fetching_page + + API.type('final_subject_sets').get(params).then (sets) => + results = @state.results + offset = (@state.fetching_page-1) * per_page + for s,i in sets + results[i + offset] = s + @setState + results: results + searched_keyword: @props.query.keyword + current_page: @state.fetching_page + fetching_keyword: null + fetching_page: null + more_pages: sets?[0]?.getMeta('next_page') + + handleKeyPress: (e) -> + if @isMounted() + + if [13].indexOf(e.keyCode) >= 0 # ENTER: + @search e.target.value + + search: (keyword) -> + keyword = @refs.search_input?.getDOMNode().value.trim() unless keyword? + + @transitionTo "final_subject_sets", null, {keyword: keyword} + + loadMore: -> + @fetch @state.searched_keyword, @state.current_page + 1 + + handleChange: (e) -> + @setState entered_keyword: e.target.value + + render: -> + return null if ! @state.project? + +
+ + { if ! @state.project.downloadable_data +
+

Data Exports Not Available

+

Sorry, but public data exports are not enabled for this project yet.

+
+ + else +
+ Download Latest Raw Data + + +

Data Exports

+ + { if ! @state.searched_keyword +
+

Download

+ +

Participants have made {@state.project.classification_count.toLocaleString()} contributions to {@state.project.title} to date. This project periodically builds a merged, anonymized dump of that data, which is made public here.

+ +

You can download the latest using the button in the upper-right. For help interpretting the data, see Scribe WIKI on Data Exports.

+ +

Browse

+ +

Preview the data by searching by keyword below:

+
+ } + +
+ + +
+ + { if @state.searched_keyword && @state.results.length == 0 +

No matches yet for "{@state.searched_keyword}"

+ + else if @state.results.length > 0 +
+

Found {@state.results[0].getMeta('total')} matches

+
    + { for set in @state.results + url = "/#/data/exports/#{set.id}?keyword=#{@state.searched_keyword}" + matches = [] + safe_keyword = (w.replace(/\W/g, "\\$&") for w in @state.searched_keyword.toLowerCase().replace(/"/g,'').split(' ')).join("|") + regex = new RegExp("(#{safe_keyword})", 'gi') + for k of set.search_terms_by_field + matches.push(field: k, term: v) for v in set.search_terms_by_field[k] when v.match(regex) +
  • +
    + + + +
    +
    + { for m,i in matches[0...2] + + } +
    +
  • + } +
+ { if @state.more_pages + + } +
+ } +
+ } +
+ diff --git a/app/assets/javascripts/components/final-subject-set-page.cjsx b/app/assets/javascripts/components/final-subject-set-page.cjsx new file mode 100644 index 00000000..4cdcd27e --- /dev/null +++ b/app/assets/javascripts/components/final-subject-set-page.cjsx @@ -0,0 +1,79 @@ +React = require 'react' +API = require '../lib/api' + +GenericButton = require('components/buttons/generic-button') + +module.exports = React.createClass + displayName: 'FinalSubjectSetPage' + + getInitialState:-> + set: null + + componentDidMount: -> + API.type("final_subject_sets").get(@props.params.final_subject_set_id).then (set) => + @setState + set: set + + render: -> + return null if ! @state.set + +
+
+ Download Raw Data +

Set {@state.set.id}

+ +
    + { for subject in @state.set.subjects +
  • + +
      + { + assertions = subject.assertions.sort (a1,a2) -> + if a1.region.y < a2.region.y + -1 + else + 1 + null + } + { for assertion,i in assertions when assertion.name +
    • +

      {assertion.name}

      + +
        + { for k of assertion.data + console.log "assertion data: ", k, assertion.data +
      • + {assertion.data[k]} + { if k != 'value' + ({k.replace /_/g, ' '}) + } +
      • + } +
      +
      +
      Confidence
      +
      {Math.round(100 * assertion.confidence)}%
      +
      Status
      +
      {assertion.status.replace /_/, ' '}
      +
      Distinct Classifications
      +
      {assertion.classifications?.length || 0}
      +
      + { + viewer_width = assertion.region.width + scale = viewer_width / assertion.region.width + s = + background: "url(#{subject.location.standard}) no-repeat -#{Math.round(assertion.region.x * scale)}px -#{Math.round(assertion.region.y * scale)}px" + width: viewer_width + 'px' + height: Math.round(assertion.region.height * scale) + 'px' +
      + } +
    • + } +
    + +
  • + } +
+
+
+ diff --git a/app/assets/stylesheets/application.styl b/app/assets/stylesheets/application.styl index 727a45f5..0680948c 100644 --- a/app/assets/stylesheets/application.styl +++ b/app/assets/stylesheets/application.styl @@ -32,6 +32,8 @@ @import './groups.styl' @import './group-browser.styl' +@import './final-subject-set-browser.styl' + /* MARK STYLES */ @import './components/mark/point-tool.styl' @import './components/mark/rectangle-tool.styl' diff --git a/app/assets/stylesheets/final-subject-set-browser.styl b/app/assets/stylesheets/final-subject-set-browser.styl new file mode 100644 index 00000000..e0b4f1b0 --- /dev/null +++ b/app/assets/stylesheets/final-subject-set-browser.styl @@ -0,0 +1,111 @@ +.final-subject-set-browser + ul + list-style none + padding-left 0 + + .json-link + width auto + float right + margin-left 10px + + + // Search page: + + form + padding-bottom 1em + + input + vertical-align middle + + input#data-search + font-size 2em + margin-right 10px + + ul.results + li + display inline-block + margin 0 10px 10px 0 + width 400px + height 150px + overflow hidden + + a + color #2b3a42 + text-decoration none + + .image + width 170px + float left + + .matches + width 200px + float left + + .match + padding-bottom 0.5em + line-height 1.5em + + .field + font-weight bold + + &:after + content ":" + + .term + padding-left 1em + + em + font-weight bold + color TERTIARY_NORMAL + + + // Set page: + .final-subject-set-page + img.standard-image + max-width 600px + + h3 + clear both + margin-bottom 0 + + .confidence + opacity 0.5 + + &:after + content "% confidence" + + + ul.assertion-data + clear left + + li + color gray + + span.value + font-weight bold + color #2b3a42 + + span.data-key + margin-left 20px + + dl.assertion-properties + clear left + + dt,dd + display inline + color gray + + dt + &:after + content ":" + + dd + margin-left 10px + margin-right 40px + + .image-crop + opacity 0.7 + + &:hover + opacity 1 + diff --git 
a/app/controllers/admin/data_controller.rb b/app/controllers/admin/data_controller.rb index 5a8f51d5..2eafd645 100644 --- a/app/controllers/admin/data_controller.rb +++ b/app/controllers/admin/data_controller.rb @@ -1,34 +1,16 @@ class Admin::DataController < Admin::AdminBaseController def index - @num_complete = Subject.complete.count - @num_non_root = Subject.active_non_root.count - end - - def download - if params[:download_format] - redirect_to "#{admin_data_download_path}.#{params[:download_format]}?download_status=#{params[:download_status]}" - - else - - if params[:download_status] == 'complete' - @subjects = Subject.complete - respond_to do |format| - format.json {render json: CompleteSubjectsSerializer.new(@subjects)} - end - - elsif params[:download_status] == 'flat' - @subjects = Subject.all.skip(100).limit(1).first.child_subjects.where(workflow_id: nil) - respond_to do |format| - format.json {render json: @subjects.map { |s| FinalDataSubjectSerializer.new(s, root: false) }} - end - - else - @sets = SubjectSet.all.skip(101).limit 1 - respond_to do |format| - format.json {render json: FinalDataSerializer.new(@sets)} + @project = Project.current + if request.post? + if (proj = params[:project]) + if (v = proj[:downloadable_data]) + new_val = v == '1' + puts "updating project: #{new_val} because #{v}" + @project.update_attributes downloadable_data: new_val end end end - end + @export = FinalDataExport.most_recent.first + end end diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb index b7395a64..62424948 100644 --- a/app/controllers/application_controller.rb +++ b/app/controllers/application_controller.rb @@ -10,6 +10,10 @@ def require_user! 
current_or_guest_user(create_if_missing = true) end + def get_bot_user_from_request(request) + BotUser.by_auth request.headers + end + # Get currently logged-in user, creating guest as indicated def current_or_guest_user(create_if_missing = false) if current_user diff --git a/app/controllers/classifications_controller.rb b/app/controllers/classifications_controller.rb index b6b4532b..f2e2bc94 100644 --- a/app/controllers/classifications_controller.rb +++ b/app/controllers/classifications_controller.rb @@ -5,10 +5,7 @@ class ClassificationsController < ApplicationController def create # Is it a bot? - user = nil - if request.headers["HTTP_ROBOT_AUTH"] - user = User.bot_user_by_auth request.headers["HTTP_ROBOT_AUTH"] - end + user = get_bot_user_from_request request user = require_user! if user.nil? @@ -31,6 +28,7 @@ def create subject_id = params["classifications"]["subject_id"] user_agent = request.headers["HTTP_USER_AGENT"] + # If workflow not found by id, maybe it was specified by name? if workflow_id.nil? && ! params["workflow"].nil? workflow = Workflow.find_by name: params["workflow"]["name"] workflow_id = workflow.id @@ -38,7 +36,8 @@ def create workflow_id = BSON::ObjectId.from_string workflow_id if ! workflow_id.nil? - if subject_id.nil? && (standard_url = params["subject"]["location"]["standard"]) + # If user is a bot, consider creating the subject on the fly: + if user.is_a?(BotUser) && subject_id.nil? 
&& (standard_url = params["subject"]["location"]["standard"]) subject_id = Subject.find_or_create_root_by_standard_url(standard_url).id end diff --git a/app/controllers/final_data_exports_controller.rb b/app/controllers/final_data_exports_controller.rb new file mode 100644 index 00000000..fbd0057a --- /dev/null +++ b/app/controllers/final_data_exports_controller.rb @@ -0,0 +1,23 @@ +class FinalDataExportsController < FinalDataController + + def latest + puts "FinalDataExport.most_recent.first: #{FinalDataExport.most_recent.first.inspect}" + show FinalDataExport.most_recent.first + end + + def show(export = nil) + export = FinalDataExport.find(params[:id]) unless export + return render text: 'Not found.', status: 404 if export.nil? + + redirect_to export.path + end + + def index + @exports = FinalDataExport.most_recent.limit(20) + + respond_to do |format| + format.atom + end + end + +end diff --git a/app/controllers/final_subject_sets_controller.rb b/app/controllers/final_subject_sets_controller.rb new file mode 100644 index 00000000..bf3e7980 --- /dev/null +++ b/app/controllers/final_subject_sets_controller.rb @@ -0,0 +1,21 @@ +class FinalSubjectSetsController < FinalDataController + respond_to :json + + def show + @set = FinalSubjectSet.find params[:id] + respond_with FinalSubjectSetSerializer.new @set + end + + def index + per_page = get_int :per_page, 20, (0..50) + page = get_int :page, 1 + + keyword = params[:keyword] + + @sets = FinalSubjectSet.page(page).per(per_page) + @sets = @sets.where({"$text" => {"$search" => keyword} } ) if keyword + + respond_with GenericResultSerializer.new(@sets) + end + +end diff --git a/app/controllers/projects_controller.rb b/app/controllers/projects_controller.rb index 4762c095..6a7c7072 100644 --- a/app/controllers/projects_controller.rb +++ b/app/controllers/projects_controller.rb @@ -12,14 +12,5 @@ def stats render :json => {:project => project, :stats => project.stats} end -=begin - def project_css - render text: 
Project.current.styles - end - - def project_js - render text: Project.current.custom_js - end -=end end diff --git a/app/models/bot_user.rb b/app/models/bot_user.rb new file mode 100644 index 00000000..abbf522c --- /dev/null +++ b/app/models/bot_user.rb @@ -0,0 +1,47 @@ +class BotUser < User + + AUTH_HEADER = 'HTTP_BOT_AUTH' + + # Create bot user with name + def self.create(name) + user = find_or_initialize_by name: name, role: 'bot' + token = '' + if ! user.persisted? + token = user.reset_token! + end + {user: user, token: token} + end + + # Immediately overwrite existing token with a new one + def reset_token! + token = Devise.friendly_token[0,20] + self.password = self.password_confirmation = token + self.email = "#{name}@scribe" + save! validate: false + token + end + + def self.pack_auth_header(user_id, token) + [user_id, token].join ":" + end + + def self.unpack_auth_header(str) + str.split ":" + end + + # Given hash of headers, return bot user if a header authenticates + def self.by_auth(headers) + # No header? Fail. + return nil if headers[AUTH_HEADER].blank? + + # Fail if header doesn't have two values: + parts = unpack_auth_header headers[AUTH_HEADER] + return nil if parts.size != 2 + + # Get user by name and auth using token: + user = find parts[0] + return nil if ! user.valid_password? parts[1] + + user + end +end diff --git a/app/models/classification.rb b/app/models/classification.rb index 4d59b819..93262db1 100644 --- a/app/models/classification.rb +++ b/app/models/classification.rb @@ -91,8 +91,8 @@ def increment_subject_classification_count end if self.task_key == "flag_bad_subject_task" - # If deleting user is creator (or a robot), immediately change status to bad - if subject.created_solely_by?(user) || subject.created_by_robot? + # If deleting user is creator, immediately change status to bad + if subject.created_solely_by?(user) subject.bad! 
else diff --git a/app/models/concerns/group_by_field.rb b/app/models/concerns/group_by_field.rb new file mode 100644 index 00000000..6ca51a41 --- /dev/null +++ b/app/models/concerns/group_by_field.rb @@ -0,0 +1,19 @@ +module GroupByField + extend ActiveSupport::Concern + + module ClassMethods + + # Returns hash mapping distinct values for given field to matching count: + def group_by_field(field, match={}) + puts "group #{collection.inspect} by #{field}" + agg = [] + agg << {"$match" => match } if match + agg << {"$group" => { "_id" => "$#{field.to_s}", count: {"$sum" => 1} }} + collection.aggregate(agg).inject({}) do |h, p| + h[p["_id"]] = p["count"] + h + end + end + + end +end diff --git a/app/models/final_data_export.rb b/app/models/final_data_export.rb new file mode 100644 index 00000000..f85c56a1 --- /dev/null +++ b/app/models/final_data_export.rb @@ -0,0 +1,11 @@ +class FinalDataExport + include Mongoid::Document + include Mongoid::Timestamps + + belongs_to :project + field :path, type: String + field :num_final_subject_sets, type: Integer + + scope :most_recent, -> { order(updated_at: -1) } + +end diff --git a/app/models/final_subject.rb b/app/models/final_subject.rb new file mode 100644 index 00000000..d7deb5a6 --- /dev/null +++ b/app/models/final_subject.rb @@ -0,0 +1,117 @@ +class FinalSubject + include Mongoid::Document + + field :type, type: String + field :location, type: Hash + field :status, type: String + field :width, type: Integer + field :height, type: Integer + field :meta_data, type: Hash + field :data, type: Hash + field :classifications_breakdown, type: Hash + field :flags, type: Hash + + belongs_to :subject + embedded_in :final_subject_set, inverse_of: :subjects + embeds_many :assertions, class_name: 'FinalSubjectAssertion' + + def fulltext_terms + assertions.select { |assertion| ! assertion.data.blank? && assertion.created_in_workflow != 'mark' }.map { |assertion| assertion.data.values }.select { |v| ! v.empty? 
} + end + + def fulltext_terms_by_field + assertions.select { |assertion| ! assertion.data.blank? && assertion.created_in_workflow != 'mark' }.inject({}) do |h, a| + puts "collected: #{a.name}" + h[a.name] = [] if h[a.name].nil? + h[a.name] = a.data.values.select { |v| ! v.empty? } + h + end + end + + def self.create_from_subject(subject) + inst = self.new subject: subject + [:type, :location, :status, :width, :height, :meta_data].each do |p| + inst.send("#{p}=", subject.send(p)) + end + + inst.build_assertions! + # inst.build_classifications_breakdown! + # inst.build_data! + + inst + end + + def build_data! + distinct = assertions.inject({}) do |h, assertion| + if assertion.created_in_workflow != 'mark' + h[assertion.task_key] = [] if h[assertion.task_key].nil? + data = assertion.data + data = data["values"].first if ! data["values"].nil? + data = data["value"] if data["value"] + stmt = {value: data, label: assertion.instructions['transcribe']} + has_data = ! data.blank? + has_data &= ! data.values.select { |v| ! v.blank? }.empty? if data.is_a? Hash + h[assertion.task_key] << stmt if has_data && ! h[assertion.task_key].include?(stmt) + end + h + end + self.data = distinct + end + + def build_assertions! + assertions.destroy_all + + flattened_subjects(subject.child_subjects).each do |s| + assertions << FinalSubjectAssertion.create_from_subject(s[:subject], s[:parents]) + end + + self + end + + def build_classifications_breakdown! 
+ all_classifications = [] + @all_subjects.each do |s| + all_classifications += s.classifications + end + self.classifications_breakdown = all_classifications.inject({}) { |h, c| h[c.task_key] ||= 0; h[c.task_key] += 1; h } + self.classifications_breakdown[:total] = subject.classifications.count + end + + def flags + { + complete: flagged_for_retirement, + bad: { + votes_in_favor: subject.flagged_bad_count || 0 + } + } + end + + def flagged_for_retirement + votes = subject.number_of_completion_assessments + h = { + votes_in_favor: subject.retire_count || 0, + total_votes: votes, + } + h[:percentage_in_favor] = subject.retire_count / votes.to_f if ! subject.retire_count.nil? && votes > 0 + h + end + + def flattened_subjects(subjects, parents = []) + @all_subjects ||= [] + @all_subjects += subjects + + ret = [] + subjects.each do |s| + next if ! s.parent_classifications.empty? && s.parent_classifications.limit(1).first.task_key == 'completion_assessment_task' + + if s.child_subjects.size > 0 + ret += flattened_subjects(s.child_subjects, parents + [s]) + + else + # ret << FinalSubjectAssertionSerializer.new(subject: s, parents: parents) + ret << {subject: s, parents: parents} if s.status != 'bad' + end + end + ret + end +end diff --git a/app/models/final_subject_assertion.rb b/app/models/final_subject_assertion.rb new file mode 100644 index 00000000..9fcb50af --- /dev/null +++ b/app/models/final_subject_assertion.rb @@ -0,0 +1,122 @@ +class FinalSubjectAssertion + include Mongoid::Document + + field :name, type: String + field :status, type: String + field :created_in_workflow, type: String + field :confidence, type: Float + field :data, type: Hash + field :versions, type: Array + field :region, type: Hash + field :task_key, type: String + field :instructions, type: Hash + + embedded_in :final_subject, inverse_of: :assertions + + def self.create_from_subject(subject, parents) + inst = new + + inst.name = subject.export_name + inst.status = status_for_subject(subject) 
+ inst.created_in_workflow = subject.parent_workflow.nil? ? nil : subject.parent_workflow.name + inst.confidence = confidence_for_subject(subject) + inst.data = data_for_subject(subject) + inst.versions = classifications_for_subject(subject) + inst.region = region_for_subject(subject) + inst.task_key = subject.parent_classifications.empty? ? nil : subject.parent_classifications.limit(1).first.task_key + inst.instructions = instructions_for_subject(subject, parents) + + inst + end + + def self.classifications_for_subject(subject) + # Hack to show all distinct classifications with counts for terminal subjects being transcribed: + # if object[:subject].parent_workflow.name == 'transcribe' + + annotations_with_confidence subject if ! subject.parent_workflow.nil? && subject.parent_workflow.name != 'mark' + end + + def self.instructions_for_subject(subject, parents) + ret = {} + + parents.each do |s| + next if s.parent_workflow.nil? + + if s.parent_workflow.name == 'mark' && subject.region[:label] + ret[s.parent_workflow.name] = subject.region[:label] + + else + ret[s.parent_workflow.name] = s.parent_workflow_task.instruction + end + end + ret[subject.parent_workflow.name] = subject.parent_workflow_task.instruction if ! subject.parent_workflow.nil? + ret + end + + def self.region_for_subject(subject) + region = subject.region + return nil if region.nil? + + # not important: + region.delete 'color' + + # Translate toolName to generic 'shape' name: + region[:shape] = case region[:toolName] + when 'rectangleTool','rowTool' then 'rectangle' + when 'pointTool' then 'point' + end + region.delete 'toolName' + + region + end + + def self.data_for_subject(subject) + data = nil + if ['complete','retired'].include? subject.status + data = subject.data + else + cl = annotations_with_confidence(subject).first + data = cl.nil? ? 
nil : cl[:data] + end + data = data['values'].first if data && data['values'] + + data + end + + def self.confidence_for_subject(subject) + if subject.status == 'complete' + 1 + elsif subject.status == 'retired' + 1 + else + annotations_with_confidence(subject).map { |a| a[:confidence] }.max + end + end + + def self.status_for_subject(subject) + return nil if subject.parent_workflow.nil? + + return 'complete' if subject.status == 'complete' + + if subject.parent_workflow.name == 'transcribe' + return 'awaiting_transcriptions' if subject.status == 'inactive' + return 'awaiting_votes' if subject.status == 'active' + + elsif subject.parent_workflow.name == 'verify' + return 'awaiting_votes' if subject.status == 'inactive' + end + + subject.status + end + + + def self.annotations_with_confidence(subject) + num_votes = [subject.parent_workflow.nil? ? 3 : subject.parent_workflow.generates_subjects_after, subject.parent_classifications.count].max + grouped = subject.parent_classifications.inject({}) { |h, c| h[c.annotation] ||= 0; h[c.annotation] += 1; h } + classifications_by_annotation = subject.parent_classifications.inject({}) { |h, c| h[c.annotation] ||= []; h[c.annotation] << {created: c.created_at, user_id: c.user_id, duration: c.finished_at.to_time - c.started_at.to_time, user_id: c.user_id.to_s }; h } + grouped = grouped.inject([]) { |a,(annotation,count)| a << {data: annotation, votes: count, confidence: count.to_f / num_votes, instances: classifications_by_annotation[annotation] }; a } + grouped = grouped.sort_by { |a| - a[:confidence] } + grouped + end + +end diff --git a/app/models/final_subject_set.rb b/app/models/final_subject_set.rb new file mode 100644 index 00000000..47c28b09 --- /dev/null +++ b/app/models/final_subject_set.rb @@ -0,0 +1,82 @@ +class FinalSubjectSet + include Mongoid::Document + include Mongoid::Timestamps + # include Mongoid::FullTextSearch + + belongs_to :project + belongs_to :subject_set + field :name, type: String + field :meta_data, 
type: Hash + + field :search_terms + field :search_terms_by_field + + index({"subjects.assertions.confidence" => 1}, {background: true}) + index({"subjects.assertions.task_key" => 1}, {background: true}) + index({"subject_set_id" => 1}, {background: true}) + index({"project_id" => 1}, {background: true}) + + index({"search_terms" => "text"}) + # can't create two... + # index({"search_terms_by_field" => "text"}) + + [:total, :complete, :awaiting_votes, :in_progress, :awaiting_transcriptions].each do |field| + index({"subjects.assertions_breakdown.all_workflows.#{field}" => 1}, {background: true}) + end + + embeds_many :subjects, class_name: 'FinalSubject' + + # fulltext_search_in :fulltext_terms + + def build_search_terms + update_attributes({ + search_terms: compute_fulltext_terms, + search_terms_by_field: compute_fulltext_terms_by_field + }) + end + + def compute_fulltext_terms + compute_fulltext_terms_by_field.values.flatten.uniq + end + + def compute_fulltext_terms_by_field + subjects.map { |subject| subject.fulltext_terms_by_field }.inject({}) do |h, terms| + terms.each do |(k,vs)| + h[k] = [] if h[k].nil? + h[k] += vs + end + h + end + end + + def self.assert_for_set(set, rebuild=false) + # If final_subject_set record was built after most recent generated subject, consider skipping + if ! rebuild && (final_ss = find_by(subject_set:set)) + subjs_updated = set.subjects.max(:updated_at) + return if final_ss.updated_at > subjs_updated + end + inst = find_or_create_by subject_set: set + inst.project = set.project + inst.meta_data = set.meta_data + inst.update_subjects + inst.build_search_terms + inst.save! 
+ end + + def update_subjects + + subjects.destroy_all + + subject_set.subjects.root.each do |subject| + subjects << FinalSubject.create_from_subject(subject) + end + end + + def self.rebuild_indexes(for_project) + collection.indexes.drop + for_project.export_names.each do |(key,name)| + index({"search_terms_by_field.#{key}" => 1}, {background: true}) + end + create_indexes + end +end diff --git a/app/models/project.rb b/app/models/project.rb index 6bfc3e58..410a2a82 100644 --- a/app/models/project.rb +++ b/app/models/project.rb @@ -18,10 +18,10 @@ class Project field :pages, type: Array, default: [] field :menus, type: Hash, default: {} field :partials, type: Hash, default: {} - field :logo, type: String - field :background, type: String - field :favicon, type: String - field :forum, type: Hash + field :logo, type: String, default: nil + field :background, type: String, default: nil + field :favicon, type: String, default: nil + field :forum, type: Hash, default: nil field :feedback_form_url, type: String field :discuss_url, type: String field :blog_url, type: String @@ -29,12 +29,13 @@ class Project field :styles, type: String field :custom_js, type: String field :admin_email, type: String - field :team_emails, type: Array + field :team_emails, type: Array, default: [] field :metadata_search, type: Hash field :tutorial, type: Hash field :terms_map, type: Hash, default: {} # Hash mapping internal terms to project appropriate terms (e.g. 'group'=>'ship') field :status, type: String, default: 'inactive' - field :analytics, type: Hash + field :analytics, type: Hash, default: nil + field :downloadable_data, type: Boolean # 10.27.15 until we can sort out a better time to call this method, lets comment it out. 
include CachedStats @@ -44,6 +45,7 @@ class Project has_many :subject_sets has_many :workflows, dependent: :destroy, order: "order ASC" has_many :subjects + has_many :final_subject_sets scope :most_recent, -> { order(updated_at: -1) } scope :active, -> { where(status: 'active') } @@ -63,6 +65,21 @@ def self.current active.first end + # get Distinct export_names from all workflow_tasks + def export_names + workflows.inject([]) do |a, w| + a += w.tasks.map { |t| t.export_name } + + end.select do |n| + ! n.nil? + + end.inject({}) do |h, name| + key = name.gsub(' ', '-').gsub(/[^A-Za-z0-9-]/, '') + h[key] = name + h + end + end + def calc_stats # amount of days to calculate statistics for range_in_days = 7 diff --git a/app/models/subject.rb b/app/models/subject.rb index b3cd0759..1016da8c 100644 --- a/app/models/subject.rb +++ b/app/models/subject.rb @@ -74,10 +74,6 @@ class Subject index({"type" => 1, "subject_set_id" => 1}, {background: true}) # Index for fetching child subjects for a parent subject, optionally filtering by region NOT NULL index({parent_subject_id: 1, status: 1, region: 1}) - - def created_by_robot? - created_solely_by? User.robot - end def created_solely_by?(user) created_by = created_by_user_id == user.id.to_s @@ -126,6 +122,13 @@ def parent_workflow_task end end + def export_name + return nil if parent_workflow.nil? + + transcribe_subject = parent_workflow.name == 'transcribe' ? self : parent_subject + transcribe_subject.parent_workflow_task.export_name if transcribe_subject && transcribe_subject.parent_workflow_task + end + # find all the classifications for subject where task_key == compleletion_assesment_task # calculate the percetage vote for retirement (pvr) # if pvr is equal or greater than retire_limit, set self.status == retired. @@ -187,7 +190,7 @@ def calculate_most_popular_parent_classification end def parent_workflow - parent_classifications.limit(1).first.workflow + parent_classifications.limit(1).first.workflow if ! 
parent_classifications.empty? end diff --git a/app/models/user.rb b/app/models/user.rb index 4d9a026c..11baa88d 100644 --- a/app/models/user.rb +++ b/app/models/user.rb @@ -36,7 +36,7 @@ class User field :profile_url, :type => String # URI of user profile, if any field :status, :type => String, :default => 'active' - field :role, :type => String, :default => 'user' # user, admin, team, robot + field :role, :type => String, :default => 'user' # user, admin, team, bot field :guest, :type => Boolean, :default => false field :tutorial_complete, :type => Boolean, :default => false @@ -221,17 +221,4 @@ def self.group_by_hour(match={}) h end end - - def self.robot - @robot_user ||= ( - find_by(role: 'robot') - ) - end - - def self.bot_user_by_auth(auth) - user = User.find_or_create_by name: 'Robot', role: 'robot' - user.save! validate: false - user - end - end diff --git a/app/models/workflow_task.rb b/app/models/workflow_task.rb index 81a743d3..7d10d6ba 100644 --- a/app/models/workflow_task.rb +++ b/app/models/workflow_task.rb @@ -10,6 +10,7 @@ class WorkflowTask field :next_task, type: String field :help, type: Hash field :examples, type: Array + field :export_name, type: String embedded_in :workflow diff --git a/app/serializers/final_data_serializer.rb b/app/serializers/final_data_serializer.rb deleted file mode 100644 index e9256d0b..00000000 --- a/app/serializers/final_data_serializer.rb +++ /dev/null @@ -1,19 +0,0 @@ -class FinalDataSerializer < ActiveModel::MongoidSerializer - attributes :data, :links, :meta - - root false - - def data - options = serialization_options.merge({root: false}) - object.map { |s| FinalDataSubjectSetSerializer.new(s, root: false) } - end - - def meta - { - } - end - - def links - {} - end -end diff --git a/app/serializers/final_data_subject_serializer.rb b/app/serializers/final_data_subject_serializer.rb deleted file mode 100644 index 80aad4d7..00000000 --- a/app/serializers/final_data_subject_serializer.rb +++ /dev/null @@ -1,170 +0,0 
@@ -class FinalDataSubjectSerializer < ActiveModel::MongoidSerializer - - attributes :id, :type, :location, :region, :width, :height, :meta_data - attributes :data # , :task - attributes :status - # attributes :classification_count - attributes :generated_in_workflow - # attributes :child_subjects - attributes :transcription_classifications - - attributes :assertions_breakdown - attributes :classifications_breakdown - attributes :assertions - - # attributes :flagged_bad - # ttributes :flagged_for_retirement - attributes :flags - - def attributes - data = super - - # For brevity, remove attributes that are redundant or always null: - - if data[:type] == 'root' - # Root subjects don't have data: - data.delete :data - data.delete :generated_in_workflow - - else - # All of these are inherited from parent subject, so remove: - data.delete :location - data.delete :width - data.delete :height - data.delete :meta_data - end - - if data[:generated_in_workflow] == 'mark' - # Mark subjects have roughly same info in :data and :region so keep :region - data.delete :data if data[:region] - else - # .. For all other child subjects, delete :region since it's avail in parent - data.delete :region - end - data.delete :transcription_classifications if data[:transcription_classifications].empty? - # data.delete :child_subjects if data[:child_subjects].empty? 
- - data - end - - def assertions - @assertions ||= flattened_subjects(object.child_subjects) - @assertions - end - - def classifications_breakdown - all_classifications = [] - @all_subjects.each do |s| - all_classifications += s.classifications - end - ret = all_classifications.inject({}) { |h, c| h[c.task_key] ||= 0; h[c.task_key] += 1; h } - ret[:total] = object.classifications.count - ret - end - - def assertions_breakdown - assertions.inject({}) do |h, a| - h[:all_workflows] ||= {} - h[:all_workflows][:total] ||= 0 - h[:all_workflows][:total] += 1 - h[:all_workflows][a.status] ||= 0 - h[:all_workflows][a.status] += 1 - - h[a.created_in_workflow] ||= {} - - h[a.created_in_workflow][:total] ||= 0 - h[a.created_in_workflow][:total] += 1 - - h[a.created_in_workflow][a.status] ||= 0 - h[a.created_in_workflow][a.status] += 1 - - h - end - end - - def flattened_subjects(subjects, parents = []) - @all_subjects ||= [] - @all_subjects += subjects - - ret = [] - subjects.each do |s| - next if s.parent_classifications.limit(1).first.task_key == 'completion_assessment_task' - - if s.child_subjects.size > 0 - ret += flattened_subjects(s.child_subjects, parents + [s]) - - else - ret << FinalSubjectAssertionSerializer.new(subject: s, parents: parents) - end - end - ret - end - - def flags - { - complete: flagged_for_retirement, - bad: { - votes_in_favor: object.flagged_bad_count || 0 - } - } - end - - def flagged_for_retirement - votes = object.number_of_completion_assessments - h = { - votes_in_favor: object.retire_count || 0, - total_votes: votes, - } - h[:percentage_in_favor] = object.retire_count / votes.to_f if ! object.retire_count.nil? && votes > 0 - h - end - - def status - object.status - end - - def generated_in_workflow - return nil if object.parent_subject.nil? 
- puts "parent subj: #{object}" - object.parent_subject.classifications.first.workflow.name - end - - def child_subjects - object.child_subjects.map { |s| FinalDataSubjectSerializer.new(s, root: false) } - end - - def task - return nil if object.parent_workflow_task.nil? - - task = object.parent_workflow_task - { - instruction: task.instruction, - help: task.help, - tool: task.tool, - tool_config: task.tool_config - } - end - - def classification_count - object.classifications.count - end - - def id - object._id.to_s - end - - def include_data? - ! object.data.nil? - end - - def include_task? - ! object.parent_workflow_task.nil? - end - - def transcription_classifications - transcribe_workflow_id = Workflow.where(name:"transcribe").to_a[0]._id - transcription_classifications = object.classifications.where( {workflow_id: transcribe_workflow_id} ).to_a - object.classifications.where( {workflow_id: transcribe_workflow_id} ).map{ |c| FinalClassificationSerializer.new(c, root: false) } - end - -end diff --git a/app/serializers/final_data_subject_set_serializer.rb b/app/serializers/final_data_subject_set_serializer.rb deleted file mode 100644 index 183ac974..00000000 --- a/app/serializers/final_data_subject_set_serializer.rb +++ /dev/null @@ -1,17 +0,0 @@ -class FinalDataSubjectSetSerializer < ActiveModel::MongoidSerializer - - attributes :id - attributes :name - attributes :meta_data - # attributes :classification_count - attributes :subjects - - def subjects - object.subjects.root.map { |s| FinalDataSubjectSerializer.new(s, root: false) } - end - - def id - object._id.to_s - end - -end diff --git a/app/serializers/final_subject_assertion_serializer.rb b/app/serializers/final_subject_assertion_serializer.rb new file mode 100644 index 00000000..0e223109 --- /dev/null +++ b/app/serializers/final_subject_assertion_serializer.rb @@ -0,0 +1,19 @@ +class FinalSubjectAssertionSerializer < ActiveModel::MongoidSerializer + + attributes :id, :status + attributes :name + 
attributes :created_in_workflow + attributes :confidence + attributes :data + attributes :versions + attributes :region + attributes :task_key + attributes :instructions + + root false + + def id + object.id.to_s + end + +end diff --git a/app/serializers/final_subject_serializer.rb b/app/serializers/final_subject_serializer.rb new file mode 100644 index 00000000..95f13ca4 --- /dev/null +++ b/app/serializers/final_subject_serializer.rb @@ -0,0 +1,11 @@ +class FinalSubjectSerializer < ActiveModel::MongoidSerializer + + attributes :id, :type, :location, :status, :width, :height, :meta_data + has_many :assertions + + # scope :by_keyword, -> (keyword) { where(: keyword) } + + def id + object.id.to_s + end +end diff --git a/app/serializers/final_subject_set_serializer.rb b/app/serializers/final_subject_set_serializer.rb new file mode 100644 index 00000000..aeca277c --- /dev/null +++ b/app/serializers/final_subject_set_serializer.rb @@ -0,0 +1,14 @@ +class FinalSubjectSetSerializer < ActiveModel::MongoidSerializer + + attributes :id, :meta_data, :type, :search_terms_by_field + + has_many :subjects + + def id + object.id.to_s + end + + def type + 'final_subject_set' + end +end diff --git a/app/serializers/generic_result_serializer.rb b/app/serializers/generic_result_serializer.rb new file mode 100644 index 00000000..7990790f --- /dev/null +++ b/app/serializers/generic_result_serializer.rb @@ -0,0 +1,42 @@ +# Generic serializer for arrays of objects of arbitrary types +# Produces JSONAPI style results with pagination meta +class GenericResultSerializer < ActiveModel::MongoidSerializer + attributes :data, :links, :meta + + root false + + # This serializes both single objects and arrays of objects, so data should output either a hash or an array respectively: + def data + options = serialization_options.merge({root: false, scope: scope}) + + # Array of results? + if object.respond_to? :each + return [] if object.empty? 
+ + # Determine what serializer to use based on class of first item: + klass = object.first.class.to_s + serializer_class = eval("#{klass}Serializer") + object.map { |s| serializer_class.new(s, options) } + + else + # Determine what serializer to use based on class of first item: + klass = object.class.to_s + serializer_class = eval("#{klass}Serializer") + serializer_class.new(object, options) + end + end + + def meta + { + current_page: object.current_page, + next_page: object.next_page, + prev_page: object.prev_page, + total_pages: object.total_pages, + total: object.count + } if object.respond_to? :current_page + end + + def links + serialization_options[:links] + end +end diff --git a/app/serializers/project_serializer.rb b/app/serializers/project_serializer.rb index b12ec002..6b129840 100644 --- a/app/serializers/project_serializer.rb +++ b/app/serializers/project_serializer.rb @@ -1,20 +1,16 @@ class ProjectSerializer < ActiveModel::MongoidSerializer - attributes :id, :title, :short_title, :summary, :home_page_content, :organizations , :team, :pages, :menus, :partials, :logo, :background, :workflows, :forum, :tutorial, :feedback_form_url, :metadata_search, :terms_map, :blog_url, :discuss_url, :privacy_policy + attributes :id, :title, :short_title, :summary, :home_page_content, :organizations , :team, :pages, :menus, :partials, :logo, :background, :workflows, :forum, :tutorial, :feedback_form_url, :metadata_search, :terms_map, :blog_url, :discuss_url, :privacy_policy, :downloadable_data + attributes :classification_count has_many :workflows # delegate :current_or_guest_user, to: :scope - def id - object._id.to_s + def classification_count + Classification.count end -=begin - def current_user_tutorial - user = scope.nil? ? 
nil : current_or_guest_user - unless user == nil - user.tutorial_complete - end + def id + object._id.to_s end -=end end diff --git a/app/views/admin/data/index.html.erb b/app/views/admin/data/index.html.erb index 119376c1..41b03d76 100644 --- a/app/views/admin/data/index.html.erb +++ b/app/views/admin/data/index.html.erb @@ -1,36 +1,32 @@

Data

-
- -

<%= @num_complete %> complete subject(s) ready for export (<%= @num_non_root %> pending).

- -

Format

-<% { "JSON" => 'json', - "CSV" => 'csv' - }.each do |(label, key)| - input_id = "download_format_#{key}" -%> -
- /> - -
-<% end %> +<% if @export.nil? %> +

No data exports have yet been built.

+

Please run `rake project:export_final_data` first

-

Completeness

-<% { "Complete (Only crowd-verified subjects)" => 'complete', - "All (All data in a massive json struc)" => 'all' - }.each do |(label, key)| - input_id = "download_status_#{key}" -%> -
- /> - -
-<% end %> +<% else %> +

Most recent data export: + +

+
Items
+
<%= @export.num_final_subject_sets %>
+ +
Built
+
<%= @export.updated_at.strftime('%B %-d, %Y') %> +
-

Download

+ Download - +

Make Public?

-
+

Should the public be able to download the latest from /data/latest and subscribe to the data updates ATOM feed?

+ +
+ <%= check_box 'project', "downloadable_data" %> + <%= label 'project','downloadable_data', 'Allow the public to download data' %> + +

+
+ +<% end %> diff --git a/app/views/final_data_exports/index.atom.builder b/app/views/final_data_exports/index.atom.builder new file mode 100644 index 00000000..2b22ca15 --- /dev/null +++ b/app/views/final_data_exports/index.atom.builder @@ -0,0 +1,11 @@ +atom_feed do |feed| + + feed.title("#{Project.current.title} Data Exports") + feed.updated(@exports[0].created_at) if @exports.length > 0 + + @exports.each do |export| + feed.entry(export) do |entry| + entry.title("#{export.updated_at.strftime('%c')}: #{export.num_final_subject_sets} subjects") + end + end +end diff --git a/config/routes.rb b/config/routes.rb index b6b9f778..bb9148bc 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -35,10 +35,16 @@ resources :groups, only: [:show, :index], :defaults => { :format => 'json' } + # Final data: + resources :final_subject_sets, only: [:show, :index], :defaults => { :format => 'json' } + get '/data/latest', to: 'final_data_exports#latest' + resources :final_data_exports, only: [:show, :index], path: "/data" + namespace :admin do resources :subject_sets, :subjects, :classifications, :users get 'dashboard' => 'dashboard#index' get 'data' => 'data#index' + post 'data' => 'data#index' get 'data/download' => 'data#download' get 'signin' => 'auth#signin' post 'stats/recalculate' => 'dashboard#recalculate_stats' diff --git a/lib/tasks/bot.rake b/lib/tasks/bot.rake new file mode 100644 index 00000000..14870e3c --- /dev/null +++ b/lib/tasks/bot.rake @@ -0,0 +1,48 @@ +require 'fileutils' + +namespace :bot do + + desc "Create Bot with name, printing out token to use in HTTP_BOT_AUTH" + task :create, [:name] => :environment do |task, args| + args.with_defaults name: 'ScribeBot' + + ret = BotUser.create args[:name] + + if ! ret[:token].blank? + puts "Created #{ret[:user].name}. 
Use HTTP header to authenticate:" + puts " #{BotUser::AUTH_HEADER}=#{BotUser::pack_auth_header(ret[:user].id, ret[:token])}" + else + puts "#{ret[:user].name} already exists, so token can not be read but may be reset. Use bot:reset to reset token." + end + end + + desc "Reset Bot token with name, printing out token to use in HTTP_ROBOT_AUTH" + task :reset, [:name] => :environment do |task, args| + args.with_defaults name: 'ScribeBot' + + user = BotUser.find_by name: args[:name] + token = user.reset_token! + + if token + puts "Reset #{user.name}. Use HTTP header to authenticate:" + puts " #{BotUser::AUTH_HEADER}=#{BotUser::pack_auth_header(user.id, token)}" + end + end + + desc "Delete Bot by name" + task :delete, [:name] => :environment do |task, args| + if args[:name].blank? + puts "No name given. Aborting." + exit + end + + user = BotUser.find_by name: args[:name] + if user + user.destroy + puts "Removed #{user.name}" + else + puts "Bot user #{args[:name]} could not be found" + end + end + +end diff --git a/lib/tasks/project.rake b/lib/tasks/project.rake index 9de3e0dd..6563b957 100644 --- a/lib/tasks/project.rake +++ b/lib/tasks/project.rake @@ -126,25 +126,10 @@ namespace :project do # load project_file_path project = Project.find_or_create_by key: project_key - # Establish some defaults so that if they're not set in the project hash, we overwrite the old value with the null default - project_defaults = { - background: nil, - logo: nil, - favicon: nil, - terms_map: {}, - team_emails: [], - team: [], - organizations: [], - analytics: nil, - forum: nil, - menus: {}, - partials: {} - } # Set all valid fields from hash: - project_hash = project_hash.inject(project_defaults) { |h, (k,v)| h[k] = v if Project.fields.keys.include?(k.to_s); h } + project_hash = project_hash.inject({}) { |h, (k,v)| h[k] = v if Project.fields.keys.include?(k.to_s); h } project.update project_hash - puts "Created project: #{project.title}" # Load pages from content/*: content_path = 
Rails.root.join('project', project_key, 'content') @@ -400,69 +385,93 @@ namespace :project do end - task :export, [:project_key, :rebuild] => :environment do |task, args| + task :build_final_data, [:project_key, :rebuild] => :environment do |task, args| args.with_defaults rebuild: true rebuild = args[:rebuild] != 'false' - project = Project.find_by key: args[:project_key] - - puts "Rebuild? #{rebuild}" - - export_base = "tmp/export/#{project.key}" - Dir.mkdir(export_base) unless File.exists?(export_base) + project = project_by_key args[:project_key] start = Time.now count = project.subject_sets.count limit = 100 built = 0 + + # Rebuild indexes + FinalSubjectSet.rebuild_indexes Project.current + (0..count).step(limit).each do |offset| sets = project.subject_sets.offset(offset).limit(limit).each_with_index do |set, i| - path = "#{export_base}/#{set.id}.json" - next if File.exist?(path) && ! rebuild - - content = nil - begin - content = FinalDataSubjectSetSerializer.new(set).to_json - rescue - puts "Error building #{set.id}" - end - if ! content.nil? - File.open path, "w" do |f| - f << content - end - built += 1 - end + final_set = FinalSubjectSet.assert_for_set set, rebuild - # puts "Wrote #{i+1} of #{count}: #{content.size}b to #{path}" ellapsed = Time.now - start per_set = ellapsed / built remaining = per_set * (count - (offset + i+1)) / 60 / 60 complete = (offset + i+1).to_f / count * 100 # puts "Est time remaining: #{ellapsed} (#{per_set}) #{remaining}h" $stderr.print "\r#{'%.8f' % complete}% complete. #{'%.1f' % remaining}h remaining. Built #{offset +i+1} of #{count}" - end end - `zip -r public/exports.zip tmp/export` - puts "Finished building exports. 
Download at: /exports.zip" end - task :import_assertions, [:project_key] => :environment do |task, args| - project_key = args[:project_key] + task :export_final_data, [:project_key] => :environment do |task, args| + project = project_by_key args[:project_key] + + # Make sure user has run build_final_data first: + if project.final_subject_sets.empty? + puts "No FinalSubjectSets found. Invoking project:build_final_data" + Rake::Task['project:build_final_data'].invoke(args[:project_key]) + puts "----------------" + end + + export_base = "tmp/export/#{project.key}" + + # Remove previous: + `rm -rf #{export_base}` if File.exists?(export_base) + + Dir.mkdir(export_base) unless File.exists?(export_base) + + start = Time.now + built = 0 + limit = 10 # 100 + count = FinalSubjectSet.count + count = 9 - FinalSubjectSet.destroy_all + (0..count).step(limit).each do |offset| + project.final_subject_sets.offset(offset).limit(limit).each_with_index do |set, i| + path = "#{export_base}/#{set.subject_set_id}.json" + content = FinalSubjectSetSerializer.new(set, root:false).to_json + puts "content: #{content}" + File.open path, "w" do |f| + f << content + end + built += 1 - Dir.glob("tmp/export/#{project_key}/*.json").each do |file| - h = JSON.parse File.read(file) - h = h['final_data_subject_set'] - set = FinalSubjectSet.find_or_initialize_by id: h['id'] - set.update_attributes h - puts "Saved #{h['id']}" + # puts "Wrote #{i+1} of #{count}: #{content.size}b to #{path}" + ellapsed = Time.now - start + per_set = ellapsed / built + remaining = per_set * (count - (offset + i+1)) / 60 / 60 + complete = (offset + i+1).to_f / count * 100 + # puts "Est time remaining: #{ellapsed} (#{per_set}) #{remaining}h" + $stderr.print "\r#{'%.8f' % complete}% complete. #{'%.1f' % remaining}h remaining. 
Built #{offset +i+1} of #{count}" + end end - end + # Generate timestamped filename with random suffix so it can't be guessed: + rand_suffix = (('a'..'z').to_a + (0..9).to_a).shuffle[0,16].join + max_updated = project.final_subject_sets.max(:updated_at) + filename = "scribe-#{project.key}-#{max_updated.strftime("%F")}-#{rand_suffix}" + + path = "/#{filename}.zip" + puts "Zipping #{path}" + `zip --junk-paths -r public#{path} #{export_base}` + puts "Finished building exports. Download at: /#{filename}.zip" + + FinalDataExport.create path: path, num_final_subject_sets: count + + puts "Done." + end def translate_pick_one_tool_config(task_hash) @@ -549,5 +558,10 @@ namespace :project do end end + def project_by_key(key, default=Project.current) + p = Project.find_by key: key + p = default if ! p + p + end end diff --git a/project/emigrant/bot-example.rb b/project/emigrant/bot-example.rb new file mode 100644 index 00000000..dcdd1451 --- /dev/null +++ b/project/emigrant/bot-example.rb @@ -0,0 +1,151 @@ + +require 'open-uri' +require 'json' +require 'cgi' + +# Useful extension to Hash to create query strings: +class Hash + def to_params + params = '' + stack = [] + + each do |k, v| + if v.is_a?(Hash) + stack << [k,v] + elsif v.is_a?(Array) + stack << [k,Hash.from_array(v)] + else + params << "#{k}=#{v}&" + end + end + + stack.each do |parent, hash| + hash.each do |k, v| + if v.is_a?(Hash) + stack << ["#{parent}[#{k}]", v] + else + params << "#{parent}[#{k}]=#{v}&" + end + end + end + + params.chop! 
+ params + end + + def self.from_array(array = []) + h = Hash.new + array.size.times do |t| + h[t] = array[t] + end + h + end + +end + +# Example Scribe bot class: +class ScribeBot + + def initialize(scribe_endpoint) + @classifications_endpoint = scribe_endpoint + end + + # Post classification for a known subject_id + def classify_subject_by_id(subject_id, workflow_name, task_key, data) + params = { + workflow: { + name: workflow_name + }, + classifications: { + annotation: data, + task_key: task_key, + subject_id: subject_id + } + } + + submit_classification params + end + + # Post classification for subject specified by URL: + def classify_subject_by_url(subject_url, workflow_name, task_key, data) + params = { + subject: { + location: { + standard: CGI::escape(subject_url) + } + }, + workflow: { + name: workflow_name + }, + classifications: { + annotation: data, + task_key: task_key + } + } + + submit_classification params + end + + # Posts params as-is to classifications endpoint: + def submit_classification(params) + + require 'uri' + require "net/http" + + uri = URI(@classifications_endpoint) + + req = Net::HTTP::Post.new(uri.path, {'BOT_AUTH' => ENV['SCRIBE_BOT_TOKEN']}) + req.body = params.to_params + http = Net::HTTP.new(uri.host, uri.port) + + response = http.start {|http| http.request(req) } + + begin + JSON.parse response.body + rescue + nil + end + end +end + +# This simple script demonstrates use of the Scribe Classifications endpoint to generate data +# +# Useage: +# ruby bot-example.rb [-scribe-endpoint="http://localhost:3000"] +# + +options = Hash[ ARGV.join(' ').scan(/--?([^=\s]+)(?:=(\S+))?/) ] +options["scribe-endpoint"] = "http://localhost:3000/classifications" if ! options["scribe-endpoint"] + +args = ARGV.select { |a| ! 
a.match /^-/ } + +bot = ScribeBot.new options["scribe-endpoint"] + +# The following generates generates two classfiications: One mark classification +# and one transcription classification (applied to the subject generated by the +# mark classification). + +# Specify subject by standard URL (since this is a bot classification, it will be created automatically if it doesn't exist) +image_uri = "https://s3.amazonaws.com/scribe.nypl.org/emigrant-s4/full/619aed10-23fd-0133-16de-58d385a7bbd0.right-bottom.jpg" + +# Must manually specify workflow name ('mark'), and task_key ('mark_primary') +classification = bot.classify_subject_by_url( image_uri, "mark", "mark_primary", { + x: 100, + y: 200, + width: 300, + height: 200, + subToolIndex: 0 # Must specify subToolIndex (integer index into the tools array configured for workflow task) +})['classification'] + +# Response should contain a classification with a nested child_subject: +puts "Created classification: #{classification.to_json}" + +# Assuming above was successful, use the returned, generated subject_id to create next classification: +mark_id = classification['child_subject']['id'] +# Subjects generated in Mark tend to have `type`s that correspond to Transcribe task keys: +transcribe_task_key = classification['child_subject']['type'] +# Create transcription classification: +classification = bot.classify_subject_by_id( mark_id, "transcribe", transcribe_task_key, { value: 'foo' }) + +# Response should contain a classification with a nested verify subject (or orphaned subject if there is no Verify workflow) +puts "Created transcription classification: #{classification.to_json}" diff --git a/project/emigrant/workflows/transcribe.json b/project/emigrant/workflows/transcribe.json index 2bb1f9f0..ffdc47d1 100644 --- a/project/emigrant/workflows/transcribe.json +++ b/project/emigrant/workflows/transcribe.json @@ -19,7 +19,8 @@ "help": { "file": "t_record_date" }, - "generates_subject_type": "em_transcribed_date" + 
"generates_subject_type": "em_transcribed_date", + "export_name": "Record Date" }, "em_record_number": { @@ -30,7 +31,8 @@ "help": { "file": "t_record_number" }, - "generates_subject_type": "em_transcribed_record_number" + "generates_subject_type": "em_transcribed_record_number", + "export_name": "Record Number" }, "em_record_mortgager": { @@ -40,7 +42,8 @@ "generates_subject_type": "em_transcribed_mortgager", "help": { "file": "t_record_mortgager" - } + }, + "export_name": "Mortgager" }, "em_record_street_address": { @@ -51,7 +54,8 @@ "generates_subject_type": "em_transcribed_address", "help": { "file": "t_record_street_address" - } + }, + "export_name": "Street Address" }, "em_record_amount_loaned": { @@ -62,7 +66,8 @@ "generates_subject_type": "em_transcribed_amount_loaned", "help": { "file": "t_record_amount_loaned" - } + }, + "export_name": "Amount Loaned" }, "em_record_valuation": { @@ -74,7 +79,8 @@ "file": "t_record_valuation" }, "generates_subject_type": "em_transcribed_valuation_date", - "next_task": "em_record_valuation_ground_building" + "next_task": "em_record_valuation_ground_building", + "export_name": "Valuation Date" }, "em_record_valuation_ground_building": { @@ -101,7 +107,8 @@ }, "generates_subject_type": "em_transcribed_valuation_itemized", "instruction": "Sometimes valuations include itemized dollar values for \"ground\" and \"building\". Enter these amounts if you can find them. 
In the next screen, you'll enter the total valuation.", - "next_task": "em_record_valuation_total" + "next_task": "em_record_valuation_total", + "export_name": "Land & Building Value" }, "em_record_valuation_total": { @@ -113,7 +120,8 @@ "file": "t_record_valuation" }, "generates_subject_type": "em_transcribed_valuation_total", - "next_task": null + "next_task": null, + "export_name": "Total Value" }, "em_record_survey": { @@ -138,7 +146,8 @@ }, "generates_subject_type": "em_transcribed_survey", "instruction": "Enter, as they appear, any land and building dimensions that were recorded. In the next screen, you'll enter the descriptive information.", - "next_task": "em_record_survey_stories_materials" + "next_task": "em_record_survey_stories_materials", + "export_name": "Land & Building Dimensions" }, "em_record_survey_stories_materials": { @@ -163,7 +172,8 @@ }, "generates_subject_type": "em_transcribed_stories_materials", "instruction": "Enter, as they appear, the number of stories and the building materials.", - "next_task": "em_record_survey_additional_info" + "next_task": "em_record_survey_additional_info", + "export_name": "Stories & Materials" }, "em_record_survey_additional_info": { @@ -175,7 +185,8 @@ "file": "t_record_additional_info" }, "generates_subject_type": null, - "next_task": null + "next_task": null, + "export_name": "Additional Info" } } } From 76dd468d4664aa95368939c3a28c45d4bce652eb Mon Sep 17 00:00:00 2001 From: Paul Beaudoin Date: Tue, 2 Feb 2016 16:12:52 -0500 Subject: [PATCH 03/23] missing final_data_controller base controller class; build_and_export_final_data task; minor exports improvements --- app/controllers/final_data_controller.rb | 9 ++++++++ app/models/final_subject.rb | 6 +++--- app/models/final_subject_set.rb | 2 +- app/views/admin/data/index.html.erb | 2 +- lib/tasks/bot.rake | 2 -- lib/tasks/project.rake | 27 +++++++++++++++--------- 6 files changed, 31 insertions(+), 17 deletions(-) create mode 100644 
app/controllers/final_data_controller.rb diff --git a/app/controllers/final_data_controller.rb b/app/controllers/final_data_controller.rb new file mode 100644 index 00000000..8cc1b4b3 --- /dev/null +++ b/app/controllers/final_data_controller.rb @@ -0,0 +1,9 @@ +class FinalDataController < ApplicationController + before_filter :ensure_data_downloadable + + def ensure_data_downloadable + project = Project.current + return render text: 'Data is not yet publicly available for this Scribe project.', status: 404 if ! project.downloadable_data + end + +end diff --git a/app/models/final_subject.rb b/app/models/final_subject.rb index d7deb5a6..d134ded0 100644 --- a/app/models/final_subject.rb +++ b/app/models/final_subject.rb @@ -21,9 +21,9 @@ def fulltext_terms def fulltext_terms_by_field assertions.select { |assertion| ! assertion.data.blank? && assertion.created_in_workflow != 'mark' }.inject({}) do |h, a| - puts "collected: #{a.name}" - h[a.name] = [] if h[a.name].nil? - h[a.name] = a.data.values.select { |v| ! v.empty? } + field_name = a.name.blank? ? '_' : a.name + h[field_name] = [] if h[field_name].nil? + h[field_name] += a.data.values.select { |v| ! v.empty? } h end end diff --git a/app/models/final_subject_set.rb b/app/models/final_subject_set.rb index 47c28b09..74941dce 100644 --- a/app/models/final_subject_set.rb +++ b/app/models/final_subject_set.rb @@ -73,7 +73,7 @@ def update_subjects end def self.rebuild_indexes(for_project) - collection.indexes.drop + collection.indexes.drop unless self.count == 0 # If no records yet saved, moped will error when dropping indexes for_project.export_names.each do |(key,name)| index({"search_terms_by_field.#{key}" => 1}, {background: true}) end diff --git a/app/views/admin/data/index.html.erb b/app/views/admin/data/index.html.erb index 41b03d76..bddc8654 100644 --- a/app/views/admin/data/index.html.erb +++ b/app/views/admin/data/index.html.erb @@ -2,7 +2,7 @@ <% if @export.nil? %>

No data exports have yet been built.

-

Please run `rake project:export_final_data` first

+

Please run `rake project:build_and_export_final_data` first

<% else %>

Most recent data export: diff --git a/lib/tasks/bot.rake b/lib/tasks/bot.rake index 14870e3c..e52cb053 100644 --- a/lib/tasks/bot.rake +++ b/lib/tasks/bot.rake @@ -1,5 +1,3 @@ -require 'fileutils' - namespace :bot do desc "Create Bot with name, printing out token to use in HTTP_BOT_AUTH" diff --git a/lib/tasks/project.rake b/lib/tasks/project.rake index 6563b957..d6f30e9f 100644 --- a/lib/tasks/project.rake +++ b/lib/tasks/project.rake @@ -385,6 +385,7 @@ namespace :project do end + desc "Build final_subject* data in database" task :build_final_data, [:project_key, :rebuild] => :environment do |task, args| args.with_defaults rebuild: true rebuild = args[:rebuild] != 'false' @@ -396,6 +397,10 @@ namespace :project do limit = 100 built = 0 + # Do any of this project's workflow tasks have configured export_names? If not, warn: + has_export_names = ! project.workflows.map { |w| w.tasks }.flatten.select { |t| ! t.export_name.blank? }.empty? + puts "WARNING: No export_names found in workflow configuration. This may make it tricky to interpret the field-level data. See `export_name` documentation in https://github.com/zooniverse/scribeAPI/wiki/Project-Workflows#tasks" if ! has_export_names + # Rebuild indexes FinalSubjectSet.rebuild_indexes Project.current @@ -403,26 +408,26 @@ namespace :project do sets = project.subject_sets.offset(offset).limit(limit).each_with_index do |set, i| final_set = FinalSubjectSet.assert_for_set set, rebuild + built += 1 ellapsed = Time.now - start per_set = ellapsed / built remaining = per_set * (count - (offset + i+1)) / 60 / 60 complete = (offset + i+1).to_f / count * 100 - # puts "Est time remaining: #{ellapsed} (#{per_set}) #{remaining}h" $stderr.print "\r#{'%.8f' % complete}% complete. #{'%.1f' % remaining}h remaining. 
Built #{offset +i+1} of #{count}" end end end + desc "Using data in final_subject* collections, generate a series of JSON exports and attempt to create a downloadable ZIP" task :export_final_data, [:project_key] => :environment do |task, args| project = project_by_key args[:project_key] # Make sure user has run build_final_data first: if project.final_subject_sets.empty? - puts "No FinalSubjectSets found. Invoking project:build_final_data" - Rake::Task['project:build_final_data'].invoke(args[:project_key]) - puts "----------------" + puts "No FinalSubjectSets found." + exit end export_base = "tmp/export/#{project.key}" @@ -434,15 +439,13 @@ namespace :project do start = Time.now built = 0 - limit = 10 # 100 + limit = 100 count = FinalSubjectSet.count - count = 9 (0..count).step(limit).each do |offset| project.final_subject_sets.offset(offset).limit(limit).each_with_index do |set, i| path = "#{export_base}/#{set.subject_set_id}.json" content = FinalSubjectSetSerializer.new(set, root:false).to_json - puts "content: #{content}" File.open path, "w" do |f| f << content end @@ -451,10 +454,9 @@ namespace :project do # puts "Wrote #{i+1} of #{count}: #{content.size}b to #{path}" ellapsed = Time.now - start per_set = ellapsed / built - remaining = per_set * (count - (offset + i+1)) / 60 / 60 + remaining = per_set * (count - (offset + i+1)) / 60 complete = (offset + i+1).to_f / count * 100 - # puts "Est time remaining: #{ellapsed} (#{per_set}) #{remaining}h" - $stderr.print "\r#{'%.8f' % complete}% complete. #{'%.1f' % remaining}h remaining. Built #{offset +i+1} of #{count}" + $stderr.print "\r#{'%.8f' % complete}% complete. #{'%.1f' % remaining}m remaining. Built #{offset +i+1} of #{count}" end end @@ -473,6 +475,11 @@ namespace :project do puts "Done." 
end + desc "Convenience method that, in one call, builds all data JSONs and zips them up into a single ZIP release" + task :build_and_export_final_data, [:project_key, :rebuild] => :environment do |task, args| + Rake::Task['project:build_final_data'].invoke(args[:project_key], args[:rebuild]) + Rake::Task['project:export_final_data'].invoke(args[:project_key]) + end def translate_pick_one_tool_config(task_hash) config = task_hash[:tool_config] || {} From ef9d79b4cc40dbbc6264a5eb33fae41653490dd8 Mon Sep 17 00:00:00 2001 From: Paul Beaudoin Date: Wed, 3 Feb 2016 17:19:20 -0500 Subject: [PATCH 04/23] improved data export by pushing to s3 if env vars avail instead of writing to disk, which will fail for any host with ephemeral storage..; allow browsing final-subject-sets in /#/data/exports even if no finaldataexport record avail --- Gemfile | 4 +- Gemfile.lock | 10 +++- .../components/final-subject-set-browser.cjsx | 31 +++++++----- app/models/project.rb | 1 + app/serializers/project_serializer.rb | 6 ++- lib/tasks/project.rake | 50 ++++++++++++++----- 6 files changed, 74 insertions(+), 28 deletions(-) diff --git a/Gemfile b/Gemfile index 6c79e565..030abf9d 100644 --- a/Gemfile +++ b/Gemfile @@ -14,7 +14,7 @@ gem 'omniauth-facebook' gem "omniauth-google-oauth2" gem 'omniauth-zooniverse', '~> 0.0.3' -gem 'mongoid' # , '~> 4.0.2' +gem 'mongoid', '~> 4.0.2' gem 'active_model_serializers' gem 'mongoid-serializer' gem 'rack-cors', :require => 'rack/cors' @@ -38,6 +38,8 @@ gem 'puma', '~> 2.14.0' gem 'logstasher', '~> 0.6' +gem 'aws-sdk', '~> 2' + # gem 'mongoid_fulltext' group :development do diff --git a/Gemfile.lock b/Gemfile.lock index c4875856..8f27022c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -52,6 +52,12 @@ GEM tzinfo (~> 0.3.37) addressable (2.3.8) arel (4.0.2) + aws-sdk (2.2.14) + aws-sdk-resources (= 2.2.14) + aws-sdk-core (2.2.14) + jmespath (~> 1.0) + aws-sdk-resources (2.2.14) + aws-sdk-core (= 2.2.14) bcrypt (3.1.10) better_errors (2.1.1) coderay (>= 
1.0.0) @@ -126,6 +132,7 @@ GEM jbuilder (1.5.3) activesupport (>= 3.0.0) multi_json (>= 1.2.0) + jmespath (1.1.3) jquery-rails (3.1.2) railties (>= 3.0, < 5.0) thor (>= 0.14, < 2.0) @@ -303,6 +310,7 @@ PLATFORMS DEPENDENCIES actionpack-action_caching active_model_serializers + aws-sdk (~> 2) better_errors binding_of_caller browserify-rails (~> 0.9.1) @@ -322,7 +330,7 @@ DEPENDENCIES kaminari launchy logstasher (~> 0.6) - mongoid + mongoid (~> 4.0.2) mongoid-rspec (>= 1.6.0)! mongoid-serializer moped diff --git a/app/assets/javascripts/components/final-subject-set-browser.cjsx b/app/assets/javascripts/components/final-subject-set-browser.cjsx index 3e99c5a7..4abfccdc 100644 --- a/app/assets/javascripts/components/final-subject-set-browser.cjsx +++ b/app/assets/javascripts/components/final-subject-set-browser.cjsx @@ -88,23 +88,30 @@ module.exports = React.createClass else

- Download Latest Raw Data - + { if @state.project.latest_export? +
+ Download Latest Raw Data -

Data Exports

+

Data Exports

- { if ! @state.searched_keyword -
-

Download

+ { if ! @state.searched_keyword +
+

Download

-

Participants have made {@state.project.classification_count.toLocaleString()} contributions to {@state.project.title} to date. This project periodically builds a merged, anonymized dump of that data, which is made public here.

- -

You can download the latest using the button in the upper-right. For help interpretting the data, see Scribe WIKI on Data Exports.

+

Participants have made {@state.project.classification_count.toLocaleString()} contributions to {@state.project.title} to date. This project periodically builds a merged, anonymized dump of that data, which is made public here.

+ +

You can download the latest using the button in the upper-right. For help interpretting the data, see Scribe WIKI on Data Exports.

-

Browse

+

Browse

-

Preview the data by searching by keyword below:

-
+

Preview the data by searching by keyword below:

+
+ } +
+ else +
+

Participants have made {@state.project.classification_count.toLocaleString()} contributions to {@state.project.title} to date. This project periodically builds a merged, anonymized snapshot of that data, which can be browsed here.

+
}
diff --git a/app/models/project.rb b/app/models/project.rb index 410a2a82..27c99c5d 100644 --- a/app/models/project.rb +++ b/app/models/project.rb @@ -46,6 +46,7 @@ class Project has_many :workflows, dependent: :destroy, order: "order ASC" has_many :subjects has_many :final_subject_sets + has_many :final_data_exports scope :most_recent, -> { order(updated_at: -1) } scope :active, -> { where(status: 'active') } diff --git a/app/serializers/project_serializer.rb b/app/serializers/project_serializer.rb index 6b129840..90f33cf3 100644 --- a/app/serializers/project_serializer.rb +++ b/app/serializers/project_serializer.rb @@ -1,9 +1,11 @@ class ProjectSerializer < ActiveModel::MongoidSerializer - attributes :id, :title, :short_title, :summary, :home_page_content, :organizations , :team, :pages, :menus, :partials, :logo, :background, :workflows, :forum, :tutorial, :feedback_form_url, :metadata_search, :terms_map, :blog_url, :discuss_url, :privacy_policy, :downloadable_data + attributes :id, :title, :short_title, :summary, :home_page_content, :organizations , :team, :pages, :menus, :partials, :logo, :background, :workflows, :forum, :tutorial, :feedback_form_url, :metadata_search, :terms_map, :blog_url, :discuss_url, :privacy_policy, :downloadable_data, :latest_export attributes :classification_count has_many :workflows - # delegate :current_or_guest_user, to: :scope + def latest_export + FinalDataExportSerializer.new FinalDataExport.most_recent.first, root: false + end def classification_count Classification.count diff --git a/lib/tasks/project.rake b/lib/tasks/project.rake index d6f30e9f..93446e25 100644 --- a/lib/tasks/project.rake +++ b/lib/tasks/project.rake @@ -430,13 +430,20 @@ namespace :project do exit end - export_base = "tmp/export/#{project.key}" + missing_env_keys = ['S3_EXPORT_BUCKET','S3_EXPORT_PATH','S3KEY','S3SECRET'].select { |k| ENV[k].nil? } + if ! missing_env_keys.empty? 
+ puts "Can not export data without setting #{missing_env_keys.join ", "}" + exit + end - # Remove previous: - `rm -rf #{export_base}` if File.exists?(export_base) + s3client = Aws::S3::Client.new - Dir.mkdir(export_base) unless File.exists?(export_base) + local_export_base = "#{Rails.root}/tmp/export/#{project.key}" + + # Remove previous: + # `rm -rf #{local_export_base}` if File.exists?(local_export_base) + Dir.mkdir(local_export_base) unless File.exists?(local_export_base) start = Time.now built = 0 limit = 100 @@ -444,7 +451,7 @@ namespace :project do (0..count).step(limit).each do |offset| project.final_subject_sets.offset(offset).limit(limit).each_with_index do |set, i| - path = "#{export_base}/#{set.subject_set_id}.json" + path = "#{local_export_base}/#{set.subject_set_id}.json" content = FinalSubjectSetSerializer.new(set, root:false).to_json File.open path, "w" do |f| f << content @@ -463,16 +470,35 @@ namespace :project do # Generate timestamped filename with random suffix so it can't be guessed: rand_suffix = (('a'..'z').to_a + (0..9).to_a).shuffle[0,16].join max_updated = project.final_subject_sets.max(:updated_at) - filename = "scribe-#{project.key}-#{max_updated.strftime("%F")}-#{rand_suffix}" + filename = "scribe-#{project.key}-#{max_updated.strftime("%F")}-#{rand_suffix}.tar.gz" + + # Zip it up + Rails.logger.info "Rake Complete, Begin GZIP, Go to S3" + sh %{cd #{local_export_base}; tar cfvz #{filename} --exclude '*.gz' .;} + Rails.logger.info "Tar-ing Complete" + + # Upload it to S3 + s3client = Aws::S3::Client.new + local_path = "#{local_export_base}/#{filename}" + remote_path = "#{ENV['S3_EXPORT_PATH']}/#{filename}" + + Rails.logger.info "Uploading #{local_path} to #{ENV['S3_EXPORT_BUCKET']}#{remote_path}" + s3client.put_object({ + acl: 'public-read', + bucket: ENV['S3_EXPORT_BUCKET'], + key: remote_path, + body: File.read(local_path) + }) + + # Remove local temp files + sh %{rm -rf #{local_export_base};} - path = "/#{filename}.zip" - puts 
"Zipping #{path}" - `zip --junk-paths -r public#{path} #{export_base}` - puts "Finished building exports. Download at: /#{filename}.zip" + # Create the final-data-export record so it appears on /#/data/exports + s3_url = "http://#{ENV['S3_EXPORT_BUCKET']}/#{remote_path}" + FinalDataExport.create path: s3_url, num_final_subject_sets: count, project: project - FinalDataExport.create path: path, num_final_subject_sets: count + puts "Finished building exports. Download at: #{s3_url}" - puts "Done." end desc "Convenience method that, in one call, builds all data JSONs and zips them up into a single ZIP release" From 78f9f5b836fd4d980141e3c20578898233426939 Mon Sep 17 00:00:00 2001 From: Paul Beaudoin Date: Wed, 3 Feb 2016 17:20:56 -0500 Subject: [PATCH 05/23] add missing serializer --- app/serializers/final_data_export_serializer.rb | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 app/serializers/final_data_export_serializer.rb diff --git a/app/serializers/final_data_export_serializer.rb b/app/serializers/final_data_export_serializer.rb new file mode 100644 index 00000000..f26c2a8c --- /dev/null +++ b/app/serializers/final_data_export_serializer.rb @@ -0,0 +1,3 @@ +class FinalDataExportSerializer < ActiveModel::MongoidSerializer + attributes :created_at, :num_final_subject_sets +end From e24f5a99cdcd7cd4eb67b23d101d19c8dc9500f5 Mon Sep 17 00:00:00 2001 From: Paul Beaudoin Date: Wed, 3 Feb 2016 17:31:55 -0500 Subject: [PATCH 06/23] wrong check for env keys in export-final-data script --- lib/tasks/project.rake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tasks/project.rake b/lib/tasks/project.rake index 93446e25..d4fbc70f 100644 --- a/lib/tasks/project.rake +++ b/lib/tasks/project.rake @@ -430,7 +430,7 @@ namespace :project do exit end - missing_env_keys = ['S3_EXPORT_BUCKET','S3_EXPORT_PATH','S3KEY','S3SECRET'].select { |k| ENV[k].nil? 
} + missing_env_keys = ['S3_EXPORT_BUCKET','S3_EXPORT_PATH','AWS_REGION','AWS_ACCESS_KEY_ID','AWS_SECRET_ACCESS_KEY'].select { |k| ENV[k].nil? } if ! missing_env_keys.empty? puts "Can not export data without setting #{missing_env_keys.join ", "}" exit From 04fb6e0df3b253a422298da4b8404675b0ac110e Mon Sep 17 00:00:00 2001 From: Paul Beaudoin Date: Wed, 3 Feb 2016 17:42:18 -0500 Subject: [PATCH 07/23] lets mkdir recursive --- lib/tasks/project.rake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tasks/project.rake b/lib/tasks/project.rake index d4fbc70f..c0f3f8be 100644 --- a/lib/tasks/project.rake +++ b/lib/tasks/project.rake @@ -443,7 +443,7 @@ namespace :project do # Remove previous: # `rm -rf #{local_export_base}` if File.exists?(local_export_base) - Dir.mkdir(local_export_base) unless File.exists?(local_export_base) + FileUtils.mkdir_p(local_export_base) unless File.exists?(local_export_base) start = Time.now built = 0 limit = 100 From d70f6f012a4d83dfb3b1c9ff7863a5f6e408fb44 Mon Sep 17 00:00:00 2001 From: Paul Beaudoin Date: Thu, 4 Feb 2016 13:32:22 -0500 Subject: [PATCH 08/23] final data browsing bugs; show/hide region; breaking assertion into its own component --- .../components/final-subject-assertion.cjsx | 57 +++++++++++++++++++ .../components/final-subject-set-page.cjsx | 34 +---------- .../final-subject-set-browser.styl | 9 +++ 3 files changed, 69 insertions(+), 31 deletions(-) create mode 100644 app/assets/javascripts/components/final-subject-assertion.cjsx diff --git a/app/assets/javascripts/components/final-subject-assertion.cjsx b/app/assets/javascripts/components/final-subject-assertion.cjsx new file mode 100644 index 00000000..2f085ed1 --- /dev/null +++ b/app/assets/javascripts/components/final-subject-assertion.cjsx @@ -0,0 +1,57 @@ +React = require 'react' +API = require '../lib/api' + +module.exports = React.createClass + displayName: 'FinalSubjectAssertion' + + propTypes: -> + assertion: React.PropTypes.object.isRequired + 
+ getInitialState: -> + showingRegion: false + + toggleRegion: (e) -> + console.log "show: ", ! @state.showingRegion + @setState showingRegion: ! @state.showingRegion + + render: -> +
+

{@props.assertion.name}

+ +
    + { for k of @props.assertion.data +
  • + {@props.assertion.data[k]} + { if k != 'value' + ({k.replace /_/g, ' '}) + } +
  • + } +
+
+
Confidence
+
{Math.round(100 * @props.assertion.confidence)}%
+
Status
+
{@props.assertion.status.replace /_/, ' '}
+
Distinct Transcriptions
+
{@props.assertion.versions?.length || 0}
+
+ + { if @state.showingRegion + Hide {@props.project.term('mark')} + else + Show {@props.project.term('mark')} + } + + { + viewer_width = @props.assertion.region.width + scale = viewer_width / @props.assertion.region.width + s = + background: "url(#{@props.subject.location.standard}) no-repeat -#{Math.round(@props.assertion.region.x * scale)}px -#{Math.round(@props.assertion.region.y * scale)}px" + width: viewer_width + 'px' + height: (if @state.showingRegion then Math.round(@props.assertion.region.height * scale) else 0) + 'px' + classes = ['image-crop'] + classes.push 'showing' if @state.showingRegion +
+ } +
diff --git a/app/assets/javascripts/components/final-subject-set-page.cjsx b/app/assets/javascripts/components/final-subject-set-page.cjsx index 4cdcd27e..a77cecee 100644 --- a/app/assets/javascripts/components/final-subject-set-page.cjsx +++ b/app/assets/javascripts/components/final-subject-set-page.cjsx @@ -1,7 +1,7 @@ React = require 'react' API = require '../lib/api' -GenericButton = require('components/buttons/generic-button') +FinalSubjectAssertion = require('components/final-subject-assertion') module.exports = React.createClass displayName: 'FinalSubjectSetPage' @@ -28,6 +28,7 @@ module.exports = React.createClass
    { + # Sort assertions by order they appear in document: assertions = subject.assertions.sort (a1,a2) -> if a1.region.y < a2.region.y -1 @@ -37,36 +38,7 @@ module.exports = React.createClass } { for assertion,i in assertions when assertion.name
  • -

    {assertion.name}

    - -
      - { for k of assertion.data - console.log "assertion data: ", k, assertion.data -
    • - {assertion.data[k]} - { if k != 'value' - ({k.replace /_/g, ' '}) - } -
    • - } -
    -
    -
    Confidence
    -
    {Math.round(100 * assertion.confidence)}%
    -
    Status
    -
    {assertion.status.replace /_/, ' '}
    -
    Distinct Classifications
    -
    {assertion.classifications?.length || 0}
    -
    - { - viewer_width = assertion.region.width - scale = viewer_width / assertion.region.width - s = - background: "url(#{subject.location.standard}) no-repeat -#{Math.round(assertion.region.x * scale)}px -#{Math.round(assertion.region.y * scale)}px" - width: viewer_width + 'px' - height: Math.round(assertion.region.height * scale) + 'px' -
    - } +
  • }
diff --git a/app/assets/stylesheets/final-subject-set-browser.styl b/app/assets/stylesheets/final-subject-set-browser.styl index e0b4f1b0..4558b717 100644 --- a/app/assets/stylesheets/final-subject-set-browser.styl +++ b/app/assets/stylesheets/final-subject-set-browser.styl @@ -103,9 +103,18 @@ margin-left 10px margin-right 40px + .show-region-link + font-size 12px + .image-crop opacity 0.7 + transition: height 0.3s ease-out + + &.showing + border solid gray 1px + + &:hover opacity 1 From 12b09285c494c59d67ee0f8cfad978775ff75a06 Mon Sep 17 00:00:00 2001 From: Paul Beaudoin Date: Thu, 4 Feb 2016 15:32:57 -0500 Subject: [PATCH 09/23] reorganized final subject set browser for clarity; added loading spinner, confidence & status css classes --- .../components/final-subject-assertion.cjsx | 19 ++- .../components/final-subject-set-browser.cjsx | 146 ++++++++++-------- .../components/final-subject-set-page.cjsx | 4 + .../final-subject-set-browser.styl | 22 ++- 4 files changed, 116 insertions(+), 75 deletions(-) diff --git a/app/assets/javascripts/components/final-subject-assertion.cjsx b/app/assets/javascripts/components/final-subject-assertion.cjsx index 2f085ed1..01857308 100644 --- a/app/assets/javascripts/components/final-subject-assertion.cjsx +++ b/app/assets/javascripts/components/final-subject-assertion.cjsx @@ -15,7 +15,16 @@ module.exports = React.createClass @setState showingRegion: ! @state.showingRegion render: -> -
+ + confidence = Math.round(100 * @props.assertion.confidence) + confidence_label = 'low' + confidence_label = 'med' if confidence >= 50 + confidence_label = 'high' if confidence >= 66 + confidence_label = 'max' if confidence == 100 + + status_label = @props.assertion.status.replace /_/, ' ' + +

{@props.assertion.name}

    @@ -29,10 +38,10 @@ module.exports = React.createClass }
-
Confidence
-
{Math.round(100 * @props.assertion.confidence)}%
-
Status
-
{@props.assertion.status.replace /_/, ' '}
+
Confidence
+
{confidence}%
+
Status
+
{status_label}
Distinct Transcriptions
{@props.assertion.versions?.length || 0}
diff --git a/app/assets/javascripts/components/final-subject-set-browser.cjsx b/app/assets/javascripts/components/final-subject-set-browser.cjsx index 4abfccdc..742be140 100644 --- a/app/assets/javascripts/components/final-subject-set-browser.cjsx +++ b/app/assets/javascripts/components/final-subject-set-browser.cjsx @@ -3,6 +3,7 @@ React = require 'react' API = require '../lib/api' Project = require 'models/project.coffee' GenericButton = require('components/buttons/generic-button') +LoadingIndicator = require('components/loading-indicator') module.exports = React.createClass displayName: 'FinalSubjectSetBrowser' @@ -53,10 +54,10 @@ module.exports = React.createClass @setState results: results searched_keyword: @props.query.keyword - current_page: @state.fetching_page - fetching_keyword: null + current_page: page fetching_page: null more_pages: sets?[0]?.getMeta('next_page') + fetching_keyword: null handleKeyPress: (e) -> if @isMounted() @@ -74,11 +75,82 @@ module.exports = React.createClass handleChange: (e) -> @setState entered_keyword: e.target.value - + + renderSearch: -> +
+

Browse

+ +

Preview the data by searching by keyword below:

+ + + + + + { if @state.fetching_keyword && @state.fetching_keyword != @state.searched_keyword + + + else if @state.searched_keyword && @state.results.length == 0 +

No matches yet for "{@state.searched_keyword}"

+ + else if @state.results.length > 0 +
+

Found {@state.results[0].getMeta('total')} matches

+
    + { for set in @state.results + url = "/#/data/exports/#{set.id}?keyword=#{@state.searched_keyword}" + matches = [] + safe_keyword = (w.replace(/\W/g, "\\$&") for w in @state.searched_keyword.toLowerCase().replace(/"/g,'').split(' ')).join("|") + regex = new RegExp("(#{safe_keyword})", 'gi') + for k of set.search_terms_by_field + matches.push(field: k, term: v) for v in set.search_terms_by_field[k] when v.match(regex) +
  • +
    + + + +
    +
    + { for m,i in matches[0...2] + + } +
    +
  • + } +
+ { if @state.fetching_keyword && @state.fetching_keyword == @state.searched_keyword + + + else if @state.more_pages + + } +
+ } +
+ + renderDownloadCopy: -> +
+ + { if ! @state.fetching_keyword && ! @state.searched_keyword +
+

Download

+ +

You can download the latest using the button in the upper-right. For help interpretting the data, see Scribe WIKI on Data Exports.

+ +
+ } +
+ render: -> return null if ! @state.project?
+

Data Exports

+ { if ! @state.project.downloadable_data
@@ -91,72 +163,22 @@ module.exports = React.createClass { if @state.project.latest_export?
Download Latest Raw Data - -

Data Exports

- - { if ! @state.searched_keyword -
-

Download

- -

Participants have made {@state.project.classification_count.toLocaleString()} contributions to {@state.project.title} to date. This project periodically builds a merged, anonymized dump of that data, which is made public here.

- -

You can download the latest using the button in the upper-right. For help interpretting the data, see Scribe WIKI on Data Exports.

- -

Browse

- -

Preview the data by searching by keyword below:

-
- }
+ else -
-

Participants have made {@state.project.classification_count.toLocaleString()} contributions to {@state.project.title} to date. This project periodically builds a merged, anonymized snapshot of that data, which can be browsed here.

-
+

Participants have made {@state.project.classification_count.toLocaleString()} contributions to {@state.project.title} to date. This project periodically builds a merged, anonymized snapshot of that data, which can be browsed here.

} -
- - -
+ { if ! @state.searched_keyword +

Participants have made {@state.project.classification_count.toLocaleString()} contributions to {@state.project.title} to date. This project periodically builds a merged, anonymized dump of that data, which is made public here.

+ } - { if @state.searched_keyword && @state.results.length == 0 -

No matches yet for "{@state.searched_keyword}"

+ { @renderSearch() } - else if @state.results.length > 0 -
-

Found {@state.results[0].getMeta('total')} matches

-
    - { for set in @state.results - url = "/#/data/exports/#{set.id}?keyword=#{@state.searched_keyword}" - matches = [] - safe_keyword = (w.replace(/\W/g, "\\$&") for w in @state.searched_keyword.toLowerCase().replace(/"/g,'').split(' ')).join("|") - regex = new RegExp("(#{safe_keyword})", 'gi') - for k of set.search_terms_by_field - matches.push(field: k, term: v) for v in set.search_terms_by_field[k] when v.match(regex) -
  • -
    - - - -
    -
    - { for m,i in matches[0...2] - - } -
    -
  • - } -
- { if @state.more_pages - - } -
+ { if ! @state.searched_keyword + @renderDownloadCopy() } +
}
diff --git a/app/assets/javascripts/components/final-subject-set-page.cjsx b/app/assets/javascripts/components/final-subject-set-page.cjsx index a77cecee..021cd873 100644 --- a/app/assets/javascripts/components/final-subject-set-page.cjsx +++ b/app/assets/javascripts/components/final-subject-set-page.cjsx @@ -18,7 +18,11 @@ module.exports = React.createClass return null if ! @state.set
+
+ + Back + Download Raw Data

Set {@state.set.id}

diff --git a/app/assets/stylesheets/final-subject-set-browser.styl b/app/assets/stylesheets/final-subject-set-browser.styl index 4558b717..f2d067f8 100644 --- a/app/assets/stylesheets/final-subject-set-browser.styl +++ b/app/assets/stylesheets/final-subject-set-browser.styl @@ -21,6 +21,13 @@ font-size 2em margin-right 10px + .loading-indicator + width 100px + margin 100px auto + color black + position inherit + display block + ul.results li display inline-block @@ -68,13 +75,6 @@ clear both margin-bottom 0 - .confidence - opacity 0.5 - - &:after - content "% confidence" - - ul.assertion-data clear left @@ -98,11 +98,17 @@ dt &:after content ":" - + dd margin-left 10px margin-right 40px + &.confidence-med + background-color yellow + + &.status-complete, &.confidence-max, &.confidence-high + background-color lime + .show-region-link font-size 12px From f1487baf709430f40a6744e988de89d807fe29f7 Mon Sep 17 00:00:00 2001 From: Paul Beaudoin Date: Tue, 16 Feb 2016 15:53:42 -0500 Subject: [PATCH 10/23] Adding new ExportDocument model and associated config, builders, + serializers to support configurable automatic data cleanup against a simple schema that specifies data type, constraints, composition, repeatability --- .../components/final-subject-assertion.cjsx | 9 + .../components/final-subject-set-browser.cjsx | 80 +++++--- .../components/final-subject-set-page.cjsx | 98 +++++++--- .../final-subject-set-browser.styl | 86 +++++++- .../final_subject_sets_controller.rb | 49 ++++- app/models/export/document.rb | 36 ++++ app/models/export/document_builder.rb | 185 ++++++++++++++++++ app/models/export/document_field.rb | 33 ++++ app/models/export/spec/document.rb | 16 ++ app/models/export/spec/document_field.rb | 26 +++ app/models/final_subject.rb | 1 - app/models/final_subject_assertion.rb | 2 + app/models/final_subject_set.rb | 44 ++++- app/models/project.rb | 2 + .../export/document_field_serializer.rb | 8 + app/serializers/export/document_serializer.rb | 5 + 
.../export/spec/document_field_serializer.rb | 7 + .../export/spec/document_serializer.rb | 5 + .../final_subject_set_serializer.rb | 1 + app/serializers/generic_result_serializer.rb | 4 +- app/serializers/project_serializer.rb | 2 + lib/tasks/project.rake | 33 +++- 22 files changed, 658 insertions(+), 74 deletions(-) create mode 100644 app/models/export/document.rb create mode 100644 app/models/export/document_builder.rb create mode 100644 app/models/export/document_field.rb create mode 100644 app/models/export/spec/document.rb create mode 100644 app/models/export/spec/document_field.rb create mode 100644 app/serializers/export/document_field_serializer.rb create mode 100644 app/serializers/export/document_serializer.rb create mode 100644 app/serializers/export/spec/document_field_serializer.rb create mode 100644 app/serializers/export/spec/document_serializer.rb diff --git a/app/assets/javascripts/components/final-subject-assertion.cjsx b/app/assets/javascripts/components/final-subject-assertion.cjsx index 01857308..1cf68b78 100644 --- a/app/assets/javascripts/components/final-subject-assertion.cjsx +++ b/app/assets/javascripts/components/final-subject-assertion.cjsx @@ -30,7 +30,16 @@ module.exports = React.createClass
    { for k of @props.assertion.data
  • + { + cleaned_version = null + if @props.field && @props.field.value + cleaned_version = if (typeof @props.field.value) == 'object' then @props.field.value[k] else @props.field.value + null + } {@props.assertion.data[k]} + { if cleaned_version && ('' + cleaned_version) != @props.assertion.data[k] + ( Interpretted as {if (typeof cleaned_version) == 'object' then cleaned_version.join(' x ') else cleaned_version } ) + } { if k != 'value' ({k.replace /_/g, ' '}) } diff --git a/app/assets/javascripts/components/final-subject-set-browser.cjsx b/app/assets/javascripts/components/final-subject-set-browser.cjsx index 742be140..9b6d59cd 100644 --- a/app/assets/javascripts/components/final-subject-set-browser.cjsx +++ b/app/assets/javascripts/components/final-subject-set-browser.cjsx @@ -12,7 +12,8 @@ module.exports = React.createClass getInitialState:-> entered_keyword: @props.query.keyword - searched_keyword: null + selected_field: @props.query.field + searched_query: {} fetching_keyword: null current_page: 1 more_pages: false @@ -30,19 +31,20 @@ module.exports = React.createClass checkKeyword: (props = @props) -> if props.query.keyword - @fetch props.query.keyword + @fetch({keyword: props.query.keyword, field: props.query.field}) - fetch: (keyword, page = 1) -> + fetch: (query, page = 1) -> return if ! 
@isMounted() - if keyword != @state.fetching_keyword + if query.keyword != @state.fetching_keyword || query.field != @state.selected_field results = @state.results - results = [] if @state.searched_keyword != keyword - @setState fetching_keyword: keyword, fetching_page: page, results: results, () => + results = [] if @state.searched_query?.keyword != query.keyword + @setState fetching_keyword: query.keyword, fetching_page: page, results: results, () => per_page = 20 params = - keyword: keyword + keyword: query.keyword + field: query.field per_page: per_page page: @state.fetching_page @@ -53,7 +55,9 @@ module.exports = React.createClass results[i + offset] = s @setState results: results - searched_keyword: @props.query.keyword + searched_query: + keyword: @props.query.keyword + field: @props.query.field current_page: page fetching_page: null more_pages: sets?[0]?.getMeta('next_page') @@ -65,44 +69,70 @@ module.exports = React.createClass if [13].indexOf(e.keyCode) >= 0 # ENTER: @search e.target.value - search: (keyword) -> - keyword = @refs.search_input?.getDOMNode().value.trim() unless keyword? + search: (keyword, search_field) -> + keyword = @state.entered_keyword # refs.search_input?.getDOMNode().value.trim() unless keyword? + field = @state.selected_field # @refs.search_field?.getDOMNode().value.trim() - @transitionTo "final_subject_sets", null, {keyword: keyword} + @transitionTo "final_subject_sets", null, {keyword: keyword, field: field} loadMore: -> - @fetch @state.searched_keyword, @state.current_page + 1 + @fetch @state.searched_query, @state.current_page + 1 handleChange: (e) -> @setState entered_keyword: e.target.value + handleFieldSelect: (e) -> + @setState selected_field: e.target.value + + renderSearch: ->

    Browse

    Preview the data by searching by keyword below:

    - - + { if @state.project.export_document_specs?[0]?.spec_fields + + } +
    + + +
    - { if @state.fetching_keyword && @state.fetching_keyword != @state.searched_keyword + { if @state.fetching_keyword && @state.fetching_keyword != @state.searched_query?.keyword - else if @state.searched_keyword && @state.results.length == 0 -

    No matches yet for "{@state.searched_keyword}"

    + else if @state.searched_query?.keyword && @state.results.length == 0 +

    No matches yet for "{@state.searched_query.keyword}"

    else if @state.results.length > 0

    Found {@state.results[0].getMeta('total')} matches

      { for set in @state.results - url = "/#/data/exports/#{set.id}?keyword=#{@state.searched_keyword}" + url = "/#/data/exports/#{set.id}?keyword=#{@state.searched_query.keyword}&field=#{@state.searched_query.field}" matches = [] - safe_keyword = (w.replace(/\W/g, "\\$&") for w in @state.searched_keyword.toLowerCase().replace(/"/g,'').split(' ')).join("|") + + safe_keyword = (w.replace(/\W/g, "\\$&") for w in @state.searched_query.keyword.toLowerCase().replace(/"/g,'').split(' ')).join("|") + safe_keyword = (c for c in safe_keyword).join ",?" regex = new RegExp("(#{safe_keyword})", 'gi') - for k of set.search_terms_by_field - matches.push(field: k, term: v) for v in set.search_terms_by_field[k] when v.match(regex) + + # If a specific field searched, always show that: + if @state.searched_query?.field + term = set.search_terms_by_field[@state.searched_query.field]?.join("; ") + matches.push(field: @state.searched_query.field, term: term) if term + + # Otherwise show all fields that match + else + for k of set.search_terms_by_field + matches.push(field: k, term: v) for v in set.search_terms_by_field[k] when v.match(regex) +
    • }
    - { if @state.fetching_keyword && @state.fetching_keyword == @state.searched_keyword + { if @state.fetching_keyword && @state.fetching_keyword == @state.searched_query?.keyword else if @state.more_pages @@ -135,7 +165,7 @@ module.exports = React.createClass renderDownloadCopy: ->
    - { if ! @state.fetching_keyword && ! @state.searched_keyword + { if ! @state.fetching_keyword && ! @state.searched_query?.keyword

    Download

    @@ -169,13 +199,13 @@ module.exports = React.createClass

    Participants have made {@state.project.classification_count.toLocaleString()} contributions to {@state.project.title} to date. This project periodically builds a merged, anonymized snapshot of that data, which can be browsed here.

    } - { if ! @state.searched_keyword + { if ! @state.searched_query?.keyword

    Participants have made {@state.project.classification_count.toLocaleString()} contributions to {@state.project.title} to date. This project periodically builds a merged, anonymized dump of that data, which is made public here.

    } { @renderSearch() } - { if ! @state.searched_keyword + { if ! @state.searched_query?.keyword @renderDownloadCopy() } diff --git a/app/assets/javascripts/components/final-subject-set-page.cjsx b/app/assets/javascripts/components/final-subject-set-page.cjsx index 021cd873..0acee62a 100644 --- a/app/assets/javascripts/components/final-subject-set-page.cjsx +++ b/app/assets/javascripts/components/final-subject-set-page.cjsx @@ -8,11 +8,28 @@ module.exports = React.createClass getInitialState:-> set: null + tab: null + tabs: [] componentDidMount: -> API.type("final_subject_sets").get(@props.params.final_subject_set_id).then (set) => + tabs = [] + tabs.push 'export-doc' if set.export_document + tabs.push 'assertions' @setState set: set + tab: tabs[0] + tabs: tabs + + showExportDoc: -> + @showTab 'export-doc' + + showAssertions: -> + @showTab 'assertions' + + showTab: (which) -> + @setState tab: which + render: -> return null if ! @state.set @@ -21,35 +38,64 @@ module.exports = React.createClass
    - Back + Back Download Raw Data -

    Set {@state.set.id}

    - -
      - { for subject in @state.set.subjects -
    • - -
        - { - # Sort assertions by order they appear in document: - assertions = subject.assertions.sort (a1,a2) -> - if a1.region.y < a2.region.y - -1 - else - 1 - null - } - { for assertion,i in assertions when assertion.name -
      • - -
      • - } -
      - -
    • - } + { if @state.set.export_document? && (display_field = @state.set.export_document.export_fields[0])? +

      {display_field.name} {display_field.value}

      + } + + + + + + { if @state.tab == 'export-doc' + if @state.set.export_document + for field,i in @state.set.export_document.export_fields + if field.assertion_ids + assertion = subject = null + for s in @state.set.subjects + for a in s.assertions + if field.assertion_ids.indexOf(a.id) >= 0 + assertion = a + subject = s + if assertion && subject +
      + +
      + } + + + { if @state.tab == 'assertions' +
        + { for subject in @state.set.subjects +
      • +
          + { + # Sort assertions by order they appear in document: + assertions = subject.assertions.sort (a1,a2) -> + if a1.region.y < a2.region.y + -1 + else + 1 + null + } + { for assertion,i in assertions when assertion.name +
        • + +
        • + } +
        + +
      • + } +
      + }
    diff --git a/app/assets/stylesheets/final-subject-set-browser.styl b/app/assets/stylesheets/final-subject-set-browser.styl index f2d067f8..baa27fb3 100644 --- a/app/assets/stylesheets/final-subject-set-browser.styl +++ b/app/assets/stylesheets/final-subject-set-browser.styl @@ -1,4 +1,37 @@ .final-subject-set-browser + + .tabs + padding 0 + margin 0 + margin-top 20px + list-style none + border-bottom solid #bbb 1px + + li + display inline-block + border solid #bbb + border-width 1px 1px 0 1px + margin 0 20px 0 0 + font-size 1.5em + border-radius 5px 5px 0 0 + + a + padding 10px 30px + text-decoration none + color gray + + &.active + background-color rgba(255,255,255,0.50) + + a + font-weight bold + color TERTIARY_NORMAL + + + + &.page-content + h2 + text-align left ul list-style none padding-left 0 @@ -21,6 +54,10 @@ font-size 2em margin-right 10px + select + font-size 1.3em + + .loading-indicator width 100px margin 100px auto @@ -62,6 +99,7 @@ padding-left 1em em + font-style unset font-weight bold color TERTIARY_NORMAL @@ -72,9 +110,17 @@ max-width 600px h3 + border-left none + padding 0 + line-height normal + width auto + font-size 28px clear both margin-bottom 0 + ul.assertion-data, dl.assertion-properties + margin 4px 0 4px 10px + ul.assertion-data clear left @@ -85,35 +131,57 @@ font-weight bold color #2b3a42 + span.cleaned-version + padding-left 1em + span.data-key margin-left 20px dl.assertion-properties + margin-left 10px clear left + font-size 14px dt,dd display inline color gray dt + margin 0 2px 0 0 &:after content ":" dd - margin-left 10px - margin-right 40px + margin 0 20px 0 0 + + &.confidence + color red + + &.status-complete + color green + + &.status + color TERTIARY_NORMAL &.confidence-med - background-color yellow + color TERTIARY_NORMAL + + &.confidence-max, &.confidence-high + color green - &.status-complete, &.confidence-max, &.confidence-high - background-color lime + &.confidence-low + color #FC2260 .show-region-link font-size 12px + 
display block + margin-left 10px + text-decoration none + .image-crop opacity 0.7 + margin-left 10px transition: height 0.3s ease-out @@ -124,3 +192,11 @@ &:hover opacity 1 + a.back + font-size: 1.3em; + text-decoration: none; + + &:before + content "<" + + diff --git a/app/controllers/final_subject_sets_controller.rb b/app/controllers/final_subject_sets_controller.rb index bf3e7980..ddc9bf2f 100644 --- a/app/controllers/final_subject_sets_controller.rb +++ b/app/controllers/final_subject_sets_controller.rb @@ -10,12 +10,57 @@ def index per_page = get_int :per_page, 20, (0..50) page = get_int :page, 1 + field = params[:field] keyword = params[:keyword] - @sets = FinalSubjectSet.page(page).per(per_page) - @sets = @sets.where({"$text" => {"$search" => keyword} } ) if keyword + + @sets = Project.current.final_subject_sets.page(page).per(per_page) + if ! field.blank? && (field_spec = FinalSubjectSet.export_spec_fields.select { |f| f.name == field }.first) + + match_exact = ['numeric','monetary','date'].include? field_spec.format + if field_spec && field_spec.format + split = "(-|to)" + split = " #{split} " if field_spec.format == 'date' + if keyword.match /\w+ ?#{split} ?\w+/i + values = keyword.split(/#{split}/i) + values = [values.first, values.last] + values = parse_range values, field_spec.format + + @sets = @sets.by_export_field_range(field, values) + + # specially handle searching by year: + elsif field_spec.format == 'date' && keyword.match(/^\d+$/) + values = parse_range [keyword,keyword], field_spec.format + @sets = @sets.by_export_field_range(field, values) + + # search by exact val: + else + value = parse_keyword keyword, field_spec.format + @sets = value.blank? ? [] : @sets.by_export_field(field, value, match_exact) + end + else + value = parse_keyword keyword, field_spec.format + @sets = value.blank? ? 
[] : @sets.by_export_field(field, value, match_exact) + end + + else + @sets = @sets.where({"$text" => {"$search" => keyword} } ) if keyword + end respond_with GenericResultSerializer.new(@sets) end + def parse_range(values, format) + parsed = values.map { |v| Export::DocumentBuilder.apply_format v, format } + if format == 'date' + parsed[0] = Export::DocumentBuilder.apply_format("#{values.first}-01-01",format) if parsed.first.nil? + parsed[1] = Export::DocumentBuilder.apply_format("#{values.last}-12-31",format) if parsed.last.nil? + end + parsed + end + + def parse_keyword(value, format) + parsed = Export::DocumentBuilder.apply_format value, format + parsed + end end diff --git a/app/models/export/document.rb b/app/models/export/document.rb new file mode 100644 index 00000000..3c861d66 --- /dev/null +++ b/app/models/export/document.rb @@ -0,0 +1,36 @@ +class Export::Document + include Mongoid::Document + + field :name, type: String + + belongs_to :spec, class_name: 'Export::Spec::Document' + embeds_many :export_fields, class_name: 'Export::DocumentField' + embedded_in :final_subject_set + + def self.from_set(set, specs) + specs.each do |spec| + return Export::DocumentBuilder.new(set, spec).export_document + end + end + + def data + export_fields.inject({}) do |h, f| + if h[f.name] + h[f.name] = [h[f.name]] if ! 
h[f.name].is_a?(Array) + h[f.name] << f.data + else + h[f.name] = f.data + end + h + end + end + + def to_s + ret = [] + ret << "#{spec.name}" + export_fields.each do |field| + ret << " #{field.to_s(2)}" + end + ret.join "\n" + end +end diff --git a/app/models/export/document_builder.rb b/app/models/export/document_builder.rb new file mode 100644 index 00000000..ad323cef --- /dev/null +++ b/app/models/export/document_builder.rb @@ -0,0 +1,185 @@ +class Export::DocumentBuilder + + def initialize(set, spec) + @set = set + @spec = spec + end + + def export_document + doc = Export::Document.create name: @spec.name, final_subject_set: @set, spec: @spec + @spec.spec_fields.each do |field_spec| + fields = fields_for_field_spec(field_spec) + doc.export_fields += fields if ! fields.blank? + end + if doc.export_fields.size < 3 + puts "Insufficient fields found in final-subject-set #{@set.id}: #{@set.subjects.first.location['standard']}" + nil + + else + doc + end + end + + def fields_for_field_spec(spec, base_assertion=nil) + if ! spec.repeats + best = best_for_field_spec(spec, base_assertion) + [best] if ! best.nil? + else + all_for_field_spec(spec, base_assertion) + end + end + + def best_for_field_spec(spec, base_assertion=nil) + all = all_for_field_spec(spec, base_assertion) + all.first if ! all.nil? + end + + def all_for_field_spec(spec, base_assertion=nil) + assertions = assertions_for_field_spec(spec, base_assertion).sort_by { |a| - a.confidence } + puts "[Nothing found for #{spec.name}...]" if assertions.blank? + return nil if assertions.blank? + + fields = assertions.map do |assertion| + if ! spec.sub_fields.empty? + # puts "parsing out #{spec.name}...." + field = Export::DocumentField.new name: spec.name + spec.sub_fields.each do |field_spec| + # puts " parsing out #{field_spec.name}...." + fields = fields_for_field_spec field_spec, assertion + field.sub_fields += fields if ! fields.blank? 
+ end + field + + else + clean_val = value_for_assertion assertion, spec.format, spec.format_options + Export::DocumentField.new name: spec.name, value: clean_val, original_value: assertion.data, assertion_ids: [assertion.id] + end + end + + fields.uniq do |field| + field.data + end + end + + def value_for_assertion(assertion, format=nil, format_options) + v = assertion.data + v = v["value"] if ! v["value"].nil? + v = self.class.apply_format(v, format, format_options) if ! format.nil? + v + end + + def assertions_for_field_spec(spec, base_assertion=nil) + # @doc["subjects"].first["assertions"].select { |a| a["name"] == name } + # TODO add assertion.subject_id so that we can do this: + subjects = base_assertion.nil? ? @set.subjects : [base_assertion.final_subject] + # in the meantime we'll just do this: + # subjects = @set.subjects + subjects.map do |subject| + assertions = subject.assertions + # puts "selecting within region: #{base_assertion.region}" if ! base_assertion.nil? + assertions = assertions.select { |assertion| assertion.region == base_assertion.region } if ! base_assertion.nil? + assertions = assertions.select { |a| a.name == (spec.select.nil? ? spec.name : spec.select ) } + assertions + end.flatten + end + + def self.apply_format(value, format, options=nil) + # puts "apply format: #{format} to #{value.inspect}" + case format + when 'date' + parse_date(value, options) + when 'address' + parse_address(value) + when 'monetary' + parse_monetary(value) + when 'dimensions' + parse_dimensions(value) + when 'numeric' + parse_numeric value + else + # puts "it's a hash? #{format.inspect}" + if value.is_a?(Hash) && format.is_a?(Hash) + # puts "it's a hash: #{format.inspect}" + ret = {} + value.keys.each do |k| + ret[k] = apply_format(value[k], format[k], options) + end + ret + else + value + end + end + end + + def self.parse_numeric(value) + return nil if ! 
value.match /\d/ + v = value.gsub(/,|\$|\.(-|\d{2}$)?/, '').to_i + v + end + + # Pull arbitrary number of English system dimensions from string + def self.parse_dimensions(value) + dims = [] + value.split(/x/).each do |v| + v.strip! + fract = 0 + # If there's a fraction... + fract_reg = / (\d+)\/(\d+)$/ + if (m = v.match(fract_reg)) + fract = m[1].to_f / m[2].to_f + v.sub! fract_reg, '' + end + # If inches given as [FEET].[INCHES] or [FEET] [INCHES]" .. + inches_reg = /(\.(\d+)| (\d+)")$/ + if (m = v.match(inches_reg)) + # This means previous fract was inches: (e.g. 1/2 inch) + fract /= 12 + # puts "summing fact: #{fract} + (#{m[2].to_f / 12})" + fract += m[2].to_f / 12 + v.sub! inches_reg, '' + end + dims << v.to_f + fract + end + dims + end + + def self.parse_monetary(value) + return nil if ! value.match /\d/ + v = value.gsub(/,|\$|\.(-|\d{2}$)?/, '').to_f + v + end + + def self.parse_date(value, options) + ret = nil + begin + ret = Date.parse(value) + rescue ArgumentError + puts "invalid date: #{value}" + end + + # Override default year expansion if a target range is configured and computed date is outside range: + # e.g. if options["range"] == [1850,1950], `16 should default to 1916, not 2016 + # Known issue: range should be a 100 yr span (or smaller), because otherwise century may be ambiguous + if ! ret.nil? && ! options.nil? && options["range"] && ret.year > options["range"].last + range = options["range"] + # Get two digit year: + partial_year = ret.year % 100 + # Round-down range to decades (e.g. [1800,1900]) + decades = range.map { |r| r - (r % 100) } + # See which of the (presumably 2) decades places the partial_year within range: + corrected_year = partial_year + decades.first > range.first ? decades.first + partial_year : decades.last + partial_year + # Rebuild date using corrected_year: + ret = Date.new corrected_year, ret.month, ret.day + end + + ret + end + + def self.parse_address(value) + value = value.dup + value.gsub! /^no\.? 
/i, '' + value + end + + +end diff --git a/app/models/export/document_field.rb b/app/models/export/document_field.rb new file mode 100644 index 00000000..9eb689c4 --- /dev/null +++ b/app/models/export/document_field.rb @@ -0,0 +1,33 @@ +class Export::DocumentField + include Mongoid::Document + + embedded_in :export_document + + field :name, type: String + field :value + field :original_value + field :assertion_ids, type: Array + + has_one :spec, class_name: 'Export::Spec::DocumentField' + embeds_many :sub_fields, class_name: 'Export::DocumentField' + + def data + if sub_fields.empty? + value + else + sub_fields.inject({}) do |h, f| + h[f.name] = f.data + h + end + end + end + + def to_s(indent=0) + if ! sub_fields.empty? + "#{name}:\n" + (" " * indent) + sub_fields.map { |f| f.to_s(indent+1) }.join("\n" + (" " * indent)) + + else + "#{name}: #{value} (orig \"#{original_value}\")" # [assertion(s) #{assertion_ids}]" + end + end +end diff --git a/app/models/export/spec/document.rb b/app/models/export/spec/document.rb new file mode 100644 index 00000000..88f14ea7 --- /dev/null +++ b/app/models/export/spec/document.rb @@ -0,0 +1,16 @@ +class Export::Spec::Document + include Mongoid::Document + + field :name, type: String + + embeds_many :spec_fields, class_name: 'Export::Spec::DocumentField' + embedded_in :project + + def self.from_hash(h, project) + inst = self.new project: project, name: h['name'] + inst.spec_fields = h['spec_fields'].map do |h| + Export::Spec::DocumentField.from_hash h, inst + end + inst + end +end diff --git a/app/models/export/spec/document_field.rb b/app/models/export/spec/document_field.rb new file mode 100644 index 00000000..9021f667 --- /dev/null +++ b/app/models/export/spec/document_field.rb @@ -0,0 +1,26 @@ +class Export::Spec::DocumentField + include Mongoid::Document + + field :name, type: String + field :select, type: String + field :format # string, monetary, address, {} + field :format_options, type: Hash # e.g. 
"format_options": {"range": [1850,1950]} + field :repeats, type: Boolean + embeds_many :sub_fields, class_name: 'Export::Spec::DocumentField' + embedded_in :export_document_spec + embedded_in :export_document_spec_field + + def to_s + name + (select.nil? ? '' : " (select: \"#{select}\")") + end + + def self.from_hash(h, doc_spec, parent_field=nil) + inst = self.new export_document_spec: doc_spec, name: h['name'], select: h['select'], format: h['format'], format_options: h['format_options'], repeats: h['repeats'] + if ! h['sub_fields'].blank? + h['sub_fields'].each do |sub_h| + inst.sub_fields << from_hash(sub_h, nil, inst) + end + end + inst + end +end diff --git a/app/models/final_subject.rb b/app/models/final_subject.rb index d134ded0..b045897c 100644 --- a/app/models/final_subject.rb +++ b/app/models/final_subject.rb @@ -108,7 +108,6 @@ def flattened_subjects(subjects, parents = []) ret += flattened_subjects(s.child_subjects, parents + [s]) else - # ret << FinalSubjectAssertionSerializer.new(subject: s, parents: parents) ret << {subject: s, parents: parents} if s.status != 'bad' end end diff --git a/app/models/final_subject_assertion.rb b/app/models/final_subject_assertion.rb index 9fcb50af..bf85f3a8 100644 --- a/app/models/final_subject_assertion.rb +++ b/app/models/final_subject_assertion.rb @@ -11,6 +11,8 @@ class FinalSubjectAssertion field :task_key, type: String field :instructions, type: Hash + belongs_to :root_subject, class_name: "Subject" + embedded_in :final_subject, inverse_of: :assertions def self.create_from_subject(subject, parents) diff --git a/app/models/final_subject_set.rb b/app/models/final_subject_set.rb index 74941dce..0318c92c 100644 --- a/app/models/final_subject_set.rb +++ b/app/models/final_subject_set.rb @@ -1,7 +1,31 @@ class FinalSubjectSet include Mongoid::Document include Mongoid::Timestamps - # include Mongoid::FullTextSearch + + scope :by_export_field, -> (name, value, exact) do + where({ + "export_document.export_fields" => { + 
'$elemMatch' => { + name: name, + value: ( exact ? value : { "$regex" => /#{value}/i } ) + } + } + }) + end + + scope :by_export_field_range, -> (name, values) do + where({ + "export_document.export_fields" => { + '$elemMatch' => { + name: name, + value: { + "$gte" => values.first, + "$lte" => values.last + } + } + } + }) + end belongs_to :project belongs_to :subject_set @@ -17,16 +41,13 @@ class FinalSubjectSet index({"project_id" => 1}, {background: true}) index({"search_terms" => "text"}) - # can't create two... - # index({"search_terms_by_field" => "text"}) [:total, :complete, :awaiting_votes, :in_progress, :awaiting_transcriptions].each do |field| index({"subjects.assertions_breakdown.all_workflows.#{field}" => 1}, {background: true}) end embeds_many :subjects, class_name: 'FinalSubject' - - # fulltext_search_in :fulltext_terms + embeds_one :export_document, class_name: "Export::Document" def build_search_terms update_attributes({ @@ -35,6 +56,16 @@ def build_search_terms }) end + def self.export_spec_fields + Project.current.export_document_specs.map do |spec| + spec.spec_fields + end.flatten + end + + def build_export_document + self.export_document = Export::Document.from_set self, Project.current.export_document_specs + end + def compute_fulltext_terms compute_fulltext_terms_by_field.values.flatten.uniq end @@ -60,6 +91,8 @@ def self.assert_for_set(set, rebuild=false) inst.meta_data = set.meta_data inst.update_subjects inst.build_search_terms + inst.build_export_document + puts "Saving final subject set: #{inst.id}" inst.save! 
end @@ -76,6 +109,7 @@ def self.rebuild_indexes(for_project) collection.indexes.drop unless self.count == 0 # If no records yet saved, moped will error when dropping indexes for_project.export_names.each do |(key,name)| index({"search_terms_by_field.#{key}" => 1}, {background: true}) + index({"export_document.export_fields.name" => 1, "export_document.export_fields.value" => 1}) end create_indexes end diff --git a/app/models/project.rb b/app/models/project.rb index 27c99c5d..87cea299 100644 --- a/app/models/project.rb +++ b/app/models/project.rb @@ -48,6 +48,8 @@ class Project has_many :final_subject_sets has_many :final_data_exports + embeds_many :export_document_specs, class_name: "Export::Spec::Document" + scope :most_recent, -> { order(updated_at: -1) } scope :active, -> { where(status: 'active') } diff --git a/app/serializers/export/document_field_serializer.rb b/app/serializers/export/document_field_serializer.rb new file mode 100644 index 00000000..f1d572ee --- /dev/null +++ b/app/serializers/export/document_field_serializer.rb @@ -0,0 +1,8 @@ +class Export::DocumentFieldSerializer < ActiveModel::MongoidSerializer + attributes :name, :value, :original_value, :assertion_ids + + def assertion_ids + object.assertion_ids.map { |oid| oid.to_s } unless object.assertion_ids.blank? 
+ end + +end diff --git a/app/serializers/export/document_serializer.rb b/app/serializers/export/document_serializer.rb new file mode 100644 index 00000000..b6824484 --- /dev/null +++ b/app/serializers/export/document_serializer.rb @@ -0,0 +1,5 @@ +class Export::DocumentSerializer < ActiveModel::MongoidSerializer + attributes :name + + has_many :export_fields +end diff --git a/app/serializers/export/spec/document_field_serializer.rb b/app/serializers/export/spec/document_field_serializer.rb new file mode 100644 index 00000000..30cf4db1 --- /dev/null +++ b/app/serializers/export/spec/document_field_serializer.rb @@ -0,0 +1,7 @@ +class Export::Spec::DocumentFieldSerializer < ActiveModel::MongoidSerializer + attributes :format, :name + + def format + object.sub_fields.blank? && object.format.nil? ? 'string' : object.format + end +end diff --git a/app/serializers/export/spec/document_serializer.rb b/app/serializers/export/spec/document_serializer.rb new file mode 100644 index 00000000..0156aac5 --- /dev/null +++ b/app/serializers/export/spec/document_serializer.rb @@ -0,0 +1,5 @@ +class Export::Spec::DocumentSerializer < ActiveModel::MongoidSerializer + + has_many :spec_fields + +end diff --git a/app/serializers/final_subject_set_serializer.rb b/app/serializers/final_subject_set_serializer.rb index aeca277c..8aa84a73 100644 --- a/app/serializers/final_subject_set_serializer.rb +++ b/app/serializers/final_subject_set_serializer.rb @@ -3,6 +3,7 @@ class FinalSubjectSetSerializer < ActiveModel::MongoidSerializer attributes :id, :meta_data, :type, :search_terms_by_field has_many :subjects + has_one :export_document def id object.id.to_s diff --git a/app/serializers/generic_result_serializer.rb b/app/serializers/generic_result_serializer.rb index 7990790f..b9bda015 100644 --- a/app/serializers/generic_result_serializer.rb +++ b/app/serializers/generic_result_serializer.rb @@ -27,13 +27,15 @@ def data end def meta - { + m = { current_page: object.current_page, next_page: 
object.next_page, prev_page: object.prev_page, total_pages: object.total_pages, total: object.count } if object.respond_to? :current_page + m = m.merge(serialization_options[:meta]) if ! serialization_options[:meta].nil? + m end def links diff --git a/app/serializers/project_serializer.rb b/app/serializers/project_serializer.rb index 90f33cf3..4b6f50a3 100644 --- a/app/serializers/project_serializer.rb +++ b/app/serializers/project_serializer.rb @@ -3,6 +3,8 @@ class ProjectSerializer < ActiveModel::MongoidSerializer attributes :classification_count has_many :workflows + has_many :export_document_specs + def latest_export FinalDataExportSerializer.new FinalDataExport.most_recent.first, root: false end diff --git a/lib/tasks/project.rake b/lib/tasks/project.rake index c0f3f8be..86e182e8 100644 --- a/lib/tasks/project.rake +++ b/lib/tasks/project.rake @@ -126,11 +126,12 @@ namespace :project do # load project_file_path project = Project.find_or_create_by key: project_key + load_export_specs(project, project_hash['export_specs']) if project_hash['export_specs'] + # Set all valid fields from hash: project_hash = project_hash.inject({}) { |h, (k,v)| h[k] = v if Project.fields.keys.include?(k.to_s); h } project.update project_hash - # Load pages from content/*: content_path = Rails.root.join('project', project_key, 'content') puts "Loading pages from #{content_path}:" @@ -201,6 +202,12 @@ namespace :project do project end + def load_export_specs(project, config) + project.export_document_specs = config.map do |h| + ExportDocumentSpec.from_hash h, project + end + end + def load_styles(project) load_images(project.key) @@ -386,17 +393,25 @@ namespace :project do end desc "Build final_subject* data in database" - task :build_final_data, [:project_key, :rebuild] => :environment do |task, args| - args.with_defaults rebuild: true + task :build_final_data, [:project_key, :rebuild, :start, :limit] => :environment do |task, args| + args.with_defaults rebuild: true, start: 0, 
limit: Float::INFINITY rebuild = args[:rebuild] != 'false' + start = args[:start].to_i + limit = args[:limit].to_f project = project_by_key args[:project_key] - start = Time.now + start_time = Time.now count = project.subject_sets.count - limit = 100 + last_index = [count, start + limit - 1].min + step = [100, limit].min built = 0 + # puts "set: #{SubjectSet.find("5637a11432623300030a0100").inspect}" + # FinalSubjectSet.assert_for_set SubjectSet.find("56b115677061755afb539701"), rebuild + # FinalSubjectSet.assert_for_set FinalSubjectSet.find('56b118e07061755afbfcd801').subject_set, rebuild + # exit + # Do any of this project's workflow tasks have configured export_names? If not, warn: has_export_names = ! project.workflows.map { |w| w.tasks }.flatten.select { |t| ! t.export_name.blank? }.empty? puts "WARNING: No export_names found in workflow configuration. This may make it tricky to interpret the field-level data. See `export_name` documentation in https://github.com/zooniverse/scribeAPI/wiki/Project-Workflows#tasks" if ! has_export_names @@ -404,17 +419,17 @@ namespace :project do # Rebuild indexes FinalSubjectSet.rebuild_indexes Project.current - (0..count).step(limit).each do |offset| - sets = project.subject_sets.offset(offset).limit(limit).each_with_index do |set, i| + (start..last_index).step(step).each do |offset| + sets = project.subject_sets.offset(offset).limit(step).each_with_index do |set, i| final_set = FinalSubjectSet.assert_for_set set, rebuild built += 1 - ellapsed = Time.now - start + ellapsed = Time.now - start_time per_set = ellapsed / built remaining = per_set * (count - (offset + i+1)) / 60 / 60 complete = (offset + i+1).to_f / count * 100 - $stderr.print "\r#{'%.8f' % complete}% complete. #{'%.1f' % remaining}h remaining. Built #{offset +i+1} of #{count}" + $stderr.print "\r#{'%.8f' % complete}% complete. #{'%.1f' % remaining}h remaining. 
Built item #{offset +i+1} of #{count}" end end From cbc31e9d5ca74682a3f9f0e5c3252e21302bb74a Mon Sep 17 00:00:00 2001 From: Paul Beaudoin Date: Tue, 16 Feb 2016 17:10:57 -0500 Subject: [PATCH 11/23] fixed some classname issues; prevent final-data-build if no export-document-spec configured --- .../components/final-subject-set-page.cjsx | 14 +++++++++----- app/models/export/document_builder.rb | 2 +- app/models/export/spec/document_field.rb | 4 ++-- app/models/final_subject_assertion.rb | 2 +- app/models/final_subject_set.rb | 6 +++++- lib/tasks/project.rake | 7 ++++++- 6 files changed, 24 insertions(+), 11 deletions(-) diff --git a/app/assets/javascripts/components/final-subject-set-page.cjsx b/app/assets/javascripts/components/final-subject-set-page.cjsx index 0acee62a..cb7d0e9f 100644 --- a/app/assets/javascripts/components/final-subject-set-page.cjsx +++ b/app/assets/javascripts/components/final-subject-set-page.cjsx @@ -43,16 +43,20 @@ module.exports = React.createClass Download Raw Data { if @state.set.export_document? && (display_field = @state.set.export_document.export_fields[0])?

    {display_field.name} {display_field.value}

    + else +

    Record {@state.set.id}

    } -
      - { if @state.tabs.indexOf('export-doc') >= 0 -
    • Best Data
    • + { if @state.tabs.length == 2 + } -
    • All Data
    • -
    { if @state.tab == 'export-doc' if @state.set.export_document diff --git a/app/models/export/document_builder.rb b/app/models/export/document_builder.rb index ad323cef..9a7310a9 100644 --- a/app/models/export/document_builder.rb +++ b/app/models/export/document_builder.rb @@ -36,7 +36,7 @@ def best_for_field_spec(spec, base_assertion=nil) def all_for_field_spec(spec, base_assertion=nil) assertions = assertions_for_field_spec(spec, base_assertion).sort_by { |a| - a.confidence } - puts "[Nothing found for #{spec.name}...]" if assertions.blank? + # puts "[Nothing found for #{spec.name}...]" if assertions.blank? return nil if assertions.blank? fields = assertions.map do |assertion| diff --git a/app/models/export/spec/document_field.rb b/app/models/export/spec/document_field.rb index 9021f667..54cd9706 100644 --- a/app/models/export/spec/document_field.rb +++ b/app/models/export/spec/document_field.rb @@ -7,8 +7,8 @@ class Export::Spec::DocumentField field :format_options, type: Hash # e.g. "format_options": {"range": [1850,1950]} field :repeats, type: Boolean embeds_many :sub_fields, class_name: 'Export::Spec::DocumentField' - embedded_in :export_document_spec - embedded_in :export_document_spec_field + embedded_in :export_document_spec, class_name: 'Export::Spec::Document' + embedded_in :export_document_spec_field, class_name: 'Export::Spec::DocumentField' def to_s name + (select.nil? ? '' : " (select: \"#{select}\")") diff --git a/app/models/final_subject_assertion.rb b/app/models/final_subject_assertion.rb index bf85f3a8..592c5d17 100644 --- a/app/models/final_subject_assertion.rb +++ b/app/models/final_subject_assertion.rb @@ -44,7 +44,7 @@ def self.instructions_for_subject(subject, parents) parents.each do |s| next if s.parent_workflow.nil? 
- if s.parent_workflow.name == 'mark' && subject.region[:label] + if s.parent_workflow.name == 'mark' && subject.region && subject.region[:label] ret[s.parent_workflow.name] = subject.region[:label] else diff --git a/app/models/final_subject_set.rb b/app/models/final_subject_set.rb index 0318c92c..2d2c9a9b 100644 --- a/app/models/final_subject_set.rb +++ b/app/models/final_subject_set.rb @@ -63,7 +63,11 @@ def self.export_spec_fields end def build_export_document - self.export_document = Export::Document.from_set self, Project.current.export_document_specs + if ! Project.current.export_document_specs.blank? + self.export_document = Export::Document.from_set self, Project.current.export_document_specs + else + puts "No export_document_specs configured for #{Project.current.title}" + end end def compute_fulltext_terms diff --git a/lib/tasks/project.rake b/lib/tasks/project.rake index 86e182e8..4db684c6 100644 --- a/lib/tasks/project.rake +++ b/lib/tasks/project.rake @@ -204,7 +204,7 @@ namespace :project do def load_export_specs(project, config) project.export_document_specs = config.map do |h| - ExportDocumentSpec.from_hash h, project + Export::Spec::Document.from_hash h, project end end @@ -416,6 +416,11 @@ namespace :project do has_export_names = ! project.workflows.map { |w| w.tasks }.flatten.select { |t| ! t.export_name.blank? }.empty? puts "WARNING: No export_names found in workflow configuration. This may make it tricky to interpret the field-level data. See `export_name` documentation in https://github.com/zooniverse/scribeAPI/wiki/Project-Workflows#tasks" if ! has_export_names + if project.export_document_specs.blank? + puts "No export_spec configured; Add one before building" + exit + end + # Rebuild indexes FinalSubjectSet.rebuild_indexes Project.current From 3423418209bf303c14695c01c03a4ef1ca1a6a8c Mon Sep 17 00:00:00 2001 From: wlla Date: Wed, 17 Feb 2016 14:01:25 -0500 Subject: [PATCH 12/23] Copy updates for about and data page. 
--- project/emigrant/content/about.html.erb | 3 +++ project/emigrant/content/data.html.erb | 31 +++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/project/emigrant/content/about.html.erb b/project/emigrant/content/about.html.erb index cfc8ca84..de3e4191 100644 --- a/project/emigrant/content/about.html.erb +++ b/project/emigrant/content/about.html.erb @@ -3,6 +3,9 @@ ### Contact information Questions? Comments? Contact us at emigrantcity@nypl.org or reach out to us on [Twitter](https://twitter.com/nypl_labs). +### Data +This is an active project and we’re continuing to gather data from the records. Every two weeks, we build a merged, anonymized dump of that data. You can browse or download the entire data set on the Data page. + ### About Emigrant Bank [Emigrant Bank](https://www.emigrant.com/Information/aboutus/AboutUs.jsp) was founded in 1850 by members of the Irish Emigrant society to serve the needs of the Irish immigrant community in New York. In its early history, the bank grew to become the seventh largest bank in the nation, and it made major investments in the growth of New York City by underwriting loans for such important initiatives as the construction of St. Patrickís Cathedral and a public works project that ultimately became Central Park. In 1995, Emigrant Bank generously donated to The New York Public Library [extensive archival records](http://archives.nypl.org/mss/925) that are valuable historical and genealogical resources documenting the lives of immigrant families. The Library microfilmed the entire collection and compiled a [detailed finding aid](http://archives.nypl.org/uploads/documents/documentation/collection_1837_mss925-extra.pdf) that outlines the full scope of the Emigrant Savings Bank records. This heavily trafficked collection is housed in the [Manuscripts and Archives Division](http://www.nypl.org/locations/divisions/manuscripts-division). 
Though, users primarily encounter it through the [Irma and Paul Milstein Division of United States History, Local History and Genealogy](http://www.nypl.org/locations/divisions/milstein). diff --git a/project/emigrant/content/data.html.erb b/project/emigrant/content/data.html.erb index d711efd3..7caf415b 100644 --- a/project/emigrant/content/data.html.erb +++ b/project/emigrant/content/data.html.erb @@ -1,8 +1,35 @@ ## Data exports -With help from volunteers like yourself, we are extracting structured, building-level information from about 6,400 mortgages contained in the Emigrant Savings Bank Records at The New York Public Library. Of course, the ultimate goal is to make this data publicly available. Having a keyword-searchable and structured index of names and mortgage details will of great use to genealogists, historians, digital humanities researchers, and others interested in exploring historical data sources. +Participants have made 481,562 contributions to Emigrant City to date. This project periodically builds a merged, anonymized dump of that data, which is made public here. This is an active project and we’re continuing to gather data from the records. The data made available here is refreshed every two weeks. -After enough transcriptions are made, our team will determine the best way to get the data to you in an easy and accessible way. In the meantime, feel free to explore the collection assets themselves on the Library's [Digital Collections](http://digitalcollections.nypl.org/collections/emigrant-savings-bank-records) website. In this project, we're working with the subset of these digitized materials, the volumes containing Mortgage and Bond records. The volumes appearing in Emigrant City are: +### Browse + +Preview the data by searching by keyword below. You may restrict your search by field. See below for Tips & Tricks for browsing the dataset and records. + + +### Download +Use the button in the upper-right to download the entire data set. 
This is a large dataset in JSON format containing all the assertions, confidence ratings, and fields for the roughly 6,400 records in Emigrant City. For help interpreting the data, see Scribe WIKI on Data Exports. We are actively gathering contributions to the data set and it is refreshed every two weeks. The last dump was February 16, 2016. Use the RSS button in the upper-right to stay updated as we refresh the data. + +### Tips & Tricks +* **Use quotation marks to search for full phrases.** +* **Capitalization doesn’t matter in search.** +* **Why are there field repetitions in this record? Why is a field missing from this record?** + You may notice that for some records, fields appear multiple times. Sometimes multiple different fields are associated with one record. For example, [Record 3758](http://emigrantcity.nypl.org/#/data/exports/56b109ac7061755afbfdbb00?keyword="Eunice R. Waterbury"&field=undefined) has two mortgagors listed: "Eunice R. Waterbury" and "Eloise B. Crothers." + +* **What fields appear?** +This project is intended to create an index to enable further discovery. With this project, the goal is to create an index of the bond and mortgage records to enable further discovery and use. We’ve worked with experts on the material to scope the data collected to fields which appear with greater regularity across documents: Record Date, Record Number, Mortgager Name, Street Address, Amount Loaned, Valuation, and Dimension & Description. Collecting and verifying these fields from the records creates a resource that opens up the search of these materials. However, the data being gathered is not exhaustive. Rather than creating a complete transcription of each record, the transcribed fields create an index to give some toeholds for future reference and querying of these rich materials which previously were largely invisible and difficult to search. + +* **Consider abbreviations.** +We asked users to transcribe exactly what they saw written on the records.
For this reason, abbreviations may have been used. For example, the word Brooklyn may appear as “BKLYN” in one record and “Brooklyn” in another. + +* **What is the confidence field?** +The metadata for these records was created through contributions from many users. A result of this process is that we can gauge how confident we are in each field. + + + + +### Assets +Feel free to explore the collection assets themselves on the Library's [Digital Collections](http://digitalcollections.nypl.org/collections/emigrant-savings-bank-records) website. In this project, we're working with the subset of these digitized materials, the volumes containing Mortgage and Bond records. The volumes appearing in Emigrant City are: * [Bond and Mortgage Record Book 1 (1 to 1,555)](http://digitalcollections.nypl.org/items/df712aa0-00b1-0133-fbd7-58d385a7bbd0) * [Bond and Mortgage Record Book 2 (1,556 to 2, 721)](http://digitalcollections.nypl.org/items/c0c38370-015a-0133-065e-58d385a7bbd0) * [Bond and Mortgage Record Book 3 (2,722 to 3,699)](http://digitalcollections.nypl.org/items/5bb969d0-0241-0133-f196-58d385a7b928) From b097f89664e96fbdd4667f54224a8443c7054faf Mon Sep 17 00:00:00 2001 From: Paul Beaudoin Date: Thu, 18 Feb 2016 12:29:56 -0500 Subject: [PATCH 13/23] adding subject metadata to final-subject-set-page; add option to restrict build-export invocation to certain day of week to overcome heroku inability to schedule weekly; proper pagination for final-subject-set results --- .../components/final-subject-set-browser.cjsx | 41 ++++--- .../components/final-subject-set-page.cjsx | 108 +++++++++++------- .../javascripts/components/pagination.cjsx | 59 ++++++++++ .../final-subject-set-browser.styl | 41 ++++++- .../final_subject_sets_controller.rb | 2 +- app/serializers/generic_result_serializer.rb | 10 +- lib/tasks/project.rake | 10 +- 7 files changed, 204 insertions(+), 67 deletions(-) create mode 100644 app/assets/javascripts/components/pagination.cjsx diff --git
a/app/assets/javascripts/components/final-subject-set-browser.cjsx b/app/assets/javascripts/components/final-subject-set-browser.cjsx index 9b6d59cd..321904a9 100644 --- a/app/assets/javascripts/components/final-subject-set-browser.cjsx +++ b/app/assets/javascripts/components/final-subject-set-browser.cjsx @@ -4,6 +4,7 @@ API = require '../lib/api' Project = require 'models/project.coffee' GenericButton = require('components/buttons/generic-button') LoadingIndicator = require('components/loading-indicator') +Pagination = require('components/pagination') module.exports = React.createClass displayName: 'FinalSubjectSetBrowser' @@ -15,28 +16,28 @@ module.exports = React.createClass selected_field: @props.query.field searched_query: {} fetching_keyword: null - current_page: 1 + current_page: @props.query.page ? 1 more_pages: false results: [] project: null componentDidMount: -> - @checkKeyword() + @checkQueryString() API.type('projects').get().then (result)=> @setState project: new Project(result[0]) componentWillReceiveProps: (new_props) -> - @checkKeyword new_props + @checkQueryString new_props - checkKeyword: (props = @props) -> + checkQueryString: (props = @props) -> if props.query.keyword - @fetch({keyword: props.query.keyword, field: props.query.field}) + @fetch({keyword: props.query.keyword, field: props.query.field}, props.query.page) fetch: (query, page = 1) -> return if ! 
@isMounted() - if query.keyword != @state.fetching_keyword || query.field != @state.selected_field + if query.keyword != @state.searched_keyword || query.field != @state.selected_field || @props.current_page != page results = @state.results results = [] if @state.searched_query?.keyword != query.keyword @@ -49,12 +50,8 @@ module.exports = React.createClass page: @state.fetching_page API.type('final_subject_sets').get(params).then (sets) => - results = @state.results - offset = (@state.fetching_page-1) * per_page - for s,i in sets - results[i + offset] = s @setState - results: results + results: sets searched_query: keyword: @props.query.keyword field: @props.query.field @@ -85,6 +82,15 @@ module.exports = React.createClass @setState selected_field: e.target.value + renderPagination: -> + + renderSearch: ->

    Browse

    @@ -105,7 +111,7 @@ module.exports = React.createClass
    - { if @state.fetching_keyword && @state.fetching_keyword != @state.searched_query?.keyword + { if @state.fetching_keyword else if @state.searched_query?.keyword && @state.results.length == 0 @@ -114,9 +120,10 @@ module.exports = React.createClass else if @state.results.length > 0

    Found {@state.results[0].getMeta('total')} matches

    +
      { for set in @state.results - url = "/#/data/exports/#{set.id}?keyword=#{@state.searched_query.keyword}&field=#{@state.searched_query.field}" + url = "/#/data/exports/#{set.id}?keyword=#{@state.searched_query.keyword}&field=#{@state.searched_query.field ? ''}" matches = [] safe_keyword = (w.replace(/\W/g, "\\$&") for w in @state.searched_query.keyword.toLowerCase().replace(/"/g,'').split(' ')).join("|") @@ -136,7 +143,7 @@ module.exports = React.createClass
    • @@ -152,12 +159,8 @@ module.exports = React.createClass
    • }
    - { if @state.fetching_keyword && @state.fetching_keyword == @state.searched_query?.keyword - - else if @state.more_pages - - } + { @renderPagination() if @state.results.length > 0 }
    }
    diff --git a/app/assets/javascripts/components/final-subject-set-page.cjsx b/app/assets/javascripts/components/final-subject-set-page.cjsx index cb7d0e9f..07a690a5 100644 --- a/app/assets/javascripts/components/final-subject-set-page.cjsx +++ b/app/assets/javascripts/components/final-subject-set-page.cjsx @@ -15,6 +15,7 @@ module.exports = React.createClass API.type("final_subject_sets").get(@props.params.final_subject_set_id).then (set) => tabs = [] tabs.push 'export-doc' if set.export_document + tabs.push 'source-metadata' if set.meta_data tabs.push 'assertions' @setState set: set @@ -38,7 +39,7 @@ module.exports = React.createClass
    - Back + Back Download Raw Data { if @state.set.export_document? && (display_field = @state.set.export_document.export_fields[0])? @@ -49,56 +50,83 @@ module.exports = React.createClass - { if @state.tabs.length == 2 + { if @state.tabs.length > 1 } - { if @state.tab == 'export-doc' - if @state.set.export_document - for field,i in @state.set.export_document.export_fields - if field.assertion_ids - assertion = subject = null - for s in @state.set.subjects - for a in s.assertions - if field.assertion_ids.indexOf(a.id) >= 0 - assertion = a - subject = s - if assertion && subject -
    - -
    + { if @state.tab == 'export-doc' && @state.set.export_document +
    +

    These data points represent numerous individual classifications that have been merged and lightly cleaned up to adhere to {@props.project.title}'s data model.

    + + { for field,i in @state.set.export_document.export_fields + if field.assertion_ids + assertion = subject = null + for s in @state.set.subjects + for a in s.assertions + if field.assertion_ids.indexOf(a.id) >= 0 + assertion = a + subject = s + if assertion && subject +
    + +
    + } +
    } - { if @state.tab == 'assertions' -
      - { for subject in @state.set.subjects -
    • -
        - { - # Sort assertions by order they appear in document: - assertions = subject.assertions.sort (a1,a2) -> - if a1.region.y < a2.region.y - -1 - else - 1 - null - } - { for assertion,i in assertions when assertion.name -
      • - -
      • - } -
      - -
    • - } -
    +
    +

    These data points represent all distinct assertions made upon this {@props.project.term('subject set')} - without cleanup. Each assertion may represent several distinct contributions.

    +
      + { for subject in @state.set.subjects +
    • +
        + { + # Sort assertions by order they appear in document: + assertions = subject.assertions.sort (a1,a2) -> + if a1.region.y < a2.region.y + -1 + else + 1 + null + } + { for assertion,i in assertions when assertion.name +
      • + +
      • + } +
      + +
    • + } +
    +
    + } + + { if @state.tab == 'source-metadata' + +
    +

    This metadata was imported alongside the source images at the beginning of the project and may include high res source URIs and processing details.

    + +
    + { for k,v of @state.set.meta_data +
    +
    {k.split('_').map( (v) => v.capitalize() ).join(' ')}
    + { if v.match(/https?:\/\//) +
    {v}
    + else +
    {v}
    + } +
    + } +
    +
    }
    diff --git a/app/assets/javascripts/components/pagination.cjsx b/app/assets/javascripts/components/pagination.cjsx new file mode 100644 index 00000000..3b896132 --- /dev/null +++ b/app/assets/javascripts/components/pagination.cjsx @@ -0,0 +1,59 @@ +React = require 'react' + +module.exports = React.createClass + displayName: 'Pagination' + + getDefaultProps: -> + max_links: 12 + + pageUrl: (page) -> + base = location.href.replace /(&|\?)page=[^&]+/, '' + "#{base}#{if base.indexOf("?") >= 0 then '&' else '?'}page=#{page}" + + render: -> + # Build array of page numbers to show.. + + pages = [] + if @props.total_pages <= @props.max_links + # If fewer pages than max, show them all: + pages = [1..@props.total_pages] + + else + # Too many to show, so truncate.. + # Assuming we want three groups of truncated links (first few, last few, + # and a middle group centered around current page).. + chunk_size = @props.max_links / 3 - 1 + for p in [1..@props.total_pages] + # Add first few pages: + pages.push p if p <= chunk_size + # Add a middle group of pages around the current page: + pages.push p if Math.abs(@props.current_page - p) <= chunk_size/2 && pages.indexOf(p)<0 + # Bookend with last few pages: + pages.push p if p > @props.total_pages - chunk_size && pages.indexOf(p)<0 + + page_links = [] + + # Add leading < link + page_links.push({label: "<", page: @props.prev_page, title: "Previous", disabled: false}) if @props.prev_page + + for page,i in pages + # Add divider if this page is the beginning of a chunk: + page_links.push({dotdotdot: true}) if i > 0 && pages[i-1] != page-1 + # Add page link: + page_links.push({label: page, page: page, title: "Page #{page}", disabled: page == @props.current_page}) + + # Add final > link + page_links.push({label: ">", page: @props.next_page, title: "Next", disabled: false}) if @props.next_page? + +
      + { for link, i in page_links + if link.dotdotdot? +
    • + + else if link.disabled +
    • + + else +
    • + } +
    diff --git a/app/assets/stylesheets/final-subject-set-browser.styl b/app/assets/stylesheets/final-subject-set-browser.styl index baa27fb3..572dc434 100644 --- a/app/assets/stylesheets/final-subject-set-browser.styl +++ b/app/assets/stylesheets/final-subject-set-browser.styl @@ -1,5 +1,29 @@ .final-subject-set-browser + .pagination + list-style none + + li + display inline-block + margin-right 0.5em + line-height 1.2em + + a, span + @extends .standard-button + font-weight bold + color white + padding 0.2em 0.6em + + a + text-decoration none + + &.disabled + opacity 0.5 + + &.divider:after + content "..." + + .tabs padding 0 margin 0 @@ -11,8 +35,8 @@ display inline-block border solid #bbb border-width 1px 1px 0 1px - margin 0 20px 0 0 - font-size 1.5em + margin 0 0 0 10px + font-size 1.4em border-radius 5px 5px 0 0 a @@ -21,13 +45,11 @@ color gray &.active - background-color rgba(255,255,255,0.50) + background-color rgba(255,255,255,0.30) a font-weight bold color TERTIARY_NORMAL - - &.page-content h2 @@ -199,4 +221,13 @@ &:before content "<" + dl.source-metadata + dl + margin-top 10px + + dt + font-size 20px + dd + margin-left 0 + margin-bottom 10px diff --git a/app/controllers/final_subject_sets_controller.rb b/app/controllers/final_subject_sets_controller.rb index ddc9bf2f..b91d9409 100644 --- a/app/controllers/final_subject_sets_controller.rb +++ b/app/controllers/final_subject_sets_controller.rb @@ -47,7 +47,7 @@ def index @sets = @sets.where({"$text" => {"$search" => keyword} } ) if keyword end - respond_with GenericResultSerializer.new(@sets) + respond_with GenericResultSerializer.new(@sets).serializable_hash base_url: request.url end def parse_range(values, format) diff --git a/app/serializers/generic_result_serializer.rb b/app/serializers/generic_result_serializer.rb index b9bda015..f945e2f7 100644 --- a/app/serializers/generic_result_serializer.rb +++ b/app/serializers/generic_result_serializer.rb @@ -39,6 +39,14 @@ def meta end def links - 
serialization_options[:links] + m = {} + if serialization_options[:base_url] + base_url,query = serialization_options[:base_url].split '?' + query = Rack::Utils.parse_nested_query query + m[:next_page_uri] = "#{base_url}?#{query.merge({"page" => object.next_page}).to_query}" if object.next_page + m[:prev_page_uri] = "#{base_url}?#{query.merge({"page" => object.prev_page}).to_query}" if object.prev_page + end + m.merge! serialization_options[:links] if serialization_options[:links] + m end end diff --git a/lib/tasks/project.rake b/lib/tasks/project.rake index 4db684c6..3181767d 100644 --- a/lib/tasks/project.rake +++ b/lib/tasks/project.rake @@ -522,7 +522,15 @@ namespace :project do end desc "Convenience method that, in one call, builds all data JSONs and zips them up into a single ZIP release" - task :build_and_export_final_data, [:project_key, :rebuild] => :environment do |task, args| + task :build_and_export_final_data, [:project_key, :rebuild, :ensure_day_of_week_is] => :environment do |task, args| + # If ensure_day_of_week_is given, proceed with execution only if weekday matches value + # (Important for heroku scheduler, which can schedule daily but not weekly) + if ! args[:ensure_day_of_week_is].blank? 
+ if Date.today.strftime("%A").downcase != args[:ensure_day_of_week_is].downcase + puts "Aborting because today is not #{args[:ensure_day_of_week_is]}" + exit + end + end Rake::Task['project:build_final_data'].invoke(args[:project_key], args[:rebuild]) Rake::Task['project:export_final_data'].invoke(args[:project_key]) end From 2b0121d6bc7c618f418a98ac21049b4502ffb6f9 Mon Sep 17 00:00:00 2001 From: Paul Beaudoin Date: Mon, 22 Feb 2016 14:16:32 -0500 Subject: [PATCH 14/23] Adding better generic styling for finalsubjectset browsing; new fetch-project-mixin for components; new genericpage component to support configurable left nav and custom content data hooks --- .../javascripts/components/app-router.cjsx | 49 +++-- .../components/final-subject-set-browser.cjsx | 68 +++---- .../final-subject-set-download.cjsx | 39 ++++ .../components/final-subject-set-page.cjsx | 184 +++++++++--------- .../javascripts/components/generic-page.cjsx | 75 +++++++ .../javascripts/lib/fetch-project-mixin.cjsx | 9 + app/assets/stylesheets/common.styl | 39 ++++ .../final-subject-set-browser.styl | 2 +- app/controllers/projects_controller.rb | 5 + app/models/project.rb | 1 + app/models/subject.rb | 1 + app/serializers/project_serializer.rb | 10 +- config/routes.rb | 1 + lib/tasks/project.rake | 112 +++++++---- package.json | 1 + project/emigrant/assets/css/styles.css | 13 +- project/emigrant/content/data.html.erb | 47 ----- project/emigrant/content/pages/data.md | 22 +++ project/emigrant/content/pages/data/_nav.md | 4 + .../emigrant/content/pages/data/download.md | 11 ++ project/emigrant/content/pages/data/tips.md | 17 ++ project/emigrant/scripts/query_subjects.rb | 7 +- 22 files changed, 466 insertions(+), 251 deletions(-) create mode 100644 app/assets/javascripts/components/final-subject-set-download.cjsx create mode 100644 app/assets/javascripts/components/generic-page.cjsx create mode 100644 app/assets/javascripts/lib/fetch-project-mixin.cjsx delete mode 100644 
project/emigrant/content/data.html.erb create mode 100644 project/emigrant/content/pages/data.md create mode 100644 project/emigrant/content/pages/data/_nav.md create mode 100644 project/emigrant/content/pages/data/download.md create mode 100644 project/emigrant/content/pages/data/tips.md diff --git a/app/assets/javascripts/components/app-router.cjsx b/app/assets/javascripts/components/app-router.cjsx index 2ee12f02..e534e562 100644 --- a/app/assets/javascripts/components/app-router.cjsx +++ b/app/assets/javascripts/components/app-router.cjsx @@ -13,6 +13,8 @@ GroupPage = require './group-page' GroupBrowser = require './group-browser' FinalSubjectSetBrowser = require './final-subject-set-browser' FinalSubjectSetPage = require './final-subject-set-page' +FinalSubjectSetDownload = require './final-subject-set-download' +GenericPage = require './generic-page' Project = require 'models/project.coffee' @@ -77,28 +79,34 @@ class AppRouter /> } + + { # Project-configured pages: + project.pages?.map (page, key) => + + } + { if project.downloadable_data } - - { # Project-configured pages: - project.pages?.map (page, key) => - - } + pattern = new RegExp('#/[A-z]*#(.*)') selectedID = "#{window.location.hash}".match(pattern) @@ -157,21 +164,11 @@ class AppRouter active: false heightStyle: "content" - navToggle:(e)-> - render: -> formatted_name = page.name.replace("_", " ") -
    -

    {formatted_name}

    -
    - { - if page.group_browser? && page.group_browser != '' -
    - -
    - } -
    Last Update {page.updated_at}
    -
    + base_key = page.key.split('/')[0] + nav = project.page_navs[base_key] + module.exports = AppRouter window.React = React diff --git a/app/assets/javascripts/components/final-subject-set-browser.cjsx b/app/assets/javascripts/components/final-subject-set-browser.cjsx index 321904a9..b952ba90 100644 --- a/app/assets/javascripts/components/final-subject-set-browser.cjsx +++ b/app/assets/javascripts/components/final-subject-set-browser.cjsx @@ -5,11 +5,13 @@ Project = require 'models/project.coffee' GenericButton = require('components/buttons/generic-button') LoadingIndicator = require('components/loading-indicator') Pagination = require('components/pagination') +GenericPage = require './generic-page' +FetchProjectMixin = require 'lib/fetch-project-mixin' module.exports = React.createClass displayName: 'FinalSubjectSetBrowser' - mixins: [Navigation] + mixins: [Navigation, FetchProjectMixin] getInitialState:-> entered_keyword: @props.query.keyword @@ -24,9 +26,6 @@ module.exports = React.createClass componentDidMount: -> @checkQueryString() - API.type('projects').get().then (result)=> - @setState project: new Project(result[0]) - componentWillReceiveProps: (new_props) -> @checkQueryString new_props @@ -93,8 +92,6 @@ module.exports = React.createClass renderSearch: ->
    -

    Browse

    -

    Preview the data by searching by keyword below:

    { if @state.project.export_document_specs?[0]?.spec_fields @@ -123,7 +120,7 @@ module.exports = React.createClass
      { for set in @state.results - url = "/#/data/exports/#{set.id}?keyword=#{@state.searched_query.keyword}&field=#{@state.searched_query.field ? ''}" + url = "/#/data/browse/#{set.id}?keyword=#{@state.searched_query.keyword}&field=#{@state.searched_query.field ? ''}" matches = [] safe_keyword = (w.replace(/\W/g, "\\$&") for w in @state.searched_query.keyword.toLowerCase().replace(/"/g,'').split(' ')).join("|") @@ -165,54 +162,33 @@ module.exports = React.createClass }
    - renderDownloadCopy: -> -
    render: -> return null if ! @state.project? -
    -

    Data Exports

    + data_nav = @state.project.page_navs['data'] + +
    - { if ! @state.project.downloadable_data -
    -

    Data Exports Not Available

    -

    Sorry, but public data exports are not enabled for this project yet.

    -
    - - else -
    - { if @state.project.latest_export? - - - else -

    Participants have made {@state.project.classification_count.toLocaleString()} contributions to {@state.project.title} to date. This project periodically builds a merged, anonymized snapshot of that data, which can be browsed here.

    - } +

    Browse

    - { if ! @state.searched_query?.keyword -

    Participants have made {@state.project.classification_count.toLocaleString()} contributions to {@state.project.title} to date. This project periodically builds a merged, anonymized dump of that data, which is made public here.

    - } + { if ! @state.project.downloadable_data +
    +

    Data Exports Not Available

    +

    Sorry, but public data exports are not enabled for this project yet.

    +
    + + else +
    + { if ! @state.searched_query?.keyword +

    Participants have made {@state.project.classification_count.toLocaleString()} contributions to {@state.project.title} to date. This project periodically builds a merged, anonymized dump of that data, which is made public here.

    + } - { @renderSearch() } + { @renderSearch() } - { if ! @state.searched_query?.keyword - @renderDownloadCopy() - } - -
    +
    }
    +
    diff --git a/app/assets/javascripts/components/final-subject-set-download.cjsx b/app/assets/javascripts/components/final-subject-set-download.cjsx new file mode 100644 index 00000000..059803a9 --- /dev/null +++ b/app/assets/javascripts/components/final-subject-set-download.cjsx @@ -0,0 +1,39 @@ +React = require 'react' +FetchProjectMixin = require 'lib/fetch-project-mixin' +GenericPage = require './generic-page' + +module.exports = React.createClass + displayName: 'FinalSubjectSetDownload' + + mixins: [FetchProjectMixin] + + getInitialState: -> + project: null + + render: -> + return null if ! @state.project? + + data_nav = @state.project.page_navs['data'] + + + { if ! @state.project.downloadable_data +
    +

    Data Exports Not Available

    +

    Sorry, but public data exports are not enabled for this project yet.

    +
    + else +
    +

    Download

    + +

    Participants have made {@state.project.classification_count.toLocaleString()} contributions to {@state.project.title} to date. This project periodically builds a merged, anonymized dump of that data, which is made public here.

    + +

    Download Latest Raw Data

    + +

    For help interpreting the data, see Scribe WIKI on Data Exports.

    + +

    To browse past releases and/or to be notified when new releases are made, you may wish to subscribe to the ATOM Feed of Data Releases

    + +
    + } +
    + diff --git a/app/assets/javascripts/components/final-subject-set-page.cjsx b/app/assets/javascripts/components/final-subject-set-page.cjsx index 07a690a5..113ac6a7 100644 --- a/app/assets/javascripts/components/final-subject-set-page.cjsx +++ b/app/assets/javascripts/components/final-subject-set-page.cjsx @@ -1,10 +1,15 @@ -React = require 'react' -API = require '../lib/api' +React = require 'react' +API = require '../lib/api' +GenericPage = require './generic-page' +FetchProjectMixin = require 'lib/fetch-project-mixin' + +FinalSubjectAssertion = require('components/final-subject-assertion') -FinalSubjectAssertion = require('components/final-subject-assertion') module.exports = React.createClass displayName: 'FinalSubjectSetPage' + + mixins: [FetchProjectMixin] getInitialState:-> set: null @@ -35,99 +40,102 @@ module.exports = React.createClass render: -> return null if ! @state.set -
    - -
    + data_nav = @state.project.page_navs['data'] - Back + +
    +
    - Download Raw Data - { if @state.set.export_document? && (display_field = @state.set.export_document.export_fields[0])? -

    {display_field.name} {display_field.value}

    - else -

    Record {@state.set.id}

    - } + Back - + Download Raw Data + { if @state.set.export_document? && (display_field = @state.set.export_document.export_fields[0])? +

    {display_field.name} {display_field.value}

    + else +

    Record {@state.set.id}

    + } - { if @state.tabs.length > 1 - - } + - { if @state.tab == 'export-doc' && @state.set.export_document -
    -

    These data points represent numerous individual classifications that have been merged and lightly cleaned up to adhere to {@props.project.title}'s data model.

    - - { for field,i in @state.set.export_document.export_fields - if field.assertion_ids - assertion = subject = null - for s in @state.set.subjects - for a in s.assertions - if field.assertion_ids.indexOf(a.id) >= 0 - assertion = a - subject = s - if assertion && subject -
    - -
    + { if @state.tabs.length > 1 +
      + { if @state.tabs.indexOf('export-doc') >= 0 +
    • Best Data
    • } -
    - } +
  • All Data
  • +
  • @showTab('source-metadata')}>Source Metadata
  • +
+ } + + { if @state.tab == 'export-doc' && @state.set.export_document +
+

These data points represent numerous individual classifications that have been merged and lightly cleaned up to adhere to {@props.project.title}'s data model.

+ + { for field,i in @state.set.export_document.export_fields + if field.assertion_ids + assertion = subject = null + for s in @state.set.subjects + for a in s.assertions + if field.assertion_ids.indexOf(a.id) >= 0 + assertion = a + subject = s + if assertion && subject +
+ +
+ } +
+ } + + { if @state.tab == 'assertions' +
+

These data points represent all distinct assertions made upon this {@props.project.term('subject set')} - without cleanup. Each assertion may represent several distinct contributions.

+
    + { for subject in @state.set.subjects +
  • +
      + { + # Sort assertions by order they appear in document: + assertions = subject.assertions.sort (a1,a2) -> + if a1.region.y < a2.region.y + -1 + else + 1 + null + } + { for assertion,i in assertions when assertion.name +
    • + +
    • + } +
    + +
  • + } +
+
+ } + + { if @state.tab == 'source-metadata' - { if @state.tab == 'assertions'
-

These data points represent all distinct assertions made upon this {@props.project.term('subject set')} - without cleanup. Each assertion may represent several distinct contributions.

-
    - { for subject in @state.set.subjects -
  • -
      - { - # Sort assertions by order they appear in document: - assertions = subject.assertions.sort (a1,a2) -> - if a1.region.y < a2.region.y - -1 - else - 1 - null - } - { for assertion,i in assertions when assertion.name -
    • - -
    • - } -
    - -
  • - } -
+

This metadata was imported alongside the source images at the beginning of the project and may include high res source URIs and processing details.

+ +
+ { for k,v of @state.set.meta_data +
+
{k.split('_').map( (v) => v.capitalize() ).join(' ')}
+ { if v.match(/https?:\/\//) +
{v}
+ else +
{v}
+ } +
+ } +
- } - - { if @state.tab == 'source-metadata' - -
-

This metadata was imported alongside the source images at the beginning of the project and may include high res source URIs and processing details.

- -
- { for k,v of @state.set.meta_data -
-
{k.split('_').map( (v) => v.capitalize() ).join(' ')}
- { if v.match(/https?:\/\//) -
{v}
- else -
{v}
- } -
- } -
-
- } + } +
-
+ diff --git a/app/assets/javascripts/components/generic-page.cjsx b/app/assets/javascripts/components/generic-page.cjsx new file mode 100644 index 00000000..077c62ab --- /dev/null +++ b/app/assets/javascripts/components/generic-page.cjsx @@ -0,0 +1,75 @@ +React = require("react") +FetchProjectMixin = require 'lib/fetch-project-mixin' + +module.exports = React.createClass + displayName: "GenericPage" + + mixins: [FetchProjectMixin] + + getInitialState: -> + project: null + + getDefaultProps: -> + key: null + title: null + content: null + nav: null + footer: null + current_nav: location.hash + + propTypes: + title: React.PropTypes.string + content: React.PropTypes.string + nav: React.PropTypes.string + footer: React.PropTypes.string + + # Returns true if given nav link href appears to link to this page + isCurrentNavLink: (href) -> + # Known limitation: This will will assume equivalency of two URLs that don't have hashes + # But use of the nav assumes hashes. A nav item really shouldn't link to a different domain/ctrl endpoint + href.replace(/.*#/, '') == @props.current_nav.replace(/.*#/,'') + + componentDidMount: -> + # Find nav link matching @props.current_nav + matching = (el for el in $(React.findDOMNode(this)).find('.custom-page-nav li a') when @isCurrentNavLink($(el).attr('href')) ) + $(matching[0]).parent('li').addClass('current') if matching.length > 0 + + htmlContent: -> + content = @props.content + + replacements = + "project.classification_count": @state.project?.classification_count ? '__' + "project.latest_export.created_at": @state.project?.latest_export?.created_at ? '__' + "project.root_subjects_count": @state.project?.root_subjects_count ? '__' + "project.title": @state.project?.title ? 
'__' + + for pattern, replacement of replacements + pattern = new RegExp("{{#{pattern}}}", 'gi') + + # assume, if it's an int, we want to comma format it: + if typeof(replacement) == 'number' + replacement = replacement.toLocaleString() + # If it's a date, parse it and make it human: + if replacement.match /^\d{4}-\d{2}/ + replacement = moment(replacement, moment.ISO_8601).calendar() + + content = content.replace pattern, replacement + + marked(content) + + render: -> + +
+

{@props.title}

+
+ { if @props.nav +
+ } + { if @props.content? +
+ } + { @props.children if @props.children? } +
+
{@props.footer}
+
+ diff --git a/app/assets/javascripts/lib/fetch-project-mixin.cjsx b/app/assets/javascripts/lib/fetch-project-mixin.cjsx new file mode 100644 index 00000000..c200d63f --- /dev/null +++ b/app/assets/javascripts/lib/fetch-project-mixin.cjsx @@ -0,0 +1,9 @@ +API = require './api' +Project = require 'models/project.coffee' + +module.exports = + componentDidMount: -> + API.type('projects').get('current').then (result) => + @setState project: new Project(result) + + diff --git a/app/assets/stylesheets/common.styl b/app/assets/stylesheets/common.styl index d4af2d8a..dfdf334b 100644 --- a/app/assets/stylesheets/common.styl +++ b/app/assets/stylesheets/common.styl @@ -292,3 +292,42 @@ div.home-page margin-bottom 50px overflow: hidden min-height 800px + + +.custom-page-inner-wrapper + flexbox(flex) + min-height 300px + // overflow hidden // just to clear floats + +.custom-page-nav + flex(grow: 1) + min-width 200px + + ul + list-style none + padding 0 + + li + font-size 16px + line-height 16px + margin-left 0 + margin-bottom 15px + padding-left 15px + border-left 2px solid transparent + + a + color SECONDARY_NORMAL + text-decoration none + + &:hover + color SECONDARY_HOVER + + &.current + font-weight bold + border-left 2px solid MAIN_HIGHLIGHT + + + +custom-page-body + overflow hidden + flex(grow: 10) diff --git a/app/assets/stylesheets/final-subject-set-browser.styl b/app/assets/stylesheets/final-subject-set-browser.styl index 572dc434..a7a00a3b 100644 --- a/app/assets/stylesheets/final-subject-set-browser.styl +++ b/app/assets/stylesheets/final-subject-set-browser.styl @@ -219,7 +219,7 @@ text-decoration: none; &:before - content "<" + content "< " dl.source-metadata dl diff --git a/app/controllers/projects_controller.rb b/app/controllers/projects_controller.rb index 6a7c7072..76fe0f48 100644 --- a/app/controllers/projects_controller.rb +++ b/app/controllers/projects_controller.rb @@ -3,7 +3,12 @@ class ProjectsController < ApplicationController caches_action :index, 
:cache_path => "projects/index" + # TODO deprecate this nonsensical usage. Use /projects/current instead def index + current + end + + def current respond_with Project.current end diff --git a/app/models/project.rb b/app/models/project.rb index 87cea299..4f97f036 100644 --- a/app/models/project.rb +++ b/app/models/project.rb @@ -16,6 +16,7 @@ class Project field :scientists, type: Array, default: [] field :developers, type: Array, default: [] field :pages, type: Array, default: [] + field :page_navs, type: Hash, default: {} field :menus, type: Hash, default: {} field :partials, type: Hash, default: {} field :logo, type: String, default: nil diff --git a/app/models/subject.rb b/app/models/subject.rb index 1016da8c..e6f726a6 100644 --- a/app/models/subject.rb +++ b/app/models/subject.rb @@ -57,6 +57,7 @@ class Subject belongs_to :group belongs_to :parent_subject, :class_name => "Subject", :foreign_key => "parent_subject_id" belongs_to :subject_set, :class_name => "SubjectSet", :foreign_key => "subject_set_id" + belongs_to :project has_many :child_subjects, :class_name => "Subject" has_many :classifications, inverse_of: :subject diff --git a/app/serializers/project_serializer.rb b/app/serializers/project_serializer.rb index 4b6f50a3..a0319e60 100644 --- a/app/serializers/project_serializer.rb +++ b/app/serializers/project_serializer.rb @@ -1,6 +1,6 @@ class ProjectSerializer < ActiveModel::MongoidSerializer - attributes :id, :title, :short_title, :summary, :home_page_content, :organizations , :team, :pages, :menus, :partials, :logo, :background, :workflows, :forum, :tutorial, :feedback_form_url, :metadata_search, :terms_map, :blog_url, :discuss_url, :privacy_policy, :downloadable_data, :latest_export - attributes :classification_count + attributes :id, :title, :short_title, :summary, :home_page_content, :organizations , :team, :pages, :page_navs, :menus, :partials, :logo, :background, :workflows, :forum, :tutorial, :feedback_form_url, :metadata_search, :terms_map, 
:blog_url, :discuss_url, :privacy_policy, :downloadable_data, :latest_export + attributes :classification_count, :root_subjects_count has_many :workflows has_many :export_document_specs @@ -10,9 +10,15 @@ def latest_export end def classification_count + # TODO: This should be scoped to project, but Classification#project_id doesn't exist Classification.count end + def root_subjects_count + # TODO: This too should be scoped to project, but Subject#project_id doesn't exist... + Subject.root.count + end + def id object._id.to_s end diff --git a/config/routes.rb b/config/routes.rb index bb9148bc..eb8162aa 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -8,6 +8,7 @@ get '/projects', to: 'projects#index', defaults: { format: 'json' } + get '/projects/current', to: 'projects#current', defaults: { format: 'json' } get '/workflows', to: 'workflow#index', defaults: { format: 'json' } get '/workflows/:id', to: 'workflow#show', defaults: { format: 'json' } diff --git a/lib/tasks/project.rake b/lib/tasks/project.rake index 3181767d..faeb4898 100644 --- a/lib/tasks/project.rake +++ b/lib/tasks/project.rake @@ -136,46 +136,24 @@ namespace :project do content_path = Rails.root.join('project', project_key, 'content') puts "Loading pages from #{content_path}:" - prev_pages = project.pages project.pages = [] - Dir.foreach(content_path).each do |file| - path = Rails.root.join content_path, file - next if File.directory? path - next if ! ['.html','.erb','.md'].include? path.extname - ext = path.extname - page_key = file.split('.').first - name = page_key.capitalize - content = File.read path - - puts " Loading page: \"#{name}\" (#{content.size}b)" - if page_key == 'home' - project.home_page_content = content - + # Dir.foreach(content_path).each do |file| + # path = Rails.root.join content_path, file + # next if File.directory? path + # next if ! ['.html','.erb','.md'].include? 
path.extname + + # Load legacy pages from content folder directly: + Dir.glob("#{content_path}/*.{erb,html,md}").each do |path| + load_page project, path + end + + # Also load anything inside content/pages: + Dir.glob("#{content_path}/pages/*").each do |path| + if File.directory?(path) + load_page_group project, path else - # Set updated at if content changed: - updated_at = Time.now - if ! prev_pages.nil? && ! prev_pages.empty? - previous_page = prev_pages.select { |p| p[:key] == page_key } - if ! previous_page.empty? && (previous_page = previous_page.first) - updated_at = ! previous_page[:updated_at].nil? && previous_page[:content] == content ? previous_page[:updated_at] : Time.now - end - end - - # Check if we should include group browser content - group_match = //.match(content) - group_browser = '' - if group_match && !group_match.captures.empty? - group_browser = group_match.captures[0] - end - - project.pages << { - key: page_key, - name: name, - content: content, - updated_at: updated_at, - group_browser: group_browser - } + load_page project, path end end @@ -202,6 +180,66 @@ namespace :project do project end + def load_page_group(project, path) + base_key = File.basename path + + nav_content = nil + nav_path = File.join(path, "_nav.md") + if File.exist?(nav_path) + nav_content = File.read nav_path + puts "got nav: #{nav_content}" + end + + Dir.glob("#{path}/*.{erb,html,md}").each do |path| + load_page project, path, {base_key: base_key, nav: nav_content} unless File.basename(path).match(/^_/) + end + end + + def load_page(project, path, options = {}) + filename = File.basename path + + page_key = filename.split('.').first + name = page_key.capitalize + name = "#{options[:base_key].capitalize} | #{name}" if options[:base_key] + content = File.read path + + if page_key == 'home' + project.home_page_content = content + + else + # Set updated at if content changed: + updated_at = Time.now + if ! project.pages.nil? && ! project.pages.empty? 
+ previous_page = project.pages.select { |p| p[:key] == page_key } + if ! previous_page.empty? && (previous_page = previous_page.first) + updated_at = ! previous_page[:updated_at].nil? && previous_page[:content] == content ? previous_page[:updated_at] : Time.now + end + end + + # PB 20160219 deprecating this cause doesn't appear in use + # Check if we should include group browser content + # group_match = //.match(content) + # group_browser = '' + # if group_match && !group_match.captures.empty? + # group_browser = group_match.captures[0] + # end + + # Place page nav in special page_navs hash by base key: + project.page_navs = {} if options[:nav] + project.page_navs[options[:base_key]] = options[:nav] if options[:nav] + + project.pages << { + key: ( options[:base_key].nil? ? '' : "#{options[:base_key]}/" ) + page_key, + name: name, + content: content, + updated_at: updated_at + # group_browser: group_browser + } + end + puts " Loaded page: \"#{options[:base_key]}/#{name}\" (#{content.size}b)" + + end + def load_export_specs(project, config) project.export_document_specs = config.map do |h| Export::Spec::Document.from_hash h, project diff --git a/package.json b/package.json index e08165df..060c3ceb 100644 --- a/package.json +++ b/package.json @@ -15,6 +15,7 @@ "coffee-reactify": "^4.0.0", "coffee-script": "^1.9.3", "json-api-client": "^0.4.4", + "marked": "^0.3.5", "normalize-styl": "^3.0.3", "normalize.css": "^3.0.3", "react": "^0.13.3", diff --git a/project/emigrant/assets/css/styles.css b/project/emigrant/assets/css/styles.css index 44c2582c..cbd6bc0a 100644 --- a/project/emigrant/assets/css/styles.css +++ b/project/emigrant/assets/css/styles.css @@ -111,7 +111,7 @@ html, body, background: rgba(255,255,255,0.85); padding: 20px 40px; } -.page-content.custom-page > div:nth-child(2) > *:first-child { +.page-content.custom-page h2 { color: #fff; background: #3f5765; margin-top: -20px; @@ -120,7 +120,17 @@ html, body, padding: 40px 0; border-top-left-radius: 6px; 
border-top-right-radius: 6px; + text-align: center; } + +.page-content.custom-page .with-nav h2 { + background: transparent; + color: #3f5765; + text-align: left; + margin: 0; + padding: 10px 0 0 0; +} + .page-content > div.updated-at { border-top-left-radius: 0; border-top-right-radius: 0; @@ -137,7 +147,6 @@ html, body, .page-content h2 { font-size: 36px; font-weight: 400; - text-align: center; } .page-content h3 { font-size: 28px; diff --git a/project/emigrant/content/data.html.erb b/project/emigrant/content/data.html.erb deleted file mode 100644 index 7caf415b..00000000 --- a/project/emigrant/content/data.html.erb +++ /dev/null @@ -1,47 +0,0 @@ -## Data exports - -Participants have made 481,562 contributions to Emigrant City to date. This project periodically builds a merged, anonymized dump of that data, which is made public here. This is an active project and we’re continuing to gather data from the records. The data made available here is refreshed every two weeks. - -### Browse - -Preview the data by searching by keyword below. You may restrict your search by field. See below for Tips & Tricks for browsing the dataset and records. - - -### Download -Use the button in the upper-right to download the entire data set. This is a large dataset in json format containing all the assertions, confidence ratings, and fields for the roughly 6,400 records in Emigrant City. For help interpreting the data, see Scribe WIKI on Data Exports. We are actively gathering contributions to the data set and it is refreshed every two weeks. The last dump was February 16, 2016. Use the RRS button in the upper-right to stay updated as we refresh the data. - -### Tips & Tricks -* **Use quotation marks to search for full phrases.** -* **Capitalization doesn’t matter in search.** -* **Why are there field repetitions in this record? Why is a field missing from this record?** - You may notice that for some records, fields appear multiple times. 
Sometimes multiple different fields are associated with one record. For example, [Record 3758](http://emigrantcity.nypl.org/#/data/exports/56b109ac7061755afbfdbb00?keyword="Eunice R. Waterbury"&field=undefined) has two mortgagors listed: "Eunice R. Waterbury" and "Eloise B. Crothers." - -* **What fields appear?** -This project intended to create a index to enable further discovery. With this project, the goal is to create an index of the bond and mortgage records to enable further discovery and use. We’ve worked with experts on the material to scope the data collected to fields which appear with greater regularity across documents: Record Date, Record Number, Mortgager Name, Street Address, Amount Loaned, Valuation, and Dimension & Description. Collecting and verifying these fields from the records creates resource that opens up the search of these materials. However, the data being gathered is not an exhaustive. Rather than creating a complete transcription of each record, the transcribed fields create an index to give some toeholds for future reference and querying of these rich materials which previously were largely invisible and difficult to search. - -* **Consider abbreviations.** -We asked users to transcribe exactly what they saw written on the records. For this reason, abbreviations may have been used. For example, the word Brooklyn may appear as “BKLYN” in one record and “Brooklyn” in another. - -* **What is the confidence field?** -The metadata for these records was created through contributions from many users. A result of this process is that we can guage how confident we are for each field. - - - - -### Assets -Feel free to explore the collection assets themselves on the Library's [Digital Collections](http://digitalcollections.nypl.org/collections/emigrant-savings-bank-records) website. In this project, we're working with the subset of these digitized materials, the volumes containing Mortgage and Bond records. 
The volumes appearing in Emigrant City are: -* [Bond and Mortgage Record Book 1 (1 to 1,555)](http://digitalcollections.nypl.org/items/df712aa0-00b1-0133-fbd7-58d385a7bbd0) -* [Bond and Mortgage Record Book 2 (1,556 to 2, 721)](http://digitalcollections.nypl.org/items/c0c38370-015a-0133-065e-58d385a7bbd0) -* [Bond and Mortgage Record Book 3 (2,722 to 3,699)](http://digitalcollections.nypl.org/items/5bb969d0-0241-0133-f196-58d385a7b928) -* [Bond and Mortgage Record Book 4 (3,700 to 4,499)](http://digitalcollections.nypl.org/items/109c0900-02e7-0133-03cf-58d385a7bbd0) -* [Bond and Mortgage Record Book 5 (4,500 to 5,499)](http://digitalcollections.nypl.org/items/e53b4fe0-02fc-0133-0e0d-58d385a7bbd0) -* [Bond and Mortgage Record Book 6 (5,500 to 6,403)](http://digitalcollections.nypl.org/items/20aa00a0-0311-0133-9d30-58d385a7bbd0) -* [Real Estate Loans No. 9](http://digitalcollections.nypl.org/items/6cf0ed60-23ef-0133-6b54-58d385a7b928) -* [Real Estate Loans No. 10](http://digitalcollections.nypl.org/items/59b0a100-23fd-0133-b24f-58d385a7bbd0) -* [Real Estate Loans No. 11](http://digitalcollections.nypl.org/items/cf0c3ee0-24bd-0133-5e2d-58d385a7b928) -* [Real Estate Loans No. 13](http://digitalcollections.nypl.org/items/3edf3050-24cd-0133-e6df-58d385a7b928) - - -You may also be interested to explore how we’ve been experimenting with opening up data from other crowdsourcing projects: -* [What's on the Menu?](http://menus.nypl.org/data) -* [Building Inspector](http://buildinginspector.nypl.org/data) diff --git a/project/emigrant/content/pages/data.md b/project/emigrant/content/pages/data.md new file mode 100644 index 00000000..285a3c50 --- /dev/null +++ b/project/emigrant/content/pages/data.md @@ -0,0 +1,22 @@ +## Data exports + +Participants have made {{project.classification_count}} contributions to Emigrant City to date. This project periodically builds a merged, anonymized dump of that data, which is made public here. 
This is an active project and we’re continuing to gather data from the records. The data made available here is refreshed weekly. + + +## Source Assets +Feel free to explore the collection assets themselves on the Library's [Digital Collections](http://digitalcollections.nypl.org/collections/emigrant-savings-bank-records) website. In this project, we're working with the subset of these digitized materials, the volumes containing Mortgage and Bond records. The volumes appearing in Emigrant City are: +* [Bond and Mortgage Record Book 1 (1 to 1,555)](http://digitalcollections.nypl.org/items/df712aa0-00b1-0133-fbd7-58d385a7bbd0) +* [Bond and Mortgage Record Book 2 (1,556 to 2, 721)](http://digitalcollections.nypl.org/items/c0c38370-015a-0133-065e-58d385a7bbd0) +* [Bond and Mortgage Record Book 3 (2,722 to 3,699)](http://digitalcollections.nypl.org/items/5bb969d0-0241-0133-f196-58d385a7b928) +* [Bond and Mortgage Record Book 4 (3,700 to 4,499)](http://digitalcollections.nypl.org/items/109c0900-02e7-0133-03cf-58d385a7bbd0) +* [Bond and Mortgage Record Book 5 (4,500 to 5,499)](http://digitalcollections.nypl.org/items/e53b4fe0-02fc-0133-0e0d-58d385a7bbd0) +* [Bond and Mortgage Record Book 6 (5,500 to 6,403)](http://digitalcollections.nypl.org/items/20aa00a0-0311-0133-9d30-58d385a7bbd0) +* [Real Estate Loans No. 9](http://digitalcollections.nypl.org/items/6cf0ed60-23ef-0133-6b54-58d385a7b928) +* [Real Estate Loans No. 10](http://digitalcollections.nypl.org/items/59b0a100-23fd-0133-b24f-58d385a7bbd0) +* [Real Estate Loans No. 11](http://digitalcollections.nypl.org/items/cf0c3ee0-24bd-0133-5e2d-58d385a7b928) +* [Real Estate Loans No. 
13](http://digitalcollections.nypl.org/items/3edf3050-24cd-0133-e6df-58d385a7b928) + + +You may also be interested to explore how we’ve been experimenting with opening up data from other crowdsourcing projects: +* [What's on the Menu?](http://menus.nypl.org/data) +* [Building Inspector](http://buildinginspector.nypl.org/data) diff --git a/project/emigrant/content/pages/data/_nav.md b/project/emigrant/content/pages/data/_nav.md new file mode 100644 index 00000000..e8b747d3 --- /dev/null +++ b/project/emigrant/content/pages/data/_nav.md @@ -0,0 +1,4 @@ + * [About](/#/data) + * [Browse](/#/data/browse) + * [Download](/#/data/download) + * [Tips & Tricks](/#/data/tips) diff --git a/project/emigrant/content/pages/data/download.md b/project/emigrant/content/pages/data/download.md new file mode 100644 index 00000000..8ee57ac6 --- /dev/null +++ b/project/emigrant/content/pages/data/download.md @@ -0,0 +1,11 @@ +## Download + +Participants have made {{project.classification_count}} contributions to {{project.title}} to date. This project periodically builds a merged, anonymized dump of that data, which is made public here. + +This is a large dataset in json format containing all the assertions, confidence ratings, and fields for the {{project.root_subjects_count}} records in {{project.title}}. We are actively gathering contributions to the data set and it is refreshed weekly. The last dump was {{project.latest_export.created_at}}. + +Download Latest Raw Data + +For help interpreting the data, see Scribe WIKI on Data Exports.
+ +To browse past releases and/or to be notified when new releases are made, you may wish to subscribe to the ATOM Feed of Data Releases diff --git a/project/emigrant/content/pages/data/tips.md b/project/emigrant/content/pages/data/tips.md new file mode 100644 index 00000000..80386187 --- /dev/null +++ b/project/emigrant/content/pages/data/tips.md @@ -0,0 +1,17 @@ +## Tips & Tricks +* **Use quotation marks to search for full phrases.** +* **Capitalization doesn’t matter in search.** +* **Why are there field repetitions in this record? Why is a field missing from this record?** + You may notice that for some records, fields appear multiple times. Sometimes multiple different fields are associated with one record. For example, [Record 3758](http://emigrantcity.nypl.org/#/data/exports/56b109ac7061755afbfdbb00?keyword="Eunice R. Waterbury"&field=undefined) has two mortgagors listed: "Eunice R. Waterbury" and "Eloise B. Crothers." + +* **What fields appear?** +This project intended to create a index to enable further discovery. With this project, the goal is to create an index of the bond and mortgage records to enable further discovery and use. We’ve worked with experts on the material to scope the data collected to fields which appear with greater regularity across documents: Record Date, Record Number, Mortgager Name, Street Address, Amount Loaned, Valuation, and Dimension & Description. Collecting and verifying these fields from the records creates resource that opens up the search of these materials. However, the data being gathered is not an exhaustive. Rather than creating a complete transcription of each record, the transcribed fields create an index to give some toeholds for future reference and querying of these rich materials which previously were largely invisible and difficult to search. + +* **Consider abbreviations.** +We asked users to transcribe exactly what they saw written on the records. For this reason, abbreviations may have been used. 
For example, the word Brooklyn may appear as “BKLYN” in one record and “Brooklyn” in another. + +* **What is the confidence field?** +The metadata for these records was created through contributions from many users. A result of this process is that we can guage how confident we are for each field. + + + diff --git a/project/emigrant/scripts/query_subjects.rb b/project/emigrant/scripts/query_subjects.rb index a4a8340d..589ca7ce 100644 --- a/project/emigrant/scripts/query_subjects.rb +++ b/project/emigrant/scripts/query_subjects.rb @@ -11,7 +11,8 @@ client = NyplRepo::Client.new ENV['DC_API_KEY'] item_uuids = [ - "be6d6300-ecf4-0132-456e-58d385a7b928", # Book 1 (1 to 1,555) http://digitalcollections.nypl.org/items/df712aa0-00b1-0133-fbd7-58d385a7bbd0 +=begin +"be6d6300-ecf4-0132-456e-58d385a7b928", # Book 1 (1 to 1,555) http://digitalcollections.nypl.org/items/df712aa0-00b1-0133-fbd7-58d385a7bbd0 "bf0c1890-ecf4-0132-faa2-58d385a7b928", # Book 2 (1,556 to 2, 721) http://digitalcollections.nypl.org/items/c0c38370-015a-0133-065e-58d385a7bbd0 "bfe9fbe0-ecf4-0132-7e52-58d385a7b928", # Book 3 (2,722 to 3,699) http://digitalcollections.nypl.org/items/5bb969d0-0241-0133-f196-58d385a7b928 "c0921750-ecf4-0132-1737-58d385a7b928", # Book 4 (3,700 to 4,499) http://digitalcollections.nypl.org/items/109c0900-02e7-0133-03cf-58d385a7bbd0 @@ -20,6 +21,8 @@ "c53c32d0-ecf4-0132-b51f-58d385a7b928", # Real Estate Loans No. 9 http://digitalcollections.nypl.org/items/6cf0ed60-23ef-0133-6b54-58d385a7b928 "c5d23760-ecf4-0132-8bed-58d385a7b928", # Real Estate Loans No. 10 http://digitalcollections.nypl.org/items/59b0a100-23fd-0133-b24f-58d385a7bbd0 "c6697fe0-ecf4-0132-b1fc-58d385a7b928", # Real Estate Loans No. 11 http://digitalcollections.nypl.org/items/cf0c3ee0-24bd-0133-5e2d-58d385a7b928 +=end + "c7d4b670-ecf4-0132-5854-58d385a7b928", # Real Estate Loans No. 
11 http://digitalcollections.nypl.org/items/01af2f60-8701-0133-b22c-00505686a51c ] @@ -45,7 +48,7 @@ end end -out_path = "#{File.dirname(File.dirname(__FILE__))}/subjects/subjects_from_api.building.csv" +out_path = "#{File.dirname(File.dirname(__FILE__))}/subjects/subjects_from_api.book14.csv" CSV.open(out_path, "wb") do |csv| csv << subjects.first.keys From ebd8c2fbd2098f844affad52dc0816304204f689 Mon Sep 17 00:00:00 2001 From: Paul Beaudoin Date: Mon, 22 Feb 2016 16:20:30 -0500 Subject: [PATCH 15/23] styling fixes in finalsubjectbrowser --- app/assets/javascripts/components/pagination.cjsx | 4 ++++ app/assets/stylesheets/final-subject-set-browser.styl | 11 +++++++---- project/emigrant/assets/css/styles.css | 5 +++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/app/assets/javascripts/components/pagination.cjsx b/app/assets/javascripts/components/pagination.cjsx index 3b896132..998508a1 100644 --- a/app/assets/javascripts/components/pagination.cjsx +++ b/app/assets/javascripts/components/pagination.cjsx @@ -31,6 +31,9 @@ module.exports = React.createClass # Bookend with last few pages: pages.push p if p > @props.total_pages - chunk_size && pages.indexOf(p)<0 + # Don't show anything if no usable links: + return null if pages.length < 2 + page_links = [] # Add leading < link @@ -45,6 +48,7 @@ module.exports = React.createClass # Add final > link page_links.push({label: ">", page: @props.next_page, title: "Next", disabled: false}) if @props.next_page? +
+ +* **Best Data versus All Data** +A record's best data consolidates and lightly cleans all the fields for that record. During the transcription process, the same field may have been marked multiple times and made it all the way through the Scribe work-flow resulting in duplications of that field. Best data consolidates these duplications. There is also minimal formatting cleanup so, for instance, Amount Loaned appears as a dollar amount rather than a number. + +* **What is the Source Metadata?** +The content on this page is the technical metadata associated with a record. + + +* **What is the confidence field?** +The metadata for these records was created through contributions from many users. A result of this process is that we can gauge how confident we are for each field. Fields with a 100% confidence rating are fields for which every transcription was the same. Lower confidence ratings mean that there was disagreement on how best to transcribe a field. + +* **What is a field's status?** +The status of each field is displayed. This corresponds with where the field's transcription is in the work-flow of the project. + +* **What are distinct transcriptions?** +As you may notice, fields display the number of distinct transcriptions they have received during the project's run. A distinct transcription of 1 means everyone transcribed the same thing for that field. + * **Why are there field repetitions in this record? Why is a field missing from this record?** You may notice that for some records, fields appear multiple times. Sometimes multiple different fields are associated with one record. For example, [Record 3758](http://emigrantcity.nypl.org/#/data/exports/56b109ac7061755afbfdbb00?keyword="Eunice R. Waterbury"&field=undefined) has two mortgagors listed: "Eunice R. Waterbury" and "Eloise B. Crothers." -* **What fields appear?** +* **Which fields appear?** This project intended to create a index to enable further discovery.
With this project, the goal is to create an index of the bond and mortgage records to enable further discovery and use. We’ve worked with experts on the material to scope the data collected to fields which appear with greater regularity across documents: Record Date, Record Number, Mortgager Name, Street Address, Amount Loaned, Valuation, and Dimension & Description. Collecting and verifying these fields from the records creates resource that opens up the search of these materials. However, the data being gathered is not an exhaustive. Rather than creating a complete transcription of each record, the transcribed fields create an index to give some toeholds for future reference and querying of these rich materials which previously were largely invisible and difficult to search. -* **Consider abbreviations.** -We asked users to transcribe exactly what they saw written on the records. For this reason, abbreviations may have been used. For example, the word Brooklyn may appear as “BKLYN” in one record and “Brooklyn” in another. - -* **What is the confidence field?** -The metadata for these records was created through contributions from many users. A result of this process is that we can guage how confident we are for each field. 
- From 923e929792bf588e893b26a4e65296d2bc3a202d Mon Sep 17 00:00:00 2001 From: Paul Beaudoin Date: Mon, 22 Feb 2016 18:34:08 -0500 Subject: [PATCH 17/23] moving emigrant data pages into data_new for live testing --- .../javascripts/components/app-router.cjsx | 6 +-- .../components/final-subject-set-browser.cjsx | 6 +-- .../components/final-subject-set-page.cjsx | 14 +++---- app/assets/javascripts/models/project.coffee | 10 +++++ project/emigrant/content/pages/data.md | 6 +-- project/emigrant/content/pages/data/_nav.md | 4 -- .../emigrant/content/pages/data_new/_nav.md | 4 ++ .../pages/{data => data_new}/download.md | 0 .../content/pages/{data => data_new}/tips.md | 0 project/emigrant/project.json | 42 ++++++++++++++++++- 10 files changed, 70 insertions(+), 22 deletions(-) delete mode 100644 project/emigrant/content/pages/data/_nav.md create mode 100644 project/emigrant/content/pages/data_new/_nav.md rename project/emigrant/content/pages/{data => data_new}/download.md (100%) rename project/emigrant/content/pages/{data => data_new}/tips.md (100%) diff --git a/app/assets/javascripts/components/app-router.cjsx b/app/assets/javascripts/components/app-router.cjsx index e534e562..ef4e10bd 100644 --- a/app/assets/javascripts/components/app-router.cjsx +++ b/app/assets/javascripts/components/app-router.cjsx @@ -91,19 +91,19 @@ class AppRouter } { if project.downloadable_data } diff --git a/app/assets/javascripts/components/final-subject-set-browser.cjsx b/app/assets/javascripts/components/final-subject-set-browser.cjsx index b952ba90..78fa1992 100644 --- a/app/assets/javascripts/components/final-subject-set-browser.cjsx +++ b/app/assets/javascripts/components/final-subject-set-browser.cjsx @@ -120,7 +120,7 @@ module.exports = React.createClass