From 660606321e27961dc45129c27f69044af1414835 Mon Sep 17 00:00:00 2001 From: "Michael J. Giarlo" Date: Wed, 30 Aug 2023 15:47:43 -0700 Subject: [PATCH] [SPIKE] Decouple DSA from Solr Fixes #4459 This is a spike commit towards an SDR Evolution the team has been batting around for a while now, namely severing DSA's dependency on Solr. The spike largely replaces Solr queries with direct DB queries, and for most use cases this works just fine. The key word here is "most..." * The Solr queries have been replaced with DB queries that reach into JSONB columns which results in table scans. I tested all of these queries in stage with large-ish, but not prod-huge, data sets (~25K records) and most of them perform fine. That said, we might want to test this with prod-like data and do some benchmarking to determine if we want to index more of the JSONB data. * A notable performance outlier is `MemberService.for` which needs to make a single Workflow API call for *each* member of a virtual object. These are impressively slow for a virtual object with a few thousand members, taking over a minute to complete. Another question we'd need to answer to take this work forward is what to do about `bin/generate-druid-list`, which allows a user to issue Solr queries directly, and `lib/tasks/missing_druids.rake`, which compares what's in the DSA DB and what's in Solr to determine if any objects need (re-)indexing. Are these still useful? If so, could they live elsewhere or could we solve these problems in a different way? If the answer is no, we may not want to proceed with this decoupling. **NOTE:** Since this is a spike meant to generate discussion, I have not yet bothered with deal with changing the tests (or caring about linting). That will naturally come later if we decide the idea and implementation has merit. --- app/models/dro.rb | 5 +++ app/reports/apo_catkey.rb | 14 +------ app/services/delete_service.rb | 1 - app/services/embargo_release_service.rb | 16 +++----- app/services/member_service.rb | 36 +++++++++++++----- .../publish/public_desc_metadata_service.rb | 6 +-- .../published_relationships_filter.rb | 4 +- app/services/solr_service.rb | 38 ------------------- app/services/virtual_object.rb | 9 +++-- bin/clean-druid-list | 4 +- bin/generate-druid-list | 2 + lib/tasks/missing_druids.rake | 2 + 12 files changed, 57 insertions(+), 80 deletions(-) delete mode 100644 app/services/solr_service.rb diff --git a/app/models/dro.rb b/app/models/dro.rb index d5deb89f5..53c563d2b 100644 --- a/app/models/dro.rb +++ b/app/models/dro.rb @@ -5,6 +5,11 @@ class Dro < RepositoryRecord validates :content_type, :access, :administrative, :description, :identification, :structural, presence: true + scope :has_admin_policy, ->(admin_policy_druid) { where("administrative ->> 'hasAdminPolicy' = '#{admin_policy_druid}'").select(:external_identifier).order(:external_identifier) } + scope :in_virtual_objects, ->(member_druid) { where("structural #> '{hasMemberOrders,0}' -> 'members' ? :druid", druid: member_druid) } + scope :members_of_collection, ->(collection_druid) { where("structural -> 'isMemberOf' ? :druid", druid: collection_druid).select(:external_identifier, :version, :content_type) } + scope :embargoed_and_releaseable, -> { where("(access -> 'embargo' ->> 'releaseDate')::timestamp <= ?", Time.zone.now).select(:external_identifier) } + def self.find_by_source_id(source_id) find_by("identification->>'sourceId' = ?", source_id) end diff --git a/app/reports/apo_catkey.rb b/app/reports/apo_catkey.rb index 4255656a8..d0affa8e6 100644 --- a/app/reports/apo_catkey.rb +++ b/app/reports/apo_catkey.rb @@ -2,7 +2,7 @@ require 'csv' -# Find items that are goverened by the provided APO and then return all catkeys and refresh status. +# Find items that are governed by the provided APO and then return all catkeys and refresh status. # https://github.com/sul-dlss/dor-services-app/issues/4373 # Invoke via: # bin/rails r -e production "ApoCatkey.report('druid:bx911tp9024')" @@ -12,17 +12,7 @@ def self.report(apo_druid) CSV.open(output_file, 'w') do |csv| csv << %w[druid catkey refresh] - query = "is_governed_by_ssim:\"info:fedora/#{apo_druid}\"&objectType_ssim:\"item\"" - druids = [] - # borrowed from bin/generate-druid-list - loop do - results = SolrService.query('*:*', fl: 'id', rows: 10000, fq: query, start: druids.length, sort: 'id asc') - break if results.empty? - - results.each { |r| druids << r['id'] } - sleep(0.5) - end - + druids = Dro.has_admin_policy(apo_druid).map(&:external_identifier) num_dros = druids.size puts "Found #{num_dros} objects that are governed by APO #{apo_druid}" druids.each_with_index do |druid, i| diff --git a/app/services/delete_service.rb b/app/services/delete_service.rb index 8cc955603..031798b52 100644 --- a/app/services/delete_service.rb +++ b/app/services/delete_service.rb @@ -4,7 +4,6 @@ class DeleteService # Tries to remove any existence of the object in our systems # Does the following: - # - Removes item from Fedora/Solr # - Removes content from dor workspace # - Removes content from assembly workspace # - Removes content from sdr export area diff --git a/app/services/embargo_release_service.rb b/app/services/embargo_release_service.rb index d819b43d3..b2baa2ca8 100644 --- a/app/services/embargo_release_service.rb +++ b/app/services/embargo_release_service.rb @@ -1,27 +1,23 @@ # frozen_string_literal: true # Finds objects where the embargo release date has passed for embargoed items -# Builds list of candidate objects by doing a Solr query +# Builds list of candidate objects by querying the database # # Should run once a day from cron class EmbargoReleaseService - RELEASEABLE_NOW_QUERY = 'embargo_status_ssim:"embargoed" AND embargo_release_dtsim:[* TO NOW]' - def self.release_all # Find objects to process - Rails.logger.info("***** Querying solr: #{RELEASEABLE_NOW_QUERY}") - solr = SolrService.get(RELEASEABLE_NOW_QUERY, 'rows' => '5000', 'fl' => 'id') + embargoed_items_to_release = Dro.embargoed_and_releaseable - num_found = solr['response']['numFound'].to_i - if num_found.zero? + if embargoed_items_to_release.none? Rails.logger.info('No objects to process') return end - Rails.logger.info("Found #{num_found} objects") + Rails.logger.info("Found #{embargoed_items_to_release.count} objects") count = 0 - solr['response']['docs'].each do |doc| - release(doc['id']) + embargoed_items_to_release.each do |item| + release(item.external_identifier) count += 1 end diff --git a/app/services/member_service.rb b/app/services/member_service.rb index 95c13348c..591f2cdb9 100644 --- a/app/services/member_service.rb +++ b/app/services/member_service.rb @@ -1,19 +1,37 @@ # frozen_string_literal: true -# Finds the members of a collection by using Solr +# Finds the members of a collection class MemberService # @param [String] druid the identifier of the collection # @param [Boolean] only_published when true, restrict to only published items # @param [Boolean] exclude_opened when true, exclude opened items # @return [Array>] the members of this collection def self.for(druid, only_published: false, exclude_opened: false) - query = "is_member_of_collection_ssim:\"info:fedora/#{druid}\"" - query += ' published_dttsim:[* TO *]' if only_published - query += ' -processing_status_text_ssi:Opened' if exclude_opened - args = { - fl: 'id,objectType_ssim', - rows: 100_000_000 - } - SolrService.query query, args + Dro + .members_of_collection(druid) + .then { |members| reject_opened_members(members, exclude_opened) } + .then { |members| select_published_members(members, only_published) } + .map do |member| + { + 'id' => member.external_identifier, + 'objectType' => member.content_type == Cocina::Models::ObjectType.agreement ? 'agreement' : 'item' + } + end + end + + def self.reject_opened_members(members, exclude_opened) + return members unless exclude_opened + + members.reject do |member| + WorkflowClientFactory.build.status(druid: member.external_identifier, version: member.version).display_simplified == 'Opened' + end + end + + def self.select_published_members(members, only_published) + return members unless only_published + + members.select do |member| + WorkflowClientFactory.build.lifecycle(druid: member.external_identifier, milestone_name: 'published', version: member.version).present? + end end end diff --git a/app/services/publish/public_desc_metadata_service.rb b/app/services/publish/public_desc_metadata_service.rb index 92774671c..5b8fd80d4 100644 --- a/app/services/publish/public_desc_metadata_service.rb +++ b/app/services/publish/public_desc_metadata_service.rb @@ -57,7 +57,7 @@ def add_doi # expand constituent relations into relatedItem references -- see JUMBO-18 # @return [Void] def add_constituent_relations! - VirtualObject.for(druid: cocina_object.externalIdentifier).each do |solr_doc| + VirtualObject.for(druid: cocina_object.externalIdentifier).each do |virtual_object_hash| # create the MODS relation relatedItem = doc.create_element('relatedItem', xmlns: MODS_NS) relatedItem['type'] = 'host' @@ -66,14 +66,14 @@ def add_constituent_relations! # load the title from the virtual object's DC.title titleInfo = doc.create_element('titleInfo', xmlns: MODS_NS) title = doc.create_element('title', xmlns: MODS_NS) - title.content = solr_doc.fetch(:title) + title.content = virtual_object_hash.fetch(:title) titleInfo << title relatedItem << titleInfo # point to the PURL for the virtual object location = doc.create_element('location', xmlns: MODS_NS) url = doc.create_element('url', xmlns: MODS_NS) - url.content = purl_url(solr_doc.fetch(:id)) + url.content = purl_url(virtual_object_hash.fetch(:id)) location << url relatedItem << location diff --git a/app/services/published_relationships_filter.rb b/app/services/published_relationships_filter.rb index c1aa88892..9a43724e8 100644 --- a/app/services/published_relationships_filter.rb +++ b/app/services/published_relationships_filter.rb @@ -36,8 +36,8 @@ def collections def virtual_objects return unless cocina_object.dro? - VirtualObject.for(druid: cocina_object.externalIdentifier).map do |solr_doc| - "" + VirtualObject.for(druid: cocina_object.externalIdentifier).map do |virtual_object_hash| + "" end.join(INDENT) end end diff --git a/app/services/solr_service.rb b/app/services/solr_service.rb deleted file mode 100644 index 7ac194855..000000000 --- a/app/services/solr_service.rb +++ /dev/null @@ -1,38 +0,0 @@ -# frozen_string_literal: true - -# Functions for querying solr -class SolrService - include Singleton - - def options - { timeout: Settings.solr.timeout, url: Settings.solr.url } - end - - def conn - @conn ||= RSolr.connect options - end - - class << self - delegate :conn, to: :instance - - def select_path - Settings.solr.select_path - end - - # @param [Hash] options - def get(query, args = {}) - args = args.merge(q: query, wt: :json) - conn.get(select_path, params: args) - end - - def query(query, args = {}) - unless args.key?(:rows) - Rails.logger.warn "Calling SolrService.get without passing an explicit value for ':rows' is not recommended. " \ - "You will end up with Solr's default (usually set to 10)\nCalled by #{caller(1..1).first}" - end - get(query, args).fetch('response').fetch('docs') - end - - delegate :add, :commit, to: :conn - end -end diff --git a/app/services/virtual_object.rb b/app/services/virtual_object.rb index 79ccf3269..d8ccc741b 100644 --- a/app/services/virtual_object.rb +++ b/app/services/virtual_object.rb @@ -6,10 +6,11 @@ class VirtualObject # @param [String] druid # @return [Array] a list of results with ids and titles def self.for(druid:) - query = "has_constituents_ssim:#{druid.sub(':', '\:')}" - response = SolrService.get(query, { fl: 'id sw_display_title_tesim' }) - response.fetch('response').fetch('docs').map do |row| - { id: row.fetch('id'), title: row.fetch('sw_display_title_tesim').first } + Dro.in_virtual_objects(druid).map do |dro| + { + id: dro.external_identifier, + title: Cocina::Models::Builders::TitleBuilder.build(dro.to_cocina.description.title) + } end end end diff --git a/bin/clean-druid-list b/bin/clean-druid-list index 5e2cf91cd..ae811ff20 100755 --- a/bin/clean-druid-list +++ b/bin/clean-druid-list @@ -23,7 +23,9 @@ count = 0 File.open(options[:output], 'w') do |file| druids.each_with_index do |druid, index| puts "Finding #{druid} (#{index + 1})" - next if SolrService.query('*:*', fl: 'id', rows: 1, fq: "id:\"#{druid}\"").empty? + next unless AdminPolicy.exists?(external_identifier: druid) || + Collection.exists?(external_identifier: druid) || + Dro.exists?(external_identifier: druid) file.write("#{druid}\n") count += 1 diff --git a/bin/generate-druid-list b/bin/generate-druid-list index bfa89ae80..4b1bf5b48 100755 --- a/bin/generate-druid-list +++ b/bin/generate-druid-list @@ -4,6 +4,8 @@ require_relative '../config/environment' require 'optparse' +# TODO: Figure out if we still want this or not, given how tightly coupled this functionality is to Solr + options = { output: 'druids.txt', quiet: false } parser = OptionParser.new do |option_parser| option_parser.banner = 'Usage: bin/generate-druid-list \'\' [options]' diff --git a/lib/tasks/missing_druids.rake b/lib/tasks/missing_druids.rake index 26cb87a5d..c1a18e18c 100644 --- a/lib/tasks/missing_druids.rake +++ b/lib/tasks/missing_druids.rake @@ -1,5 +1,7 @@ # frozen_string_literal: true +# TODO: Figure out if we still want this or not, given how tightly coupled this functionality is to Solr + namespace :missing_druids do desc 'Find unindexed druids' task unindexed_objects: :environment do