Add LoC(-inspired) MARC -> Dublin Core crosswalk

Add a new class to convert MARC to Dublin Core, and use it to build up the Dublin Core record in addition to values pulled directly from the Solr document. * Add DC Crosswalk * Retool dublin_core.rb to use it. This eliminates any uses of MARC::Record.to_dublin_core that were present. * Add tests that reflect the new fields being pulled out. This mostly follows the definition laid out by the Library of Congress at https://www.loc.gov/marc/marc2dc.html, with deviations based on what we used to do and requests from our most active user. Use of the new crosswalk object is as follows. ``` require "oai_solr/dublin_core_crosswalk" dcc = OAISolr::DublinCoreCrosswalk.new hash_of_element_value_pairs = dcc.full_map(record) title = dcc.title(record) identifier = dcc.identifier(record) ```
hathitrust · Mar 21, 2023 · 05168ab · 05168ab
1 parent fd511bb
commit 05168ab
Show file tree

Hide file tree

Showing 7 changed files with 496 additions and 33 deletions.
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -31,6 +31,19 @@ services:
       - solr-sdr-catalog
       - mariadb
 
+  test-persist:
+    build: .
+    volumes:
+      - .:/usr/src/app
+      - gem_cache:/gems
+    command: bash -c "bin/wait-for solr-sdr-catalog:9033 mariadb:3306"  # NOTE(review): only bin/wait-for is invoked here, with the two dependency host:port pairs — confirm how tests are meant to be started in this container
+    environment:
+      SOLR_URL: http://solr-sdr-catalog:9033/solr/catalog
+      RIGHTS_DATABASE_CONNECTION_STRING: "mysql2://ht_rights:ht_rights@mariadb/ht"
+    depends_on:
+      - solr-sdr-catalog
+      - mariadb
+
   solr-sdr-catalog:
     image: ghcr.io/hathitrust/catalog-solr-sample
     ports:

diff --git a/lib/oai_solr/basic_marc_extractor.rb b/lib/oai_solr/basic_marc_extractor.rb
@@ -0,0 +1,87 @@
+# frozen_string_literal: true
+
+require "set"
+require_relative "basic_marc_single_extractor"
+
+module OAISolr
+  # A collection of BasicMARCSingleExtractors that will collect their combined values from
+  # a MARC::Record.
+  class BasicMARCExtractor
+    # Create a new object, optionally passing tags/codes to add a first BasicMARCSingleExtractor
+    # @param [String,Array<String>,Range<String>] tags Single, array or, or range over 3-digit marc tags
+    # @param [String, Range<String>] subfield_codes Either a single string with all the desired subfield codes
+    #   e.g., "abcek", or a range, e.g., "'a'..'m'". Optional.
+    # @example
+    #   bme = BasicMARCExtractor.new; bme << BasicMARCSingleExtractor.new("245", "ab")
+    #   bme = BasicMARCExtractor.new("245", "ab")
+    #   bme = BasicMARCExtractor.new("600".."699", "a".."z")
+    def initialize(tags = nil, subfield_codes = nil)
+      @single_extractors = []
+      if tags
+        self << BasicMARCSingleExtractor.new(tags, subfield_codes)
+      end
+    end
+
+    # Given an array of duples (as from config), build up an extractor using `#<<`
+    # @param [Array<Array<String>>] tag_code_pairs Array of arrays of the form [ [tags, subfield_codes], ...]
+    # @example
+    #   bme = BasicMARCExtractor.from_pairs([["245", "ab"], ["100".."111", "abd"]])
+    # @see OAI::BasicMARCSingleExtractor#initialize for supported syntax
+    def self.from_pairs(tag_code_pairs)
+      unless tag_code_pairs.first&.is_a?(Array)
+        raise "#{self.class}.from_pairs takes an array of arrays"
+      end
+      basic_marc_extractor = new
+      tag_code_pairs.each { |tag, codes| basic_marc_extractor << BasicMARCSingleExtractor.new(tag, codes) }
+      basic_marc_extractor
+    end
+
+    # Add a previously constructed single extractor, and re-compute the set of interesting tags
+    # @param [OAI::BasicMARCSingleExtractor] basic_marc_single_extractor
+    # @return [OAI::BasicMARCExtractor]
+    def <<(basic_marc_single_extractor)
+      @single_extractors << basic_marc_single_extractor
+      set_interesting_tags!
+      self
+    end
+
+    # For efficiently, keep track of which field tags are "interesting" to this specific extractor,
+    # so we don't have to check the whole list of field tags for every BasicMARCSingleExtractor
+    # @see set_interesting_tags!
+    # @param [String] tag The field tag
+    # @return [Boolean]
+    def interesting_tag?(tag)
+      @interesting_ranges.any? { |rng| rng.cover?(tag) } or @interesting_single_tags.include?(tag)
+    end
+
+    # Get a list of the "interesting" fields (by tag), and run each single extractor in turn
+    # on them. Flatten, compact, and uniq the resulting strings and return
+    # @param [MARC::Record] rec The record from which to extract data
+    # @return [Array<String>] array of extracts
+    def values(rec)
+      rec.select { |field| interesting_tag?(field.tag) }
+        .flat_map { |f| @single_extractors.flat_map { |extractor| extractor.value(f) } }
+        .compact.uniq
+    end
+
+    private
+
+    # We want to efficiently determine if the tag is one that we're interested in.
+    # We support single tags, arrays of (single) tags, and tag ranges. The first two
+    # merge into one set; the ranges we handle separately for efficiency (no sense in
+    # turning '600'..'699' into an array)
+    def set_interesting_tags!
+      @interesting_single_tags = Set.new
+      @interesting_ranges = Set.new
+      @single_extractors.map(&:computed_tags).each do |tags|
+        case tags
+        when Range
+          @interesting_ranges << tags
+        else
+          @interesting_single_tags += Array(tags)
+        end
+        @interesting_single_tags.flatten!
+      end
+    end
+  end
+end
diff --git a/lib/oai_solr/basic_marc_single_extractor.rb b/lib/oai_solr/basic_marc_single_extractor.rb
@@ -0,0 +1,161 @@
+module OAISolr
+  # Build up a simple object to quasi-efficiently extract values from MARC tag/subfield-codes
+  # based on a simplistic query specification.
+  #
+  # A single BasicMARCSingleExtractor will extract a specific set of subfields from the
+  # given tag specification.
+  #
+  # The set (or single) of tags_to_match you want can be passed in as:
+  #   * A single string. `"245"`
+  #   * A three digit integer, which will be coerced into a string. `245`
+  #     * Note that if you want a zero-led field (e.g., "050") you can't use the integer option
+  #   * An array of tags_to_match. ["245", "100", "111"]
+  #   * A range of Strings that encompass all the tags you want,. "600".."699"
+  #
+  # Subfield codes can be expressed as:
+  #   * A string containing all the subfields you want. "abdek"
+  #   * A range of one-character strings. "a".."n"
+  #
+  # Control field: for "codes", pass a range of characters to fetch
+  #   * When dealing with a control field, the "codes" passed should actually be a range of integers
+  #     corresponding to the indexes (zero-based) of the characters you want from that value.
+  class BasicMARCSingleExtractor
+    # Generally, MARC fields have the data in alphabetical subfields fields, and metadata (e.g., links to
+    # other fields) in numbered subfields.  We'll use all the "letter" subfields as the
+    # default for which subfields to use.
+    ALPHA = "a".."z"
+
+    attr_reader :tags, :codes, :computed_tags
+
+    # Create a new extractor for the given tag(s) and subfield code(s)
+    # Note that this code just creates a method to determine if a field matches the desired tags_to_match,
+    # and another to actually extract data from the subfields of those matched fields.
+    #
+    # Everything else in this class is just support to create the #matches_tag? and
+    # #extract methods.
+    #
+    # @param [String, Array<String>, Range<String>] tags
+    # @param [String] codes A list of the
+    # @example One field tag, two subfield codes
+    #   extractor = BasicMARCSingleExtractor.new("245", "ab")
+    # @example An array of tags_to_match
+    #   extractor = BasicMARCSingleExtractor.new(["100", "110", "111"], "abd")
+    # @example A range of tags_to_match, and the default (all alphabetic) subfield codes
+    #   extractor = BasicMARCSingleExtractor.new("600".."699") # subfield codes defaults to ALPHA
+    # @example A single tag, with a range of subfields
+    #   extractor = BasicMARCSingleExtractor.new("245", "a".."e")
+    # @example Get the "date1" characters from the 008 field
+    #   extractor = BasicMARCStringExtractor.new("008", 7..10)
+    def initialize(tags, codes)
+      @tags = tags
+      @codes = codes || ALPHA
+      define_singleton_method(:matches_tag?, tag_matcher(@tags))
+      define_singleton_method(:extract, value_extractor(@codes))
+    end
+
+    # @!method matches_tag?(tag)
+    #   Determines if the passed field tag (e.g., "245") is one that this extractor
+    #   cares about.
+    #   @param [String] tag
+    #   @return [Boolean]
+
+    # @!method extract(field)
+    #   Takes a MARC::DataField or MARC::ControlField and:
+    #     * get the values of the subfields with the wanted codes and
+    #       return them as a single, space-delimited string
+    #     * Get a range of characters from a control field, when the "codes" specified was
+    #       actually an integer range.
+    #   @param [MARC::DataField, MARC::ControlField] field
+    #   @return [String] the desired value(s), with subfield values joined with a space
+
+    # If the "codes" that were passed was actually an integer range, we assume that we're dealing
+    # with a control field.
+    def control_field?
+      codes.is_a?(Range) and codes.begin.is_a?(Integer)
+    end
+
+    # Try to extract strings from the desired subfield values. If none match, or we end
+    # up with an empty string, return nil
+    # @param [MARC::DataField] field
+    # @return [String, nil] Space-delimited values of the wanted subfields
+    def value(field)
+      val = if matches_tag?(field.tag)
+        extract(field) # defined dynamically in the constructor
+      else
+        return nil
+      end
+
+      val.empty? ? nil : val
+    end
+
+    # To decide what values to extract, we first need to decide if a given field's tag
+    # is one of the ones we care about for this extractor.
+    #
+    # Use the tag specification passed in the constructor and figure out
+    # the best way to test if a field tag string (e.g., "245") matches the tags
+    # covered by this extractor. Then build a lambda that will do that test.
+    #
+    # The returned lambda is used in the constructor to create the #matches_tag? method
+    # @param  [String, Array<String>, Range<String>] tags_to_match
+    # @return [Proc] a lambda that takes a single tag and sees if it matches this extractor
+    def tag_matcher(tags_to_match)
+      case tags_to_match
+      when Integer, String
+        @computed_tags = tags_to_match.to_s
+        ->(t) { t.to_s == @computed_tags }
+      when Array
+        @computed_tags = tags_to_match.map(&:to_s).uniq
+        ->(t) { @computed_tags.include? t }
+      when Range
+        @computed_tags = tags_to_match
+        ->(t) { @computed_tags.cover?(t) }
+      else
+        raise "Illegal argumrnt '#{tags_to_match.inspect}'"
+      end
+    end
+
+    # Given a subfield codes specification from the constructor, build an efficient
+    # lambda to pull out the data from the given code(s) as a string. Used in the
+    # constructor to make the #extract method.
+    # @param [String, Range<String>, Range<Integer>] codes_or_control_field_range
+    # @return [Proc] lambda that take a MARC::ControlField or MARC::DataField and pulls
+    #   out the requested data.
+    def value_extractor(codes_or_control_field_range)
+      if control_field?
+        control_field_extractor(codes_or_control_field_range)
+      else
+        datafield_extractor(codes_or_control_field_range)
+      end
+    end
+
+    private
+
+    # A control field extractor just gets the characters in the given range
+    # @param [Range] integer_range  Integer range (zero-based) of the chars you want
+    # @return [Proc] lambda that will take a control field and extract the right characters
+    def control_field_extractor(integer_range)
+      ->(control_field) { control_field.value.slice(integer_range) }
+    end
+
+    # Subfield extraction for when the codes are specified as a single char, a bunch of chars,
+    # or a char range.  Each is treated separately to get the best performance for
+    # each situation, because these things can add up when doing lots and lots of records.
+    # @param [String] codes A string of which subfield codes to extract
+    # @return [Proc] lambda that will correctly do the extraction and joining of values on the passed field.
+    def datafield_extractor(codes)
+      case codes
+      when String
+        if codes.size == 1
+          ->(data_field) { data_field.select { |sf| sf.code == codes }.map(&:value).join(" ").strip }
+        else
+          codesarray = codes.chars
+          ->(data_field) { data_field.select { |sf| codesarray.include? sf.code }.map(&:value).join(" ").strip }
+        end
+      when Range
+        ->(data_field) { data_field.select { |sf| codes.cover? sf.code }.map(&:value).join(" ").strip }
+      else
+        raise "Subfield codes must be either a string of chars, a range of chars, or a range of ints for control field extraction"
+      end
+    end
+  end
+end
diff --git a/lib/oai_solr/dublin_core.rb b/lib/oai_solr/dublin_core.rb
@@ -1,8 +1,13 @@
 require "oai"
 require "rights_database"
+require "oai_solr/dublin_core_crosswalk"
 
 module OAISolr
   class DublinCore < OAI::Provider::Metadata::DublinCore
+    # A Dublin Core crosswalk object for translating MARC records into
+    # Dublin Core fields.
+    CROSSWALK = OAISolr::DublinCoreCrosswalk.new
+
     def encode _, record
       dc_hash = dublin_core_hash(record)
 
@@ -33,41 +38,53 @@ def self.rights_statement(record, statements = access_statements(record))
 
     private
 
+    # @param [OAISolr::Record] record
     def dublin_core_hash(record)
-      # TODO: to_dublin_core doesn't do much useful in the current release of
-      # ruby-marc - the only things we're keeping from it are "source" and
-      # "relation"
-      record.marc_record.to_dublin_core.compact.tap do |dc|
-        dc.default_proc = proc { |hash, key| hash[key] = [] }
-
-        dc["type"] = "text"
-        dc["date"] = record.solr_document["display_date"]
-        dc["description"] = description(record)
-        dc["rights"] = self.class.rights_statement(record)
-
-        %w[publisher language format subject_display authorStr]
-          .reject { |k| record.solr_document[k].nil? }
-          .each { |k| dc[k] = [record.solr_document[k]].flatten }
-
-        dc["subject"] = dc.delete("subject_display")
-        dc["creator"] = dc.delete("authorStr")
-
-        # the old OAI provider doesn't include dc:coverage, and what rubymarc
-        # gives is as badly-formatted as the authors & subjects
-        dc.delete("coverage")
-
-        record.solr_document["oclc"]&.each { |o| dc["identifier"] << "(OCoLC)#{o}" }
-        record.solr_document["ht_id"].each { |htid| dc["identifier"] << "#{Settings.handle}#{htid}" }
-        record.solr_document["isbn"]&.each { |isbn| dc["identifier"] << isbn }
-      end.reject { |_k, v| v.nil? || v.empty? }
-    end
+      dc = {}
+
+      # Set stuff that's constant for HT items
+      dc["type"] = "text"
+      dc["rights"] = self.class.rights_statement(record)
+
+      # Get stuff out of the solr documment
+      dc["date"] = record.first_solr_value("display_date")
+      dc["language"] = record.first_solr_value("language")
+      dc["publisher"] = record.first_solr_value("publisher")
+      dc["subject"] = record.solr_value("subject_display")
+      dc["format"] = record.first_solr_value("format")
 
-    # Current implementation appears to use 300
-    # ruby-marc's next release will likely use 500
-    def description(record)
-      return unless record.marc_record["300"]
+      marc = record.marc_record
+
+      # The LoC spec says to NOT use creator, and instead use contributor, but our users
+      # have asked that we keep this the same as before, using creator.
+      dc["creator"] = CROSSWALK.contributor(marc)
+
+      # Pull the rest from the record according to the Library of Congress crosswalk
+      dc["publisher"] ||= CROSSWALK.publisher(marc)
+      dc["coverage"] = CROSSWALK.coverage(marc)
+      dc["description"] = CROSSWALK.description(marc)
+      dc["format"] ||= CROSSWALK.format(marc)
+      dc["relation"] = CROSSWALK.relation(marc)
+      dc["source"] = CROSSWALK.source(marc)
+      dc["title"] = CROSSWALK.title(marc)
+
+      # Get the identifiers
+      dc["identifier"] = record.solr_array("oclc").map { |id| "(OCoLC)#{id}" }
+        .concat(record.solr_array("ht_id").map { |htid| "#{Settings.handle}#{htid}" })
+        .concat(record.solr_array("isbn").map { |isbn| "ISBN #{isbn}" })
+        .concat(record.solr_array("issn").map { |issn| "ISBN #{issn}" })
+        .concat(record.solr_array("lccn").map { |lccn| "LCCN #{lccn}" })
+      # Flatten it all out and get rid of nils and duplicates
+      dc.select { |k, v| v.is_a?(Array) }.each_pair do |_field, values|
+        values.flatten!
+        values.compact!
+        values.uniq!
+        values.reject! { |x| x == "".freeze }
+      end
 
-      record.marc_record["300"].subfields.select { |sub| %w[a b c].include? sub.code }.map { |sub| sub.value }.join(" ")
+      # Ditch everything that's empty or nil
+      dc.reject! { |_k, v| v.nil? || v.empty? }
+      dc
     end
 
     # Returns an array of unique access statements for each HTID on record