Merge branch 'dev'
# Conflicts:
#	source/tei/shan.L001.xml
jduss4 committed Nov 27, 2019
2 parents 542196c + 0f4edf0 commit 7c23fbe
Showing 267 changed files with 19,675 additions and 7,252 deletions.
2 changes: 1 addition & 1 deletion Gemfile
@@ -1,3 +1,3 @@
 source "https://rubygems.org"

-gem "datura", git: "https://github.com/CDRH/datura.git", tag: "v0.1.0"
+gem "datura", git: "https://github.com/CDRH/datura.git", tag: "v0.1.4"
24 changes: 12 additions & 12 deletions Gemfile.lock
@@ -1,35 +1,35 @@
 GIT
   remote: https://github.com/CDRH/datura.git
-  revision: 44f60a131bb26283789a89f0458eb070880023f3
-  tag: v0.1.0
+  revision: 4cc7911a978f41e0f48605b793f8914b39d9cdfc
+  tag: v0.1.4
   specs:
-    datura (0.1.0)
+    datura (0.1.4)
       colorize (~> 0.8.1)
-      nokogiri (~> 1.8.1)
+      nokogiri (~> 1.8)
       rest-client (~> 2.0.2)

 GEM
   remote: https://rubygems.org/
   specs:
     colorize (0.8.1)
-    domain_name (0.5.20180417)
+    domain_name (0.5.20190701)
       unf (>= 0.0.5, < 1.0.0)
     http-cookie (1.0.3)
       domain_name (~> 0.5)
-    mime-types (3.2.2)
+    mime-types (3.3)
       mime-types-data (~> 3.2015)
-    mime-types-data (3.2018.0812)
-    mini_portile2 (2.3.0)
+    mime-types-data (3.2019.1009)
+    mini_portile2 (2.4.0)
     netrc (0.11.0)
-    nokogiri (1.8.5)
-      mini_portile2 (~> 2.3.0)
+    nokogiri (1.10.5)
+      mini_portile2 (~> 2.4.0)
     rest-client (2.0.2)
       http-cookie (>= 1.0.2, < 2.0)
       mime-types (>= 1.16, < 4.0)
       netrc (~> 0.8)
     unf (0.1.4)
       unf_ext
-    unf_ext (0.0.7.5)
+    unf_ext (0.0.7.6)

 PLATFORMS
   ruby

@@ -38,4 +38,4 @@ DEPENDENCIES
   datura!

 BUNDLED WITH
-   1.16.6
+   1.17.3
9 changes: 9 additions & 0 deletions README.md
@@ -0,0 +1,9 @@
+# Cartas a la Familia: De la migración de Jesusita a Jane
+
+Family Letters: On the Migration from Jesusita to Jane
+
+This repository is intended for use with the [CDRH API](https://github.com/CDRH/api) and the [Family Letters Ruby on Rails application](https://github.com/CDRH/family_letters).
+
+## Updating
+
+See the [Datura documentation](https://github.com/CDRH/datura) for general updating and posting instructions. **NOTE: do not edit the CSV files -- they are generated from a spreadsheet (documents.csv) and a [mediacommons](https://mediacommons.unl.edu/luna/servlet/UNL~111~111) export (photographs.csv)**
7 changes: 6 additions & 1 deletion config/public.yml
@@ -4,7 +4,10 @@ default:
   data_base: https://cdrhmedia.unl.edu
   media_base: https://cdrhmedia.unl.edu
   tei_html_xsl: scripts/overrides/tei_to_html.xsl
-  threads: 20
+  threads: 20
+  # scrape_endpoint is tacked onto the end of site_url
+  scrape_endpoint: content_pages
+  scrape_website: true
 development:
   data_base: https://cdrhdev1.unl.edu/media
   es_path: https://cdrhdev1.unl.edu/elastic
@@ -13,3 +16,5 @@ development:
 production:
   es_path: https://cdrhapi.unl.edu/elastic
   es_index: cdrhapi-v1
+  site_url: https://familyletters.unl.edu
+  scrape_website: false
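
The default block turns scraping on and names the endpoint, while production supplies the site_url and disables scraping. As the data_manager.rb override below shows, the page-list URL is simply the two values joined — a minimal sketch of that composition, using the values configured here:

# scrape_endpoint is appended to site_url with File.join,
# mirroring the pre_file_preparation method below
site_url = "https://familyletters.unl.edu"
scrape_endpoint = "content_pages"

scrape_url = File.join(site_url, scrape_endpoint)
puts scrape_url  # => https://familyletters.unl.edu/content_pages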
63 changes: 63 additions & 0 deletions scripts/overrides/data_manager.rb
@@ -0,0 +1,63 @@
+require "json"
+require "open-uri"
+require "uri"
+
+class Datura::DataManager
+
+  def build_html(urls)
+    combined = ""
+    # retrieve each page, then combine into a single file which can be parsed
+    urls.each do |url|
+      lang = url.include?("/en/") ? "en" : "es"
+      raw = open(url) { |f| f.read }
+
+      # wrap the web scraping results in a div that describes the language
+      combined << "<div lang=\"#{lang}\">"
+      html = Nokogiri::HTML(raw)
+      combined << html.at_xpath("//div[@id='content-wrapper']").inner_html
+      combined << "</div>"
+    end
+    combined
+  rescue => exception
+    print_error(exception, urls)
+  end
+
+  def pre_file_preparation
+    if @options["scrape_website"]
+      url = File.join(@options["site_url"], @options["scrape_endpoint"])
+      puts "getting list of urls to scrape from #{url}"
+      list_of_pages = open(url) { |f| f.read }
+      # family letters has urls such as research and en/research,
+      # representing Spanish and English views of the same content,
+      # so the urls are returned in pairs
+      JSON.parse(list_of_pages).each do |pair|
+        # the two files share an id derived from the url path;
+        # Regexp.escape keeps the site url's "/" and "." characters
+        # from being treated as regex syntax below
+        site_url_for_regex = Regexp.escape(@options["site_url"])
+        id = pair
+          .first[/^#{site_url_for_regex}\/(.*)/, 1]
+          .gsub("/", "_")
+        output_file = "#{@options["collection_dir"]}/source/webs/#{id}.html"
+
+        html = build_html(pair)
+        File.open(output_file, 'w') { |file| file.write(html) }
+      end
+    else
+      puts %{Files in source/webs are not being refreshed from the website
+        contents. If you wish to scrape the family letters website, please
+        add or update config/public.yml to use "scrape_website: true"}
+    end
+  rescue => exception
+    print_error(exception, url)
+  end
+
+  def print_error(e, url)
+    puts %{Something went wrong while scraping the family letters website:
+      URL(S): #{url}
+      ERROR: #{e}
+      To post content, please check the endpoint in config/public.yml, or
+      temporarily disable the scrape_website setting in that file}.red
+  end
+
+end
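
To make the id derivation above concrete: each URL pair becomes one file under source/webs/, named after the path portion of the pair's first URL. A standalone sketch with a hypothetical pair (the research/en/research example from the comments):

site_url = "https://familyletters.unl.edu"

# a pair as returned by the content_pages endpoint: English and
# Spanish views of the same page (illustrative values only)
pair = [
  "https://familyletters.unl.edu/en/research",
  "https://familyletters.unl.edu/research"
]

id = pair.first[/^#{Regexp.escape(site_url)}\/(.*)/, 1].gsub("/", "_")

puts id                        # => "en_research"
puts "source/webs/#{id}.html"  # where the build_html output is written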