From 5d3b88f0e1921f7c5af440bf9e28660215cd38c7 Mon Sep 17 00:00:00 2001 From: Jayson Virissimo Date: Sat, 27 Apr 2024 19:40:57 -0700 Subject: [PATCH] Ensure PLS files are valid --- lexicons/Latin00.pls | 1 + lexicons/Latin01.pls | 1 + lexicons/Latin02.pls | 1 + lexicons/Latin03.pls | 1 + lexicons/Latin04.pls | 1 + lib/medieval_latina/lexicon_builder.rb | 4 +++- spec/medieval_latina_spec.rb | 18 ++++++++++++++++++ 7 files changed, 26 insertions(+), 1 deletion(-) diff --git a/lexicons/Latin00.pls b/lexicons/Latin00.pls index fb0e65c..3177386 100644 --- a/lexicons/Latin00.pls +++ b/lexicons/Latin00.pls @@ -1796,3 +1796,4 @@ pedis pɛdɪs + diff --git a/lexicons/Latin01.pls b/lexicons/Latin01.pls index e62a56b..8d9e100 100644 --- a/lexicons/Latin01.pls +++ b/lexicons/Latin01.pls @@ -1891,3 +1891,4 @@ parcere parkɛre + diff --git a/lexicons/Latin02.pls b/lexicons/Latin02.pls index abc227b..4cec7ab 100644 --- a/lexicons/Latin02.pls +++ b/lexicons/Latin02.pls @@ -1851,3 +1851,4 @@ desitum desitumː + diff --git a/lexicons/Latin03.pls b/lexicons/Latin03.pls index 01b0664..1b98575 100644 --- a/lexicons/Latin03.pls +++ b/lexicons/Latin03.pls @@ -1888,3 +1888,4 @@ insulto inˈsul.to + diff --git a/lexicons/Latin04.pls b/lexicons/Latin04.pls index 12e12cb..d711bd4 100644 --- a/lexicons/Latin04.pls +++ b/lexicons/Latin04.pls @@ -192,3 +192,4 @@ vulnero ˈvul.ne.ro + diff --git a/lib/medieval_latina/lexicon_builder.rb b/lib/medieval_latina/lexicon_builder.rb index a83305d..763809c 100644 --- a/lib/medieval_latina/lexicon_builder.rb +++ b/lib/medieval_latina/lexicon_builder.rb @@ -25,17 +25,19 @@ def call xml.lexicon(xmlns: URL, version: "1.0") do grouped_hash.each do |phonetics, words| if xml.target!.length > MAX_SIZE + xml.lexicon # Close the current lexicon tag write_file(xml.target!, file_index) file_index += 1 xml = Builder::XmlMarkup.new(indent: 2) # Reset XML builder xml.instruct! :xml, encoding: "UTF-8" - xml.lexicon(xmlns: URL, version: "1.0") + xml.lexicon(xmlns: URL, version: "1.0") # Start a new lexicon tag end xml.lexeme do words.each { |word| xml.grapheme word } xml.phoneme phonetics end end + xml.lexicon # Close the final lexicon tag end write_file(xml.target!, file_index) # Write the last file end diff --git a/spec/medieval_latina_spec.rb b/spec/medieval_latina_spec.rb index d6407e1..17c3657 100644 --- a/spec/medieval_latina_spec.rb +++ b/spec/medieval_latina_spec.rb @@ -191,5 +191,23 @@ expect { Nokogiri::XML(content) }.not_to raise_error end end + + it "includes the XML declaration" do + lexicon_files.each do |file| + content = File.read(file) + expect(content).to match(/\A<\?xml version="1\.0" encoding="UTF-8"\?>\n/) + end + end + + it "contains valid IPA characters in the elements" do + valid_ipa_regex = /\A[\p{L}\p{M}\p{N}\p{P}\p{S}\p{Z}]+\z/ + lexicon_files.each do |file| + content = File.read(file) + doc = Nokogiri::XML(content) + doc.xpath("//phoneme").each do |phoneme| + expect(phoneme.text).to match(valid_ipa_regex) + end + end + end end end